Code-generate QU8 GEMM and IGEMM microkernels for SSE2/SSSE3/SSE4.1

PiperOrigin-RevId: 382681546
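
For the SSE4.1/AVX/XOP variants, the generated code now sign-extends the 8 int8 weights with a single _mm_cvtepi8_epi16 (PMOVSXBW) instead of the SSE2 unpack + arithmetic-shift idiom, and the scalar tail stores drop the redundant (int8_t*) cast since c0/c1 already have type int8_t*. As a reference only (not part of the generated files), a minimal sketch of the two equivalent sign-extension idioms, assuming an SSE4.1-capable target for _mm_cvtepi8_epi16:

    #include <emmintrin.h>  // SSE2 intrinsics
    #include <smmintrin.h>  // SSE4.1 intrinsics

    // SSE2: interleave the low 8 bytes of vb with themselves so each int16
    // lane holds the byte in both halves, then shift the sign bits down:
    // lane = (int16_t) (((b << 8) | b) >> 8) == (int16_t) b.
    static inline __m128i xnn_sketch_sext_sse2(__m128i vb) {
      return _mm_srai_epi16(_mm_unpacklo_epi8(vb, vb), 8);
    }

    // SSE4.1: the same 8 sign-extended int16 lanes in one PMOVSXBW.
    static inline __m128i xnn_sketch_sext_sse41(__m128i vb) {
      return _mm_cvtepi8_epi16(vb);
    }

Both forms read only the low 64 bits of vb (as loaded by _mm_loadl_epi64) and feed the same _mm_madd_epi16 / _mm_maddd_epi16 accumulation, so the change is behavior-preserving on the SSE4.1+ paths.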
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c
index 2a2ede8..cb6b952 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c
@@ -92,7 +92,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -100,7 +100,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -108,7 +108,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -148,7 +148,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c
index 4a815b9..3a6470c 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c
@@ -92,7 +92,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -100,7 +100,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -108,7 +108,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -148,7 +148,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
index 925cfa5..02159fa 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
@@ -149,7 +149,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
index d8f5d52..99a6611 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
@@ -149,7 +149,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
index d363292..8003b56 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
@@ -92,7 +92,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -100,7 +100,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -108,7 +108,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -148,7 +148,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
index 43fdb55..bbce5a9 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
@@ -92,7 +92,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -100,7 +100,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -108,7 +108,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -148,7 +148,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c
index 1e3d6a8..dd3c288 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c
@@ -97,7 +97,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_maddd_epi16(
           _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -105,7 +105,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -113,7 +113,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -153,7 +153,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c
index c9cd4c0..0499b23 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c
@@ -97,7 +97,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_maddd_epi16(
           _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -105,7 +105,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -113,7 +113,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -153,7 +153,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c
index d84e43a..fbe2503 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c
@@ -92,7 +92,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -100,7 +100,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -108,7 +108,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -164,7 +164,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c
index 0c41602..3c7cc20 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c
@@ -92,7 +92,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -100,7 +100,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -108,7 +108,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -164,7 +164,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c
index 28b48ca..e426312 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -181,7 +181,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
index 2b65df2..7835c4d 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -181,7 +181,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c
index 1f8b868..d1ead47 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -92,7 +92,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -100,7 +100,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -108,7 +108,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -164,7 +164,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c
index 6f97176..c6f5af2 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -92,7 +92,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -100,7 +100,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -108,7 +108,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -164,7 +164,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c
index d006ea0..ec18bb0 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -181,7 +181,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c
index 2d323e1..0b57f70 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -181,7 +181,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c
index a8fb8da..734dca9 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c
@@ -97,7 +97,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_maddd_epi16(
           _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -105,7 +105,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -113,7 +113,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -169,7 +169,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c
index 7cdc634..26b8ceb 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c
@@ -97,7 +97,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_maddd_epi16(
           _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -105,7 +105,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -113,7 +113,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -169,7 +169,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c
index c9be575..22ad639 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c
@@ -120,7 +120,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c
index 0568e12..2fd9087 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c
@@ -122,7 +122,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
index 6711743..6cc6702 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
@@ -121,7 +121,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
index f16ea89..d62f339 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
@@ -123,7 +123,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
index 4bd1f41..c68b7cc 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
@@ -120,7 +120,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
index 2a31258..23fd0eb 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
@@ -122,7 +122,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c
index 6f85867..a53437f 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c
@@ -121,7 +121,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c
index db23866..e61ae95 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c
@@ -123,7 +123,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld128.c
index 3fdbf3a..f4aa440 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld128.c
@@ -125,7 +125,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c
index 6e56155..8226e3b 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c
@@ -127,7 +127,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c
index 17b8f27..74f69f7 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c
@@ -136,7 +136,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c
index 69009c7..ca23fea 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c
@@ -138,7 +138,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c
index ec61f42..1d309a0 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c
@@ -153,7 +153,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c
index 50c2d49..66beb10 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c
@@ -155,7 +155,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c
index d406f4e..4cfa48f 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c
@@ -136,7 +136,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c
index e2b9bef..43a3161 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c
@@ -138,7 +138,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c
index faaf3f5..c8ce0c8 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c
@@ -153,7 +153,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c
index b8bd4c5..2601753 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c
@@ -155,7 +155,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c
index c8d1f8e..e05b5f6 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c
@@ -141,7 +141,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c
index 0d1cb95..9922c8b 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c
@@ -143,7 +143,7 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c
index 6d12bd6..9a2b742 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c
@@ -115,7 +115,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -125,7 +125,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -135,7 +135,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -184,8 +184,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c
index e89696a..a2800d9 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c
@@ -115,7 +115,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -125,7 +125,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -135,7 +135,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -184,8 +184,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
index b7f7f8d..3de5bc2 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
@@ -185,8 +185,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
index 398ee81..666d22b 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
@@ -185,8 +185,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
index 791ba21..dd270a9 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
@@ -115,7 +115,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -125,7 +125,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -135,7 +135,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -184,8 +184,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
index 1b839e6..56b1069 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
@@ -115,7 +115,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -125,7 +125,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -135,7 +135,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -184,8 +184,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c
index d80dda7..47ec5ca 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c
@@ -120,7 +120,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_maddd_epi16(
           _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -130,7 +130,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -140,7 +140,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -189,8 +189,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c
index f4f8b9a..24fa887 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c
@@ -120,7 +120,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_maddd_epi16(
           _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -130,7 +130,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -140,7 +140,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -189,8 +189,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c
index 81f4c32..3f24371 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c
@@ -115,7 +115,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -125,7 +125,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -135,7 +135,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -207,8 +207,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c
index 9d164eb..cd7dd70 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c
@@ -115,7 +115,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -125,7 +125,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -135,7 +135,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -207,8 +207,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c
index 39aea8e..f4f6283 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -232,8 +232,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c
index 128ac34..5e5e497 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -232,8 +232,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c
index 9245e7e..abd2c8e 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -115,7 +115,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -125,7 +125,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -135,7 +135,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -207,8 +207,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c
index f97b73c..65cf168 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -115,7 +115,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -125,7 +125,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -135,7 +135,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -207,8 +207,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c
index d3c9714..f744b3a 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -232,8 +232,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c
index bf47976..7e9f2a7 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -232,8 +232,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c
index aefa9c1..6291abf 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c
@@ -120,7 +120,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_maddd_epi16(
           _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -130,7 +130,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -140,7 +140,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -212,8 +212,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c
index f88b0f1..80929e3 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c
@@ -120,7 +120,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_maddd_epi16(
           _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -130,7 +130,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -140,7 +140,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -212,8 +212,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c
index aa4745d..6a24f81 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c
@@ -149,8 +149,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld64.c
index 26f50da..4ca40e1 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld64.c
@@ -151,8 +151,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse2-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
index e46e6ed..17d7560 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
@@ -150,8 +150,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse2-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
index 82e5115..49e69bf 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
@@ -152,8 +152,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse41-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
index 0cacb43..5f0e258 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
@@ -149,8 +149,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse41-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
index 69e9ef8..e6c77fd 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
@@ -151,8 +151,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld128.c
index dc0a620..59090cb 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld128.c
@@ -150,8 +150,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c
index e8e086e..f6d6210 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c
@@ -152,8 +152,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld128.c
index ada4f99..187b8bd 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld128.c
@@ -154,8 +154,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c
index 0f3b693..c82921f 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c
@@ -156,8 +156,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c
index 987dcb0..d6477d3 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c
@@ -172,8 +172,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c
index 3206354..337df9a 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c
@@ -174,8 +174,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c
index 4f88e7c..f24fc1a 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c
@@ -197,8 +197,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
index 29f6bb9..04fec0c 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
@@ -199,8 +199,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
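
(Illustrative aside, not part of the patch: the other recurring change simply drops a redundant `(int8_t*)` cast on the `c0`/`c1`/`c2` output pointers in the `nc & 1` tail. The hunks also show two byte-extraction idioms for that tail: SSE2/SSSE3 kernels go through `_mm_extract_epi16`/`_mm_cvtsi128_si32`, while SSE4.1+ kernels use `_mm_extract_epi8`. The standalone sketch below, with hypothetical lane values and a hypothetical `main`, shows why both recover the same byte once truncated to int8_t.)

    // Minimal sketch: both extraction paths yield the same low byte per row.
    #include <smmintrin.h>  // SSE4.1 (needed for _mm_extract_epi8 in the comparison)
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      const int8_t lanes[16] = { 7, 0, 0, 0, -9, 0, 0, 0, 55, 0, 0, 0, 0, 0, 0, 0 };
      const __m128i vout = _mm_loadu_si128((const __m128i*) lanes);

      // SSE2 path: 16-bit lane 2 covers bytes 4..5; truncating keeps byte 4.
      const int8_t c1_sse2  = (int8_t) _mm_extract_epi16(vout, 2);
      // SSE4.1 path: extract byte 4 directly.
      const int8_t c1_sse41 = (int8_t) _mm_extract_epi8(vout, 4);
      // Byte 0 via the 32-bit move versus a direct byte extract.
      const int8_t c0_sse2  = (int8_t) _mm_cvtsi128_si32(vout);
      const int8_t c0_sse41 = (int8_t) _mm_extract_epi8(vout, 0);

      printf("c1: %d == %d, c0: %d == %d\n", c1_sse2, c1_sse41, c0_sse2, c0_sse41);
      return 0;
    }

(Both pairs print equal values, -9 and 7, so the per-ISA extraction choices store the same result byte.)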
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c
index fc1ce4d..a999775 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c
@@ -172,8 +172,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
index 696aea2..b30c7eb 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
@@ -174,8 +174,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c
index ce4dbf1..630f5c0 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c
@@ -197,8 +197,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
index e7c1e34..2ccb6b0 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
@@ -199,8 +199,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c
index c690081..14be109 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c
@@ -177,8 +177,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c
index f69db77..e2e3ed8 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c
@@ -179,8 +179,8 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c
index 39e1fc4..304a4f7 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c
@@ -138,7 +138,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -150,7 +150,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -162,7 +162,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -221,9 +221,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c
index d939ab6..c1cec78 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c
@@ -138,7 +138,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -150,7 +150,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -162,7 +162,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -221,9 +221,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
index a2ecb0f..59ba39d 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
@@ -223,9 +223,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
index 4642e48..83ee714 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
@@ -223,9 +223,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
index 020f028..210f54a 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
@@ -138,7 +138,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -150,7 +150,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -162,7 +162,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -221,9 +221,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
index 9e3ff00..fbac5d1 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
@@ -138,7 +138,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -150,7 +150,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -162,7 +162,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -221,9 +221,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c
index d14a6c9..6190ade 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c
@@ -143,7 +143,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_maddd_epi16(
           _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -155,7 +155,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -167,7 +167,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -226,9 +226,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c
index a2b6c23..31575f4 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c
@@ -143,7 +143,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_maddd_epi16(
           _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -155,7 +155,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -167,7 +167,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -226,9 +226,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c
index abed9b6..fef31cf 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c
@@ -138,7 +138,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -150,7 +150,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -162,7 +162,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -251,9 +251,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c
index 37cc36d..bf89939 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c
@@ -138,7 +138,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -150,7 +150,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -162,7 +162,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -251,9 +251,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c
index 3836866..35b26a3 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -285,9 +285,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c
index 5d1886d..7d345ea 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -285,9 +285,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c
index 1ffbff8..fc27bef 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -138,7 +138,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -150,7 +150,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -162,7 +162,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -251,9 +251,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c
index 84d280b..975ce10 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -138,7 +138,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -150,7 +150,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -162,7 +162,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -251,9 +251,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c
index 92e9447..8074fcb 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -285,9 +285,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c
index 75700c3..ff2e060 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -285,9 +285,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c
index 6356e76..91e7c30 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c
@@ -143,7 +143,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_maddd_epi16(
           _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -155,7 +155,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -167,7 +167,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -256,9 +256,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c
index 4912f55..983aa01 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c
@@ -143,7 +143,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_maddd_epi16(
           _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -155,7 +155,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -167,7 +167,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -256,9 +256,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-fp32-avx-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-fp32-avx-ld128.c
index f198a7d..b4a8597 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-fp32-avx-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-fp32-avx-ld128.c
@@ -179,9 +179,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-fp32-avx-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-fp32-avx-ld64.c
index fb028a4..9cc0420 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-fp32-avx-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-fp32-avx-ld64.c
@@ -181,9 +181,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
index 8dc7fdb..b1095e5 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
@@ -181,9 +181,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
index 079ee69..4dfa033 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
@@ -183,9 +183,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
index d262753..5a51a7c 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
@@ -179,9 +179,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
index a3f8f88..303b7f8 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
@@ -181,9 +181,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-fp32-ssse3-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-fp32-ssse3-ld128.c
index 8ffc6fc..118cdd9 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-fp32-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-fp32-ssse3-ld128.c
@@ -181,9 +181,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-fp32-ssse3-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-fp32-ssse3-ld64.c
index 6c712bf..e9bf06c 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-fp32-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-fp32-ssse3-ld64.c
@@ -183,9 +183,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-fp32-xop-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-fp32-xop-ld128.c
index 55639ef..1a4e7db 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-fp32-xop-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-fp32-xop-ld128.c
@@ -184,9 +184,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-fp32-xop-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-fp32-xop-ld64.c
index 19f5904..1314632 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-fp32-xop-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-fp32-xop-ld64.c
@@ -186,9 +186,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c
index a13dd17..e3cf40a 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c
@@ -209,9 +209,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c
index fbad091..dc1751e 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c
@@ -211,9 +211,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c
index e4200cd..8b65958 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c
@@ -243,9 +243,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c
index a5f3732..b26e36d 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c
@@ -245,9 +245,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c
index 15de5b5..6627108 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c
@@ -209,9 +209,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c
index 610c200..7217bd9 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c
@@ -211,9 +211,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c
index b5919d7..0adb317 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c
@@ -243,9 +243,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c
index 3f6204f..00811cf 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c
@@ -245,9 +245,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c
index a866141..2c25542 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c
@@ -214,9 +214,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c
index c297cc2..f2f08b8 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c
@@ -216,9 +216,9 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c
index 9575adb..96dbae3 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c
@@ -161,7 +161,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -175,7 +175,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -189,7 +189,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -257,10 +257,10 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
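
The hunks above replace the SSE2 sign-extension idiom _mm_srai_epi16(_mm_unpacklo_epi8(vb, vb), 8) with the SSE4.1 intrinsic _mm_cvtepi8_epi16(vb). Both produce the same eight sign-extended int16 lanes from the low eight int8 lanes of vb. The following is a minimal standalone sketch (not part of the patch) that checks this equivalence; it assumes an x86 target built with SSE4.1 enabled, and the input values are illustrative only.

// Standalone sketch: the SSE2 idiom duplicates each byte into both halves of a
// 16-bit lane and arithmetic-shifts right by 8 to propagate the sign bit;
// _mm_cvtepi8_epi16 sign-extends the low eight bytes directly.
#include <assert.h>
#include <smmintrin.h>  // SSE4.1 (also pulls in SSE2)
#include <stdint.h>
#include <string.h>

int main(void) {
  const int8_t b[8] = {-128, -1, 0, 1, 127, -42, 42, 7};  // illustrative values
  const __m128i vb = _mm_loadl_epi64((const __m128i*) b);

  // SSE2 path: interleave vb with itself, then shift each 16-bit lane right by 8.
  const __m128i vxb_sse2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb, vb), 8);
  // SSE4.1 path: direct sign extension of the low eight int8 lanes.
  const __m128i vxb_sse41 = _mm_cvtepi8_epi16(vb);

  int16_t x2[8], x41[8];
  _mm_storeu_si128((__m128i*) x2, vxb_sse2);
  _mm_storeu_si128((__m128i*) x41, vxb_sse41);
  assert(memcmp(x2, x41, sizeof(x2)) == 0);  // both idioms agree lane-for-lane
  return 0;
}
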
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c
index 86dbc98..e0fd57e 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c
@@ -161,7 +161,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -175,7 +175,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -189,7 +189,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -257,10 +257,10 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
index 31c84c2..06be3e4 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
@@ -259,10 +259,10 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c3 = (int8_t) _mm_extract_epi16(vout, 6);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
index 0d6a7da..958b7a9 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
@@ -259,10 +259,10 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c3 = (int8_t) _mm_extract_epi16(vout, 6);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
index 112e243..06d0d8a 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
@@ -161,7 +161,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -175,7 +175,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -189,7 +189,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -257,10 +257,10 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
index c8a3cca..ae535c4 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
@@ -161,7 +161,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -175,7 +175,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -189,7 +189,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -257,10 +257,10 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c
index efa3ada..ab8e9bb 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c
@@ -166,7 +166,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_maddd_epi16(
           _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -180,7 +180,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -194,7 +194,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -262,10 +262,10 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c
index 9f9ad98..b1f20e1 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c
@@ -166,7 +166,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_maddd_epi16(
           _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -180,7 +180,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -194,7 +194,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -262,10 +262,10 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c
index 681ea5f..a5f5764 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c
@@ -161,7 +161,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -175,7 +175,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -189,7 +189,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -294,10 +294,10 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c
index 7e66cc1..9a4da26 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c
@@ -161,7 +161,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -175,7 +175,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -189,7 +189,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -294,10 +294,10 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c
index 5875e0e..0c0d4d9 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -336,10 +336,10 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c3 = (int8_t) _mm_extract_epi16(vout, 6);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
index 89ba44a..b586305 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -336,10 +336,10 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c3 = (int8_t) _mm_extract_epi16(vout, 6);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
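
In the nc & 1 tails above, the SSE2/SSSE3 variants read each row's final byte through 16-bit extracts (_mm_extract_epi16) or _mm_cvtsi128_si32, because _mm_extract_epi8 requires SSE4.1; the int8_t cast keeps only the low byte of the extracted value. Below is a minimal standalone sketch (not part of the patch) comparing the two idioms; it assumes an x86 target built with SSE4.1 enabled, and the lane values are illustrative only.

// Standalone sketch: reading the low byte of each 32-bit output lane.
// SSE2 has no byte extract, so it uses 16-bit extracts (or _mm_cvtsi128_si32
// for lane 0) and truncates to int8_t; SSE4.1 extracts the byte directly.
#include <assert.h>
#include <smmintrin.h>  // SSE4.1 (also pulls in SSE2)
#include <stdint.h>

int main(void) {
  // Pretend vout already holds one packed int8 result in each 32-bit lane.
  const __m128i vout = _mm_setr_epi32(0x11223344, 0x55667788, 0x0102037F, 0x04050680);

  // SSE2 path: 16-bit extracts cover lanes 1..3; _mm_cvtsi128_si32 reads lane 0.
  const int8_t c0_sse2 = (int8_t) _mm_cvtsi128_si32(vout);
  const int8_t c1_sse2 = (int8_t) _mm_extract_epi16(vout, 2);
  const int8_t c2_sse2 = (int8_t) _mm_extract_epi16(vout, 4);
  const int8_t c3_sse2 = (int8_t) _mm_extract_epi16(vout, 6);

  // SSE4.1 path: extract byte 0 of each 32-bit lane directly.
  const int8_t c0_sse41 = (int8_t) _mm_extract_epi8(vout, 0);
  const int8_t c1_sse41 = (int8_t) _mm_extract_epi8(vout, 4);
  const int8_t c2_sse41 = (int8_t) _mm_extract_epi8(vout, 8);
  const int8_t c3_sse41 = (int8_t) _mm_extract_epi8(vout, 12);

  assert(c0_sse2 == c0_sse41 && c1_sse2 == c1_sse41);
  assert(c2_sse2 == c2_sse41 && c3_sse2 == c3_sse41);
  return 0;
}
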
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c
index 64e1ff5..3d24064 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -161,7 +161,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -175,7 +175,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -189,7 +189,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -294,10 +294,10 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
index 8571512..0585add 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -161,7 +161,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -175,7 +175,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -189,7 +189,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -294,10 +294,10 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c
index 35d07a0..6f78980 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -336,10 +336,10 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c3 = (int8_t) _mm_extract_epi16(vout, 6);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
index 5d141e0..341314f 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -336,10 +336,10 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
-        *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+        *c3 = (int8_t) _mm_extract_epi16(vout, 6);
+        *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+        *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (int8_t) _mm_cvtsi128_si32(vout);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
index 49717e2..1d28198 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
@@ -166,7 +166,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_maddd_epi16(
           _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -180,7 +180,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -194,7 +194,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -299,10 +299,10 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c
index d4b022a..61e46f2 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c
@@ -166,7 +166,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_maddd_epi16(
           _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -180,7 +180,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -194,7 +194,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -299,10 +299,10 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;