Improve unpacking in SSE4+ QC8/QS8/QU8 GEMM/IGEMM microkernels

PiperOrigin-RevId: 390004983
diff --git a/src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c b/src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c
index d785cde..0d5ce99 100644
--- a/src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c
@@ -52,9 +52,8 @@
       a0 += 8;
 
       const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
-      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
-      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
-      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
+      const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -62,9 +61,8 @@
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
       const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((const int8_t*) w + 16));
-      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
-      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
-      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
+      const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
diff --git a/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c b/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
index cbb929b..d23b094 100644
--- a/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
@@ -52,9 +52,8 @@
       a0 += 8;
 
       const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
-      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
-      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
-      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
+      const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -62,9 +61,8 @@
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
       const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((const int8_t*) w + 16));
-      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
-      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
-      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
+      const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
diff --git a/src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c b/src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c
index 9d93d68..110a478 100644
--- a/src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c
@@ -57,9 +57,8 @@
       a0 += 8;
 
       const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
-      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
-      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
-      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
+      const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
 
       vacc0x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -67,9 +66,8 @@
       vacc0x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
       const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((const int8_t*) w + 16));
-      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
-      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
-      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
+      const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
 
       vacc0x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c
index b3e0c9c..96fcbb8 100644
--- a/src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c
@@ -54,16 +54,14 @@
       a0 += 8;
 
       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
-      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
-      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
-      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
+      const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
 
       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
-      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
-      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
-      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
+      const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
 
       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
index 017f821..aff979d 100644
--- a/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
@@ -54,16 +54,14 @@
       a0 += 8;
 
       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
-      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
-      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
-      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
+      const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
 
       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
-      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
-      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
-      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
+      const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
 
       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c
index 0581260..f3e275c 100644
--- a/src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c
@@ -59,16 +59,14 @@
       a0 += 8;
 
       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
-      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
-      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
-      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
+      const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
 
       vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
       vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
-      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
-      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
-      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
+      const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
 
       vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
       vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
diff --git a/src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c b/src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c
index b275aa7..3cf1f61 100644
--- a/src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c
@@ -62,9 +62,8 @@
       a1 += 8;
 
       const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
-      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
-      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
-      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
+      const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -76,9 +75,8 @@
       vacc1x0123 = _mm_add_epi32(vacc1x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
       const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((const int8_t*) w + 16));
-      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
-      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
-      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
+      const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
diff --git a/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c b/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
index a0a0928..c680c51 100644
--- a/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
@@ -62,9 +62,8 @@
       a1 += 8;
 
       const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
-      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
-      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
-      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
+      const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -76,9 +75,8 @@
       vacc1x0123 = _mm_add_epi32(vacc1x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
       const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((const int8_t*) w + 16));
-      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
-      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
-      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
+      const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
diff --git a/src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c b/src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c
index ddc31f4..9ad6476 100644
--- a/src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c
@@ -67,9 +67,8 @@
       a1 += 8;
 
       const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
-      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
-      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
-      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
+      const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
 
       vacc0x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -81,9 +80,8 @@
       vacc1x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc1x0123);
       const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((const int8_t*) w + 16));
-      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
-      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
-      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
+      const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
 
       vacc0x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c
index 6043edc..3f5f8bb 100644
--- a/src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c
@@ -67,18 +67,16 @@
       a1 += 8;
 
       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
-      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
-      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
-      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
+      const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
 
       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
       vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
       vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
-      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
-      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
-      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
+      const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
 
       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
index a46186c..8b16617 100644
--- a/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
@@ -67,18 +67,16 @@
       a1 += 8;
 
       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
-      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
-      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
-      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
+      const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
 
       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
       vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
       vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
-      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
-      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
-      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
+      const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
 
       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c
index 0d31f0a..ef9c888 100644
--- a/src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c
@@ -72,18 +72,16 @@
       a1 += 8;
 
       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
-      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
-      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
-      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
+      const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
 
       vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
       vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
       vacc1x0 = _mm_maddd_epi16(vxa1, vxb0, vacc1x0);
       vacc1x1 = _mm_maddd_epi16(vxa1, vxb1, vacc1x1);
       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
-      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
-      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
-      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
+      const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
 
       vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
       vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
diff --git a/src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c b/src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c
index 4f26f19..e7d1a12 100644
--- a/src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c
@@ -72,9 +72,8 @@
       a2 += 8;
 
       const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
-      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
-      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
-      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
+      const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -90,9 +89,8 @@
       vacc2x0123 = _mm_add_epi32(vacc2x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
       const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((const int8_t*) w + 16));
-      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
-      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
-      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
+      const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
diff --git a/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c b/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
index a9c4419..a29e1d6 100644
--- a/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
@@ -72,9 +72,8 @@
       a2 += 8;
 
       const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
-      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
-      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
-      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
+      const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -90,9 +89,8 @@
       vacc2x0123 = _mm_add_epi32(vacc2x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
       const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((const int8_t*) w + 16));
-      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
-      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
-      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
+      const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
diff --git a/src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c b/src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c
index 89b0165..a7a657d 100644
--- a/src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c
@@ -77,9 +77,8 @@
       a2 += 8;
 
       const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
-      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
-      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
-      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
+      const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
 
       vacc0x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -95,9 +94,8 @@
       vacc2x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123);
       const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((const int8_t*) w + 16));
-      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
-      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
-      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
+      const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
 
       vacc0x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c
index 1522a40..5b50811 100644
--- a/src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c
@@ -80,9 +80,8 @@
       a2 += 8;
 
       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
-      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
-      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
-      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
+      const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
 
       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
@@ -91,9 +90,8 @@
       vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
       vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
-      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
-      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
-      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
+      const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
 
       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
index dfd6c2d..c0e6792 100644
--- a/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
@@ -80,9 +80,8 @@
       a2 += 8;
 
       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
-      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
-      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
-      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
+      const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
 
       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
@@ -91,9 +90,8 @@
       vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
       vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
-      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
-      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
-      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
+      const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
 
       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c
index c0436db..b7866e3 100644
--- a/src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c
@@ -85,9 +85,8 @@
       a2 += 8;
 
       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
-      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
-      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
-      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
+      const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
 
       vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
       vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
@@ -96,9 +95,8 @@
       vacc2x0 = _mm_maddd_epi16(vxa2, vxb0, vacc2x0);
       vacc2x1 = _mm_maddd_epi16(vxa2, vxb1, vacc2x1);
       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
-      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
-      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
-      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
+      const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
 
       vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
       vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
diff --git a/src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c b/src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c
index fbe796c..e22b64d 100644
--- a/src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c
@@ -82,9 +82,8 @@
       a3 += 8;
 
       const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
-      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
-      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
-      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
+      const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -104,9 +103,8 @@
       vacc3x0123 = _mm_add_epi32(vacc3x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
       const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((const int8_t*) w + 16));
-      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
-      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
-      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
+      const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
diff --git a/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c b/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
index 3d90961..d29750b 100644
--- a/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
@@ -82,9 +82,8 @@
       a3 += 8;
 
       const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
-      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
-      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
-      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
+      const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -104,9 +103,8 @@
       vacc3x0123 = _mm_add_epi32(vacc3x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
       const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((const int8_t*) w + 16));
-      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
-      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
-      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
+      const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
diff --git a/src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c b/src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c
index 821879c..570a2d4 100644
--- a/src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c
@@ -87,9 +87,8 @@
       a3 += 8;
 
       const __m128i vb01 = _mm_loadu_si128((const __m128i*) w);
-      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
-      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
-      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
+      const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
 
       vacc0x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -109,9 +108,8 @@
       vacc3x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123);
       const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((const int8_t*) w + 16));
-      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
-      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
-      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
+      const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
 
       vacc0x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);