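Improve unpacking in SSE4+ QC8/QS8/QU8 GEMM/IGEMM microkernels

For signed 8-bit weights (as in the QC8 3x4c8 ld128 kernel below), widen to
16 bits without materializing a sign mask: use _mm_cvtepi8_epi16 for the low
8 bytes and _mm_srai_epi16(_mm_unpackhi_epi8(v, v), 8) for the high 8 bytes,
replacing the _mm_cmpgt_epi8 + _mm_unpacklo_epi8/_mm_unpackhi_epi8 sequence.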

PiperOrigin-RevId: 390004983
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
index dfd6c2d..c0e6792 100644
--- a/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
@@ -80,9 +80,8 @@
       a2 += 8;
 
       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
-      const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
-      const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
-      const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+      const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
+      const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
 
       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
@@ -91,9 +90,8 @@
       vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
       vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
-      const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
-      const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
-      const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+      const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
+      const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
 
       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));