Improve unpacking in SSE4+ QC8/QS8/QU8 GEMM/IGEMM microkernels

PiperOrigin-RevId: 390004983
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c
index 22ad639..dfb6a2d 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c
@@ -65,16 +65,14 @@
         a0 += 8;
 
         const __m128i vb01 = _mm_load_si128((const __m128i*) w);
-        const __m128i vsb01 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb01);
-        const __m128i vxb0 = _mm_unpacklo_epi8(vb01, vsb01);
-        const __m128i vxb1 = _mm_unpackhi_epi8(vb01, vsb01);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
+        const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
 
         vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
         vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
         const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
-        const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
-        const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
-        const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
+        const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
+        const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
 
         vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
         vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));