Code-generate QU8 GEMM and IGEMM microkernels for SSE2/SSSE3/SSE4.1

PiperOrigin-RevId: 382681546
diff --git a/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c b/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
index 73c236c..2455fe9 100644
--- a/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
@@ -161,7 +161,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
           _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -175,7 +175,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
             _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -189,7 +189,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
               _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -258,10 +258,10 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;