Code-generate QU8 GEMM and IGEMM microkernels for SSE2/SSSE3/SSE4.1

PiperOrigin-RevId: 382681546
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
index 49717e2..1d28198 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
@@ -166,7 +166,7 @@
 
         const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
         w = (const void*) ((const int8_t*) w + 8);
-        const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+        const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
         vacc0x0123 = _mm_maddd_epi16(
           _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -180,7 +180,7 @@
         if (k > 2 * sizeof(int8_t)) {
           const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
           w = (const void*) ((const int8_t*) w + 8);
-          const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+          const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
           vacc0x0123 = _mm_maddd_epi16(
             _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -194,7 +194,7 @@
           if (k > 4 * sizeof(int8_t)) {
             const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
             w = (const void*) ((const int8_t*) w + 8);
-            const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+            const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
             vacc0x0123 = _mm_maddd_epi16(
               _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -299,10 +299,10 @@
         vout = _mm_srli_epi32(vout, 16);
       }
       if (nc & 1) {
-        *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
-        *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
-        *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
-        *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+        *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+        *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+        *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+        *c0 = (int8_t) _mm_extract_epi8(vout, 0);
       }
 
       nc = 0;