Code-generate QU8 GEMM and IGEMM microkernels for SSE2/SSSE3/SSE4.1
PiperOrigin-RevId: 382681546
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
index 49717e2..1d28198 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
@@ -166,7 +166,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -180,7 +180,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -194,7 +194,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -299,10 +299,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;