Code-generate QU8 GEMM and IGEMM microkernels for SSE2/SSSE3/SSE4.1
PiperOrigin-RevId: 382681546
diff --git a/src/init.c b/src/init.c
index 29d1008..f18e49e 100644
--- a/src/init.c
+++ b/src/init.c
@@ -2107,8 +2107,8 @@
#ifndef XNN_NO_QU8_OPERATORS
init_flags |= XNN_INIT_FLAG_QU8;
- xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse2);
- xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_gemmlowp_ukernel_4x4c2__sse2);
+ xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse2_ld64);
+ xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_gemmlowp_ukernel_4x4c2__sse2_ld64);
xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_gemmlowp_sse2_params;
xnn_params.qu8.gemm.mr = 4;
xnn_params.qu8.gemm.nr = 4;
diff --git a/src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c b/src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c
index a7e330c..d785cde 100644
--- a/src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c
@@ -138,7 +138,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c b/src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c
index 1e364fd..d416eb0 100644
--- a/src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qc8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c
@@ -138,7 +138,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c b/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
index 5239bde..6956b2f 100644
--- a/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
@@ -139,7 +139,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c b/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
index 8a59055..154ea26 100644
--- a/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
@@ -139,7 +139,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c b/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
index 9dcb26f..cbb929b 100644
--- a/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
@@ -138,7 +138,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c b/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
index 1a60399..2840101 100644
--- a/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qc8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
@@ -138,7 +138,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c b/src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c
index 9ac6301..9d93d68 100644
--- a/src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c
@@ -143,7 +143,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c b/src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c
index 6ee7bf9..b848141 100644
--- a/src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qc8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c
@@ -143,7 +143,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c
index d9c1d2c..b3e0c9c 100644
--- a/src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c
@@ -109,7 +109,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld64.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld64.c
index 3235071..5787b52 100644
--- a/src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld64.c
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-avx-ld64.c
@@ -111,7 +111,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld128.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
index 2cad520..8e5476c 100644
--- a/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
@@ -110,7 +110,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
index d595a83..520725e 100644
--- a/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
@@ -112,7 +112,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
index 57e2d8a..017f821 100644
--- a/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
@@ -109,7 +109,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
index 7e33913..ef739b8 100644
--- a/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
@@ -111,7 +111,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c
index 3de92c0..0581260 100644
--- a/src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c
@@ -114,7 +114,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c b/src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c
index 8c3f029..df03a69 100644
--- a/src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c
+++ b/src/qc8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c
@@ -116,7 +116,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c b/src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c
index 865a606..b275aa7 100644
--- a/src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c
@@ -173,8 +173,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c b/src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c
index fcf7354..4632064 100644
--- a/src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qc8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c
@@ -173,8 +173,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c b/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
index d539fca..6730c36 100644
--- a/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
@@ -175,8 +175,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c b/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
index 7190ea9..bf7ccd7 100644
--- a/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
@@ -175,8 +175,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c b/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
index 8ac0f3e..a0a0928 100644
--- a/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
@@ -173,8 +173,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c b/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
index 6e0bdea..332a29e 100644
--- a/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qc8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
@@ -173,8 +173,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c b/src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c
index e9b1522..ddc31f4 100644
--- a/src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c
@@ -178,8 +178,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c b/src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c
index 0f6a85f..eaecd04 100644
--- a/src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qc8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c
@@ -178,8 +178,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c
index 8246e47..6043edc 100644
--- a/src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c
@@ -137,8 +137,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld64.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld64.c
index 9e36562..19e00a2 100644
--- a/src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld64.c
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-avx-ld64.c
@@ -139,8 +139,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse2-ld128.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
index abc9ae3..03cba79 100644
--- a/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
@@ -139,8 +139,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse2-ld64.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
index 2818fd5..b9f7cb2 100644
--- a/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
@@ -141,8 +141,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
index ca0ee71..a46186c 100644
--- a/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
@@ -137,8 +137,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld64.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
index 5ef3176..2e122f4 100644
--- a/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
@@ -139,8 +139,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c
index 9506d4c..0d31f0a 100644
--- a/src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c
@@ -142,8 +142,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c b/src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c
index f204d5a..8d0e8b8 100644
--- a/src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c
+++ b/src/qc8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c
@@ -144,8 +144,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c b/src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c
index 00e88af..4f26f19 100644
--- a/src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c
@@ -209,9 +209,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c b/src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c
index b61b9da..53b2b1f 100644
--- a/src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qc8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c
@@ -209,9 +209,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c b/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
index eef063f..088c32b 100644
--- a/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
@@ -213,9 +213,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c b/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
index c17ae1f..0b81995 100644
--- a/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
@@ -213,9 +213,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c b/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
index 042edf5..a9c4419 100644
--- a/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
@@ -209,9 +209,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c b/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
index e193a29..b92d45e 100644
--- a/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qc8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
@@ -209,9 +209,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c b/src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c
index aa84b24..89b0165 100644
--- a/src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c
@@ -214,9 +214,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c b/src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c
index 0fe3848..a5122f5 100644
--- a/src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qc8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c
@@ -214,9 +214,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c
index c637a2b..1522a40 100644
--- a/src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c
@@ -166,9 +166,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld64.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld64.c
index 98dd5ae..de7093a 100644
--- a/src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld64.c
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-avx-ld64.c
@@ -168,9 +168,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld128.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
index 53df1af..45281b8 100644
--- a/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
@@ -170,9 +170,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
index 11e8f69..ab5581d 100644
--- a/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
@@ -172,9 +172,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
index a6baeff..dfd6c2d 100644
--- a/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
@@ -166,9 +166,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
index 3638c0b..0362ca3 100644
--- a/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
@@ -168,9 +168,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c
index 7c52bd7..c0436db 100644
--- a/src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c
@@ -171,9 +171,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld64.c b/src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld64.c
index 9f03b0f..28d1727 100644
--- a/src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld64.c
+++ b/src/qc8-gemm/gen/3x4c8-minmax-fp32-xop-ld64.c
@@ -173,9 +173,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c b/src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c
index 0ac2672..fbe796c 100644
--- a/src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c
@@ -244,10 +244,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c b/src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c
index 109c96e..85e5940 100644
--- a/src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qc8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c
@@ -244,10 +244,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c b/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
index a697ab2..4faf506 100644
--- a/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
@@ -249,10 +249,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c3 = (int8_t) _mm_extract_epi16(vout, 6);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c b/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
index 2f1385f..e69598a 100644
--- a/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
@@ -249,10 +249,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c3 = (int8_t) _mm_extract_epi16(vout, 6);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c b/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
index 4af3657..3d90961 100644
--- a/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
@@ -244,10 +244,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c b/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
index 3fb83bb..6e075f5 100644
--- a/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qc8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
@@ -244,10 +244,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c b/src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c
index 425cb41..821879c 100644
--- a/src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c
@@ -249,10 +249,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
}
nc = 0;
diff --git a/src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c b/src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c
index 50da9d5..64a74e5 100644
--- a/src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qc8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c
@@ -249,10 +249,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c b/src/qc8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c
index 37e5e14..75c05ee 100644
--- a/src/qc8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qc8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c
@@ -92,7 +92,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -100,7 +100,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -108,7 +108,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -149,7 +149,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c b/src/qc8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c
index 2922f31..7d3b38c 100644
--- a/src/qc8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qc8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c
@@ -92,7 +92,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -100,7 +100,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -108,7 +108,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -149,7 +149,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c b/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
index 1c092df..5417897 100644
--- a/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
@@ -150,7 +150,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c b/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
index 6d7e3db..a56b3cf 100644
--- a/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
@@ -150,7 +150,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c b/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
index 7c4b5d9..b499822 100644
--- a/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
@@ -92,7 +92,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -100,7 +100,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -108,7 +108,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -149,7 +149,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c b/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
index bef8386..cbd3024 100644
--- a/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qc8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
@@ -92,7 +92,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -100,7 +100,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -108,7 +108,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -149,7 +149,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c b/src/qc8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c
index 68b4668..4fe9617 100644
--- a/src/qc8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qc8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c
@@ -97,7 +97,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -105,7 +105,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -113,7 +113,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -154,7 +154,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c b/src/qc8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c
index ab20c98..46284b5 100644
--- a/src/qc8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qc8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c
@@ -97,7 +97,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -105,7 +105,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -113,7 +113,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -154,7 +154,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c b/src/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c
index 4f41a4a..adeb7a6 100644
--- a/src/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c
+++ b/src/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c
@@ -121,7 +121,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c b/src/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c
index c886df4..0ed0230 100644
--- a/src/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c
+++ b/src/qc8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c
@@ -123,7 +123,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld128.c b/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
index f08d4ad..f4ca858 100644
--- a/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
+++ b/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
@@ -122,7 +122,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c b/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
index bb4d21f..381ea49 100644
--- a/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
+++ b/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
@@ -124,7 +124,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld128.c b/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
index 682f78c..05091ad 100644
--- a/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
+++ b/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
@@ -121,7 +121,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c b/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
index 98f80bf..dc78672 100644
--- a/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
+++ b/src/qc8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
@@ -123,7 +123,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld128.c b/src/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld128.c
index 8ab761a..726c488 100644
--- a/src/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld128.c
+++ b/src/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld128.c
@@ -126,7 +126,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c b/src/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c
index fb13c69..e9057ed 100644
--- a/src/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c
+++ b/src/qc8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c
@@ -128,7 +128,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c b/src/qc8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c
index b563212..3820219 100644
--- a/src/qc8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qc8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c
@@ -115,7 +115,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -125,7 +125,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -135,7 +135,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -185,8 +185,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c b/src/qc8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c
index 3d0d957..7b67edd 100644
--- a/src/qc8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qc8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c
@@ -115,7 +115,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -125,7 +125,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -135,7 +135,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -185,8 +185,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c b/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
index 8b9932d..cac3868 100644
--- a/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
@@ -186,8 +186,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c b/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
index 248de43..6c6dbd7 100644
--- a/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
@@ -186,8 +186,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c b/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
index 30a132d..f4daad7 100644
--- a/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
@@ -115,7 +115,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -125,7 +125,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -135,7 +135,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -185,8 +185,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c b/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
index fbb67cd..ebf1355 100644
--- a/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qc8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
@@ -115,7 +115,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -125,7 +125,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -135,7 +135,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -185,8 +185,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c b/src/qc8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c
index a4d0108..1c76d0b 100644
--- a/src/qc8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qc8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c
@@ -120,7 +120,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -130,7 +130,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -140,7 +140,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -190,8 +190,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c b/src/qc8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c
index 6faebfb..ef3d9d1 100644
--- a/src/qc8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qc8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c
@@ -120,7 +120,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -130,7 +130,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -140,7 +140,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -190,8 +190,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c b/src/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c
index 564303d..4cbb7ed 100644
--- a/src/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c
+++ b/src/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c
@@ -150,8 +150,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld64.c b/src/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld64.c
index f620ac3..192bc67 100644
--- a/src/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld64.c
+++ b/src/qc8-igemm/gen/2x4c8-minmax-fp32-avx-ld64.c
@@ -152,8 +152,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse2-ld128.c b/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
index 0acc002..b18a54b 100644
--- a/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
+++ b/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
@@ -151,8 +151,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse2-ld64.c b/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
index 5be9dd9..a36e388 100644
--- a/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
+++ b/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
@@ -153,8 +153,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse41-ld128.c b/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
index b56c544..2cf99ae 100644
--- a/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
+++ b/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
@@ -150,8 +150,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse41-ld64.c b/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
index 65b76cc..0f74e6f 100644
--- a/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
+++ b/src/qc8-igemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
@@ -152,8 +152,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld128.c b/src/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld128.c
index 53ccc2e..8b4e67f 100644
--- a/src/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld128.c
+++ b/src/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld128.c
@@ -155,8 +155,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c b/src/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c
index a9724f0..a0c6870 100644
--- a/src/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c
+++ b/src/qc8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c
@@ -157,8 +157,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c b/src/qc8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c
index adce9d5..0bb8731 100644
--- a/src/qc8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qc8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c
@@ -138,7 +138,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -150,7 +150,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -162,7 +162,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -222,9 +222,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c b/src/qc8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c
index 5ba6759..c27ffb0 100644
--- a/src/qc8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qc8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c
@@ -138,7 +138,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -150,7 +150,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -162,7 +162,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -222,9 +222,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c b/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
index 03db5db..1fdf285 100644
--- a/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
@@ -224,9 +224,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c b/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
index 9f01bff..31d21e3 100644
--- a/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
@@ -224,9 +224,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c b/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
index c3ee33f..d49e053 100644
--- a/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
@@ -138,7 +138,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -150,7 +150,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -162,7 +162,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -222,9 +222,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c b/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
index 434f062..981d5b0 100644
--- a/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qc8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
@@ -138,7 +138,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -150,7 +150,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -162,7 +162,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -222,9 +222,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c b/src/qc8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c
index 0d80e05..5852a41 100644
--- a/src/qc8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qc8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c
@@ -143,7 +143,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -155,7 +155,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -167,7 +167,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -227,9 +227,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c b/src/qc8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c
index d71485e..0af863f 100644
--- a/src/qc8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qc8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c
@@ -143,7 +143,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -155,7 +155,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -167,7 +167,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -227,9 +227,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/3x4c8-minmax-fp32-avx-ld128.c b/src/qc8-igemm/gen/3x4c8-minmax-fp32-avx-ld128.c
index fa28748..5344018 100644
--- a/src/qc8-igemm/gen/3x4c8-minmax-fp32-avx-ld128.c
+++ b/src/qc8-igemm/gen/3x4c8-minmax-fp32-avx-ld128.c
@@ -180,9 +180,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/3x4c8-minmax-fp32-avx-ld64.c b/src/qc8-igemm/gen/3x4c8-minmax-fp32-avx-ld64.c
index 156751a..ee95d71 100644
--- a/src/qc8-igemm/gen/3x4c8-minmax-fp32-avx-ld64.c
+++ b/src/qc8-igemm/gen/3x4c8-minmax-fp32-avx-ld64.c
@@ -182,9 +182,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld128.c b/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
index 4e5ad2a..a4fe534 100644
--- a/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
+++ b/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
@@ -182,9 +182,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c b/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
index 547507a..70c2b5a 100644
--- a/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
+++ b/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
@@ -184,9 +184,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld128.c b/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
index 9b5b688..1d7630b 100644
--- a/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
+++ b/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
@@ -180,9 +180,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c b/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
index b786576..b76000e 100644
--- a/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
+++ b/src/qc8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
@@ -182,9 +182,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/3x4c8-minmax-fp32-xop-ld128.c b/src/qc8-igemm/gen/3x4c8-minmax-fp32-xop-ld128.c
index edcbe65..5e53098 100644
--- a/src/qc8-igemm/gen/3x4c8-minmax-fp32-xop-ld128.c
+++ b/src/qc8-igemm/gen/3x4c8-minmax-fp32-xop-ld128.c
@@ -185,9 +185,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/3x4c8-minmax-fp32-xop-ld64.c b/src/qc8-igemm/gen/3x4c8-minmax-fp32-xop-ld64.c
index 00c7c77..c633f97 100644
--- a/src/qc8-igemm/gen/3x4c8-minmax-fp32-xop-ld64.c
+++ b/src/qc8-igemm/gen/3x4c8-minmax-fp32-xop-ld64.c
@@ -187,9 +187,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c b/src/qc8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c
index afef149..34a3d7b 100644
--- a/src/qc8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qc8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c
@@ -161,7 +161,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -175,7 +175,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -189,7 +189,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -258,10 +258,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c b/src/qc8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c
index 715a6f4..e5a21d4 100644
--- a/src/qc8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qc8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c
@@ -161,7 +161,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -175,7 +175,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -189,7 +189,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -258,10 +258,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c b/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
index 4a33891..73988b1 100644
--- a/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
@@ -260,10 +260,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c3 = (int8_t) _mm_extract_epi16(vout, 6);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c b/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
index c6ec484..6e3a2c5 100644
--- a/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
@@ -260,10 +260,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c3 = (int8_t) _mm_extract_epi16(vout, 6);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c b/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
index 559fa02..0748810 100644
--- a/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
@@ -161,7 +161,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -175,7 +175,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -189,7 +189,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -258,10 +258,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c b/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
index 73c236c..2455fe9 100644
--- a/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qc8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
@@ -161,7 +161,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -175,7 +175,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -189,7 +189,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -258,10 +258,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c b/src/qc8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c
index f29ac8e..9a19c21 100644
--- a/src/qc8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qc8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c
@@ -166,7 +166,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -180,7 +180,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -194,7 +194,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -263,10 +263,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qc8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c b/src/qc8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c
index 5678e70..17d6083 100644
--- a/src/qc8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qc8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c
@@ -166,7 +166,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -180,7 +180,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -194,7 +194,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -263,10 +263,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/MRx4c2-sse.c.in b/src/qs8-gemm/MRx4c2-sse.c.in
index b655f78..9fc97e8 100644
--- a/src/qs8-gemm/MRx4c2-sse.c.in
+++ b/src/qs8-gemm/MRx4c2-sse.c.in
@@ -7,7 +7,7 @@
$assert not XOP or AVX
$assert not AVX or SSE == 4
$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
-$assert DATATYPE in ["QC8", "QS8"]
+$assert DATATYPE in ["QC8", "QS8", "QU8"]
$assert DATATYPE != "QC8" or REQUANTIZATION == "FP32"
$assert SSE != 3 or REQUANTIZATION != "FP32"
$assert VARIANT in ["LD64", "LD128", "EXTENDED"]
@@ -32,17 +32,18 @@
$LOAD_SUFFIX = {"LD128": "_ld128", "LD64": "_ld64", "EXTENDED": ""}[VARIANT]
$GEMM_SUFFIX = "_xw" if VARIANT == "EXTENDED" else ""
-$PARAMS_UNION = "xnn_qs8_minmax_params" if DATATYPE == "QC8" else "xnn_qs8_conv_minmax_params"
-$PARAMS_STRUCT = ("" if DATATYPE == "QC8" else REQUANTIZATION.lower() + "_") + ("sse4" if SSE >= 4 else "sse2")
+$PARAMS_UNION = "xnn_qs8_minmax_params" if DATATYPE == "QC8" else "xnn_%s_conv_minmax_params" % DATATYPE.lower()
+$PARAMS_STRUCT = ("" if DATATYPE == "QC8" else REQUANTIZATION.lower() + "_") + ("sse4" if SSE == 4 and DATATYPE != "QU8" else "sse2")
+$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
$ISA = "xop" if XOP else "avx" if AVX else {2: "sse2", 3: "ssse3", 4: "sse41"}[SSE]
void xnn_${DATATYPE.lower()}_gemm${GEMM_SUFFIX}_minmax_${REQUANTIZATION.lower()}_ukernel_${MR}x4c2__${ISA}${LOAD_SUFFIX}(
size_t mr,
size_t nc,
size_t kc,
- const int8_t* restrict a,
+ const ${XINT8_T}* restrict a,
size_t a_stride,
const void* restrict w,
- int8_t* restrict c,
+ ${XINT8_T}* restrict c,
size_t cm_stride,
size_t cn_stride,
const union ${PARAMS_UNION} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
@@ -51,17 +52,17 @@
assert(mr <= ${MR});
assert(nc != 0);
assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
+ assert(kc % sizeof(${XINT8_T}) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
kc = round_up_po2(kc, 2);
- const int8_t* a0 = a;
- int8_t* c0 = c;
+ const ${XINT8_T}* a0 = a;
+ ${XINT8_T}* c0 = c;
$for M in range(1, MR):
- const int8_t* a${M} = (const int8_t*) ((uintptr_t) a${M-1} + a_stride);
- int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
+ const ${XINT8_T}* a${M} = (const ${XINT8_T}*) ((uintptr_t) a${M-1} + a_stride);
+ ${XINT8_T}* c${M} = (${XINT8_T}*) ((uintptr_t) c${M-1} + cm_stride);
$if M % 2 == 0:
if XNN_UNPREDICTABLE(mr <= ${M}) {
a${M} = a${M-1};
@@ -85,13 +86,23 @@
w = (const void*) ((const int32_t*) w + 4);
size_t k = kc;
- while (k >= 8 * sizeof(int8_t)) {
+ $if DATATYPE == "QU8":
+ const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.kernel_zero_point);
+ $if SSE < 4:
+ const __m128i vzero = _mm_setzero_si128();
+ while (k >= 8 * sizeof(${XINT8_T})) {
$for M in range(MR):
const __m128i va${M} = _mm_loadl_epi64((const __m128i*) a${M});
- $if SSE == 4:
- const __m128i vxa${M} = _mm_cvtepi8_epi16(va${M});
+ $if DATATYPE == "QU8":
+ $if SSE == 4:
+ const __m128i vxa${M} = _mm_cvtepu8_epi16(va${M});
+ $else:
+ const __m128i vxa${M} = _mm_unpacklo_epi8(va${M}, vzero);
$else:
- const __m128i vxa${M} = _mm_srai_epi16(_mm_unpacklo_epi8(va${M}, va${M}), 8);
+ $if SSE == 4:
+ const __m128i vxa${M} = _mm_cvtepi8_epi16(va${M});
+ $else:
+ const __m128i vxa${M} = _mm_srai_epi16(_mm_unpacklo_epi8(va${M}, va${M}), 8);
a${M} += 8;
$if VARIANT == "LD128":
@@ -99,10 +110,14 @@
$if K == 0:
const __m128i vb${K}${K+1} = _mm_loadu_si128((const __m128i*) w);
$else:
- const __m128i vb${K}${K+1} = _mm_loadu_si128((const __m128i*) ((const int8_t*) w + ${K * 8}));
- const __m128i vsb${K}${K+1} = _mm_cmpgt_epi8(_mm_setzero_si128(), vb${K}${K+1});
- const __m128i vxb${K} = _mm_unpacklo_epi8(vb${K}${K+1}, vsb${K}${K+1});
- const __m128i vxb${K+1} = _mm_unpackhi_epi8(vb${K}${K+1}, vsb${K}${K+1});
+ const __m128i vb${K}${K+1} = _mm_loadu_si128((const __m128i*) ((const ${XINT8_T}*) w + ${K * 8}));
+ $if DATATYPE == "QU8":
+ const __m128i vxb${K} = _mm_sub_epi16(_mm_unpacklo_epi8(vb${K}${K+1}, vzero), vb_zero_point);
+ const __m128i vxb${K+1} = _mm_sub_epi16(_mm_unpackhi_epi8(vb${K}${K+1}, vzero), vb_zero_point);
+ $else:
+ const __m128i vsb${K}${K+1} = _mm_cmpgt_epi8(_mm_setzero_si128(), vb${K}${K+1});
+ const __m128i vxb${K} = _mm_unpacklo_epi8(vb${K}${K+1}, vsb${K}${K+1});
+ const __m128i vxb${K+1} = _mm_unpackhi_epi8(vb${K}${K+1}, vsb${K}${K+1});
$for M in range(MR):
$if XOP:
@@ -125,11 +140,17 @@
$if K == 0:
const __m128i vb${K} = _mm_loadl_epi64((const __m128i*) w);
$else:
- const __m128i vb${K} = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + ${K * 8}));
- $if SSE == 4:
- const __m128i vxb${K} = _mm_cvtepi8_epi16(vb${K});
+ const __m128i vb${K} = _mm_loadl_epi64((const __m128i*) ((const ${XINT8_T}*) w + ${K * 8}));
+ $if DATATYPE == "QU8":
+ $if SSE == 4:
+ const __m128i vxb${K} = _mm_sub_epi16(_mm_cvtepu8_epi16(vb${K}), vb_zero_point);
+ $else:
+ const __m128i vxb${K} = _mm_sub_epi16(_mm_unpacklo_epi8(vb${K}, vzero), vb_zero_point);
$else:
- const __m128i vxb${K} = _mm_srai_epi16(_mm_unpacklo_epi8(vb${K}, vb${K}), 8);
+ $if SSE == 4:
+ const __m128i vxb${K} = _mm_cvtepi8_epi16(vb${K});
+ $else:
+ const __m128i vxb${K} = _mm_srai_epi16(_mm_unpacklo_epi8(vb${K}, vb${K}), 8);
$elif VARIANT == "EXTENDED":
$if K == 0:
const __m128i vxb${K} = _mm_load_si128((const __m128i*) w);
@@ -147,28 +168,40 @@
$if VARIANT == "EXTENDED":
w = (const void*) ((const int16_t*) w + 32);
$else:
- w = (const void*) ((const int8_t*) w + 32);
- k -= 8 * sizeof(int8_t);
+ w = (const void*) ((const ${XINT8_T}*) w + 32);
+ k -= 8 * sizeof(${XINT8_T});
}
if (k != 0) {
$for M in range(MR):
const __m128i va${M} = _mm_loadl_epi64((const __m128i*) a${M});
- $if SSE == 4:
- const __m128i vxa${M} = _mm_cvtepi8_epi16(va${M});
+ $if DATATYPE == "QU8":
+ $if SSE == 4:
+ const __m128i vxa${M} = _mm_cvtepu8_epi16(va${M});
+ $else:
+ const __m128i vxa${M} = _mm_unpacklo_epi8(va${M}, vzero);
$else:
- const __m128i vxa${M} = _mm_srai_epi16(_mm_unpacklo_epi8(va${M}, va${M}), 8);
- a${M} = (const int8_t*) ((uintptr_t) a${M} + k);
+ $if SSE == 4:
+ const __m128i vxa${M} = _mm_cvtepi8_epi16(va${M});
+ $else:
+ const __m128i vxa${M} = _mm_srai_epi16(_mm_unpacklo_epi8(va${M}, va${M}), 8);
+ a${M} = (const ${XINT8_T}*) ((uintptr_t) a${M} + k);
$if VARIANT == "EXTENDED":
const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
w = (const void*) ((const int16_t*) w + 8);
$else:
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
- $if SSE == 4:
- const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+ $if DATATYPE == "QU8":
+ $if SSE == 4:
+ const __m128i vxb0 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb0), vb_zero_point);
+ $else:
+ const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);
$else:
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
- w = (const void*) ((const int8_t*) w + 8);
+ $if SSE == 4:
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+ $else:
+ const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ w = (const void*) ((const ${XINT8_T}*) w + 8);
$for M in range(MR):
$if XOP:
@@ -178,17 +211,23 @@
vacc${M}x0123 = _mm_add_epi32(vacc${M}x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- if (k > 2 * sizeof(int8_t)) {
+ if (k > 2 * sizeof(${XINT8_T})) {
$if VARIANT == "EXTENDED":
const __m128i vxb1 = _mm_load_si128((const __m128i*) w);
w = (const void*) ((const int16_t*) w + 8);
$else:
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
- $if SSE == 4:
- const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+ $if DATATYPE == "QU8":
+ $if SSE == 4:
+ const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point);
+ $else:
+ const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
$else:
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
- w = (const void*) ((const int8_t*) w + 8);
+ $if SSE == 4:
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+ $else:
+ const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ w = (const void*) ((const ${XINT8_T}*) w + 8);
$for M in range(MR):
$if XOP:
@@ -198,17 +237,23 @@
vacc${M}x0123 = _mm_add_epi32(vacc${M}x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- if (k > 4 * sizeof(int8_t)) {
+ if (k > 4 * sizeof(${XINT8_T})) {
$if VARIANT == "EXTENDED":
const __m128i vxb2 = _mm_load_si128((const __m128i*) w);
w = (const void*) ((const int16_t*) w + 8);
$else:
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
- $if SSE == 4:
- const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+ $if DATATYPE == "QU8":
+ $if SSE == 4:
+ const __m128i vxb2 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb2), vb_zero_point);
+ $else:
+ const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
$else:
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
- w = (const void*) ((const int8_t*) w + 8);
+ $if SSE == 4:
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+ $else:
+ const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ w = (const void*) ((const ${XINT8_T}*) w + 8);
$for M in range(MR):
$if XOP:
@@ -319,20 +364,29 @@
$for M in range(0, MR, 2):
__m128i vacc${M}${min(M+1, MR-1)}x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc${M}x0123, vacc${min(M+1, MR-1)}x0123), voutput_zero_point);
- $if SSE < 4:
- const __m128i voutput_min = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min);
- const __m128i voutput_max = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max);
- $for M in range(0, MR, 2):
- vacc${M}${min(M+1, MR-1)}x0123 = _mm_min_epi16(_mm_max_epi16(vacc${M}${min(M+1, MR-1)}x0123, voutput_min), voutput_max);
+ $if DATATYPE == "QU8":
+ $if MR > 2:
+ __m128i vout = _mm_packus_epi16(vacc0${min(1, MR-1)}x0123, vacc${min(2, MR-1)}${min(3, MR-1)}x0123);
+ $else:
+ __m128i vout = _mm_packus_epi16(vacc0${min(1, MR-1)}x0123, vacc0${min(1, MR-1)}x0123);
- $if MR > 2:
- __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc${min(2, MR-1)}${min(3, MR-1)}x0123);
+ vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min));
+ vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max));
$else:
- __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc0${min(1, MR-1)}x0123);
+ $if SSE < 4:
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min);
+ const __m128i voutput_max = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max);
+ $for M in range(0, MR, 2):
+ vacc${M}${min(M+1, MR-1)}x0123 = _mm_min_epi16(_mm_max_epi16(vacc${M}${min(M+1, MR-1)}x0123, voutput_min), voutput_max);
- $if SSE == 4:
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max));
+ $if MR > 2:
+ __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc${min(2, MR-1)}${min(3, MR-1)}x0123);
+ $else:
+ __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc0${min(1, MR-1)}x0123);
+
+ $if SSE == 4:
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
@@ -344,10 +398,10 @@
*((uint32_t*) c${M}) = (uint32_t) _mm_cvtsi128_si32(vout);
$for M in range(MR):
- c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+ c${M} = (${XINT8_T}*) ((uintptr_t) c${M} + cn_stride);
$for M in range(MR):
- a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
+ a${M} = (const ${XINT8_T}*) ((uintptr_t) a${M} - kc);
nc -= 4;
} else {
@@ -360,11 +414,11 @@
if (nc & 1) {
$if SSE == 4:
$for M in range(MR):
- *((int8_t*) c${M}) = (int8_t) _mm_extract_epi8(vout, ${M * 4});
+ *c${M} = (${XINT8_T}) _mm_extract_epi8(vout, ${M * 4});
$else:
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (${XINT8_T}) _mm_cvtsi128_si32(vout);
$for M in range(1, MR):
- *((int8_t*) c${M}) = (int8_t) _mm_extract_epi16(vout, ${M * 2});
+ *c${M} = (${XINT8_T}) _mm_extract_epi16(vout, ${M * 2});
}
nc = 0;
diff --git a/src/qs8-gemm/MRx4c8-sse.c.in b/src/qs8-gemm/MRx4c8-sse.c.in
index e02a1d6..635e9e1 100644
--- a/src/qs8-gemm/MRx4c8-sse.c.in
+++ b/src/qs8-gemm/MRx4c8-sse.c.in
@@ -7,7 +7,7 @@
$assert not XOP or AVX
$assert not AVX or SSE == 4
$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
-$assert DATATYPE in ["QC8", "QS8"]
+$assert DATATYPE in ["QC8", "QS8", "QU8"]
$assert DATATYPE != "QC8" or REQUANTIZATION == "FP32"
$assert VARIANT in ["LD64", "LD128", "EXTENDED"]
$assert MR <= 4
@@ -30,17 +30,18 @@
$LOAD_SUFFIX = {"LD128": "_ld128", "LD64": "_ld64", "EXTENDED": ""}[VARIANT]
$GEMM_SUFFIX = "_xw" if VARIANT == "EXTENDED" else ""
-$PARAMS_UNION = "xnn_qs8_minmax_params" if DATATYPE == "QC8" else "xnn_qs8_conv_minmax_params"
-$PARAMS_STRUCT = ("" if DATATYPE == "QC8" else REQUANTIZATION.lower() + "_") + ("sse4" if SSE >= 4 else "sse2")
+$PARAMS_UNION = "xnn_qs8_minmax_params" if DATATYPE == "QC8" else "xnn_%s_conv_minmax_params" % DATATYPE.lower()
+$PARAMS_STRUCT = ("" if DATATYPE == "QC8" else REQUANTIZATION.lower() + "_") + ("sse4" if SSE >= 4 and DATATYPE != "QU8" else "sse2")
+$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
$ISA = "xop" if XOP else "avx" if AVX else {2: "sse2", 3: "ssse3", 4: "sse41"}[SSE]
void xnn_${DATATYPE.lower()}_gemm${GEMM_SUFFIX}_minmax_${REQUANTIZATION.lower()}_ukernel_${MR}x4c8__${ISA}${LOAD_SUFFIX}(
size_t mr,
size_t nc,
size_t kc,
- const int8_t* restrict a,
+ const ${XINT8_T}* restrict a,
size_t a_stride,
const void* restrict w,
- int8_t* restrict c,
+ ${XINT8_T}* restrict c,
size_t cm_stride,
size_t cn_stride,
const union ${PARAMS_UNION} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
@@ -49,17 +50,17 @@
assert(mr <= ${MR});
assert(nc != 0);
assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
+ assert(kc % sizeof(${XINT8_T}) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
kc = round_up_po2(kc, 8);
- const int8_t* a0 = a;
- int8_t* c0 = c;
+ const ${XINT8_T}* a0 = a;
+ ${XINT8_T}* c0 = c;
$for M in range(1, MR):
- const int8_t* a${M} = (const int8_t*) ((uintptr_t) a${M-1} + a_stride);
- int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
+ const ${XINT8_T}* a${M} = (const ${XINT8_T}*) ((uintptr_t) a${M-1} + a_stride);
+ ${XINT8_T}* c${M} = (${XINT8_T}*) ((uintptr_t) c${M-1} + cm_stride);
$if M % 2 == 0:
if XNN_UNPREDICTABLE(mr <= ${M}) {
a${M} = a${M-1};
@@ -85,13 +86,23 @@
w = (const void*) ((const int32_t*) w + 4);
size_t k = 0;
+ $if DATATYPE == "QU8":
+ const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.kernel_zero_point);
+ $if SSE < 4:
+ const __m128i vzero = _mm_setzero_si128();
while (k < kc) {
$for M in range(MR):
const __m128i va${M} = _mm_loadl_epi64((const __m128i*) a${M});
- $if SSE == 4:
- const __m128i vxa${M} = _mm_cvtepi8_epi16(va${M});
+ $if DATATYPE == "QU8":
+ $if SSE == 4:
+ const __m128i vxa${M} = _mm_cvtepu8_epi16(va${M});
+ $else:
+ const __m128i vxa${M} = _mm_unpacklo_epi8(va${M}, vzero);
$else:
- const __m128i vxa${M} = _mm_srai_epi16(_mm_unpacklo_epi8(va${M}, va${M}), 8);
+ $if SSE == 4:
+ const __m128i vxa${M} = _mm_cvtepi8_epi16(va${M});
+ $else:
+ const __m128i vxa${M} = _mm_srai_epi16(_mm_unpacklo_epi8(va${M}, va${M}), 8);
a${M} += 8;
$if VARIANT == "LD128":
@@ -99,10 +110,14 @@
$if N == 0:
const __m128i vb${N}${N+1} = _mm_load_si128((const __m128i*) w);
$else:
- const __m128i vb${N}${N+1} = _mm_load_si128((const __m128i*) ((const int8_t*) w + ${N * 8}));
- const __m128i vsb${N}${N+1} = _mm_cmpgt_epi8(_mm_setzero_si128(), vb${N}${N+1});
- const __m128i vxb${N} = _mm_unpacklo_epi8(vb${N}${N+1}, vsb${N}${N+1});
- const __m128i vxb${N+1} = _mm_unpackhi_epi8(vb${N}${N+1}, vsb${N}${N+1});
+ const __m128i vb${N}${N+1} = _mm_load_si128((const __m128i*) ((const ${XINT8_T}*) w + ${N * 8}));
+ $if DATATYPE == "QU8":
+ const __m128i vxb${N} = _mm_sub_epi16(_mm_unpacklo_epi8(vb${N}${N+1}, vzero), vb_zero_point);
+ const __m128i vxb${N+1} = _mm_sub_epi16(_mm_unpackhi_epi8(vb${N}${N+1}, vzero), vb_zero_point);
+ $else:
+ const __m128i vsb${N}${N+1} = _mm_cmpgt_epi8(_mm_setzero_si128(), vb${N}${N+1});
+ const __m128i vxb${N} = _mm_unpacklo_epi8(vb${N}${N+1}, vsb${N}${N+1});
+ const __m128i vxb${N+1} = _mm_unpackhi_epi8(vb${N}${N+1}, vsb${N}${N+1});
$for M in range(MR):
$if XOP:
@@ -117,11 +132,17 @@
$if N == 0:
const __m128i vb${N} = _mm_loadl_epi64((const __m128i*) w);
$else:
- const __m128i vb${N} = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + ${N * 8}));
- $if SSE == 4:
- const __m128i vxb${N} = _mm_cvtepi8_epi16(vb${N});
+ const __m128i vb${N} = _mm_loadl_epi64((const __m128i*) ((const ${XINT8_T}*) w + ${N * 8}));
+ $if DATATYPE == "QU8":
+ $if SSE == 4:
+ const __m128i vxb${N} = _mm_sub_epi16(_mm_cvtepu8_epi16(vb${N}), vb_zero_point);
+ $else:
+ const __m128i vxb${N} = _mm_sub_epi16(_mm_unpacklo_epi8(vb${N}, vzero), vb_zero_point);
$else:
- const __m128i vxb${N} = _mm_srai_epi16(_mm_unpacklo_epi8(vb${N}, vb${N}), 8);
+ $if SSE == 4:
+ const __m128i vxb${N} = _mm_cvtepi8_epi16(vb${N});
+ $else:
+ const __m128i vxb${N} = _mm_srai_epi16(_mm_unpacklo_epi8(vb${N}, vb${N}), 8);
$elif VARIANT == "EXTENDED":
$if N == 0:
const __m128i vxb${N} = _mm_load_si128((const __m128i*) w);
@@ -137,8 +158,8 @@
$if VARIANT == "EXTENDED":
w = (const void*) ((const int16_t*) w + 32);
$else:
- w = (const void*) ((const int8_t*) w + 32);
- k += 8 * sizeof(int8_t);
+ w = (const void*) ((const ${XINT8_T}*) w + 32);
+ k += 8 * sizeof(${XINT8_T});
}
$if SSE >= 3:
@@ -254,20 +275,29 @@
$for M in range(0, MR, 2):
__m128i vacc${M}${min(M+1, MR-1)}x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc${M}x0123, vacc${min(M+1, MR-1)}x0123), voutput_zero_point);
- $if SSE < 4:
- const __m128i voutput_min = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min);
- const __m128i voutput_max = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max);
- $for M in range(0, MR, 2):
- vacc${M}${min(M+1, MR-1)}x0123 = _mm_min_epi16(_mm_max_epi16(vacc${M}${min(M+1, MR-1)}x0123, voutput_min), voutput_max);
+ $if DATATYPE == "QU8":
+ $if MR > 2:
+ __m128i vout = _mm_packus_epi16(vacc0${min(1, MR-1)}x0123, vacc${min(2, MR-1)}${min(3, MR-1)}x0123);
+ $else:
+ __m128i vout = _mm_packus_epi16(vacc0${min(1, MR-1)}x0123, vacc0${min(1, MR-1)}x0123);
- $if MR > 2:
- __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc${min(2, MR-1)}${min(3, MR-1)}x0123);
+ vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min));
+ vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max));
$else:
- __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc0${min(1, MR-1)}x0123);
+ $if SSE < 4:
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min);
+ const __m128i voutput_max = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max);
+ $for M in range(0, MR, 2):
+ vacc${M}${min(M+1, MR-1)}x0123 = _mm_min_epi16(_mm_max_epi16(vacc${M}${min(M+1, MR-1)}x0123, voutput_min), voutput_max);
- $if SSE == 4:
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max));
+ $if MR > 2:
+ __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc${min(2, MR-1)}${min(3, MR-1)}x0123);
+ $else:
+ __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc0${min(1, MR-1)}x0123);
+
+ $if SSE == 4:
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
@@ -279,10 +309,10 @@
*((uint32_t*) c${M}) = (uint32_t) _mm_cvtsi128_si32(vout);
$for M in range(MR):
- c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+ c${M} = (${XINT8_T}*) ((uintptr_t) c${M} + cn_stride);
$for M in range(MR):
- a${M} = (const int8_t*) ((uintptr_t) a${M} - kc);
+ a${M} = (const ${XINT8_T}*) ((uintptr_t) a${M} - kc);
nc -= 4;
} else {
@@ -295,11 +325,11 @@
if (nc & 1) {
$if SSE == 4:
$for M in range(MR):
- *((int8_t*) c${M}) = (int8_t) _mm_extract_epi8(vout, ${M * 4});
+ *c${M} = (${XINT8_T}) _mm_extract_epi8(vout, ${M * 4});
$else:
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (${XINT8_T}) _mm_cvtsi128_si32(vout);
$for M in range(1, MR):
- *((int8_t*) c${M}) = (int8_t) _mm_extract_epi16(vout, ${M * 2});
+ *c${M} = (${XINT8_T}) _mm_extract_epi16(vout, ${M * 2});
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c
index 09f74fb..27de809 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c
@@ -137,7 +137,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c
index fe3ebcc..d7ab625 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c
@@ -137,7 +137,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
index fac8d30..df327a7 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
@@ -138,7 +138,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
index 4a9ca5c..d6506e6 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
@@ -138,7 +138,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
index 4fedbb9..8e5784b 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
@@ -137,7 +137,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
index 58ee439..a48f30a 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
@@ -137,7 +137,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c
index 7f34c7f..90f4131 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c
@@ -142,7 +142,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c
index 0e89ca5..b7d288d 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c
@@ -142,7 +142,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c
index 23d15dc..7e90089 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c
@@ -153,7 +153,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c
index 7ba0084..e8f444a 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c
@@ -153,7 +153,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c
index 5ae5fb1..d690d14 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -170,7 +170,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
index 6054c56..620129e 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -170,7 +170,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c
index a8735ad..05d27ab 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -153,7 +153,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c
index 0f68b75..559396c 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -153,7 +153,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c
index d30ba65..bd6c0f6 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -170,7 +170,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c
index 576491b..e5ec3a4 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -170,7 +170,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c
index 8413264..69028dd 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c
@@ -158,7 +158,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c
index ef8c317..8a416b0 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c
@@ -158,7 +158,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
index 0d472bc..b649211 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
@@ -146,7 +146,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c
index f979507..0f03fdf 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c
@@ -163,7 +163,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c
index cbe2dc5..ed5fd68 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c
@@ -146,7 +146,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c
index 2b18295..f6bf029 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c
@@ -163,7 +163,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c
index ce70471..534ee1b 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c
@@ -151,7 +151,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c
index 0ece629..ef99baf 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c
@@ -108,7 +108,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld64.c
index 201a49e..0b2e0ce 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld64.c
@@ -110,7 +110,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
index 3036281..2cd12cf 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
@@ -109,7 +109,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
index 51b7f20..a10f4a4 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
@@ -111,7 +111,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
index 807831d..3afc5c5 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
@@ -108,7 +108,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
index c0fb59f..efc14ce 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
@@ -110,7 +110,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c
index 9900974..51cbfe4 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c
@@ -109,7 +109,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c
index 2be2b09..ab92cb0 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c
@@ -111,7 +111,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c
index be748d0..6d861cc 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c
@@ -113,7 +113,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c
index f8730d0..df4c4e1 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c
@@ -115,7 +115,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c
index e328c28..8ac5629 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c
@@ -124,7 +124,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c
index c7d811e..a787e2f 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c
@@ -126,7 +126,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c
index d10dc77..a3750b0 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c
@@ -141,7 +141,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c
index 4b75667..0c14082 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c
@@ -143,7 +143,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c
index 86efa19..d621130 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c
@@ -124,7 +124,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c
index 8a4b07a..6377876 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c
@@ -126,7 +126,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c
index 9f1d1e2..7401f1f 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c
@@ -141,7 +141,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c
index e2a0f6c..c4c9e16 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c
@@ -143,7 +143,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c
index 9fb39fc..a234007 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c
@@ -129,7 +129,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c
index 29f15b4..e5c655b 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c
@@ -131,7 +131,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-avx.c
index ffadfbe..27ac294 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-avx.c
@@ -122,7 +122,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse2.c
index 0ceef7f..a32f77c 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse2.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse2.c
@@ -139,7 +139,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
index e0f9722..8f5cae1 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
@@ -122,7 +122,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-ssse3.c
index 258f3a8..0be1783 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-ssse3.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-ssse3.c
@@ -139,7 +139,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-xop.c
index e59b00e..c85e7af 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-xop.c
@@ -127,7 +127,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c
index b6700d5..8595595 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c
@@ -172,8 +172,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c
index 46a5d8a..7ccd0e6 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c
@@ -172,8 +172,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
index 35ca7ab..d363d6c 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
@@ -174,8 +174,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
index 268d7ba..ea7023b 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
@@ -174,8 +174,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
index bef9f23..aefd318 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
@@ -172,8 +172,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
index f493e3d..0f9e502 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
@@ -172,8 +172,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c
index 7f0b000..03c2aab 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c
@@ -177,8 +177,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c
index 7c83331..7b03529 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c
@@ -177,8 +177,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c
index a100e20..861b51c 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c
@@ -195,8 +195,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c
index 5af0fe1..cf619ff 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c
@@ -195,8 +195,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c
index 6e2428a..345d0f3 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -221,8 +221,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c
index 7d30307..7f596e0 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -221,8 +221,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c
index 2aaebb6..e0adff0 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -195,8 +195,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c
index 7d3030f..fdc42a5 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -195,8 +195,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c
index 77d47a2..080b5af 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -221,8 +221,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c
index 336cf15..1317048 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -221,8 +221,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c
index b42385c..13a1e1c 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c
@@ -200,8 +200,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c
index 3871cae..0ced592 100644
--- a/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-gemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c
@@ -200,8 +200,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c
index 217ac57..80f4f6e 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c
@@ -188,8 +188,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c
index f597fe4..3ca4e3f 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c
@@ -214,8 +214,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
index 01cb877..3520996 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
@@ -188,8 +188,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c
index fa1f77b..4182e21 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c
@@ -214,8 +214,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c
index 19bc947..6359db1 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c
@@ -193,8 +193,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c
index 30e816d..60ca7cb 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c
@@ -136,8 +136,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld64.c
index 1de0119..7b33153 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld64.c
@@ -138,8 +138,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-fp32-sse2-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
index 3aac3b2..dc96050 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
@@ -138,8 +138,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-fp32-sse2-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
index 4958f6c..fa393f9 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
@@ -140,8 +140,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
index 108775e..f6c4d52 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
@@ -136,8 +136,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-fp32-sse41-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
index f77fed1..defcb87 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
@@ -138,8 +138,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-fp32-ssse3-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-fp32-ssse3-ld128.c
index 21cfc72..08266dc 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-fp32-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-fp32-ssse3-ld128.c
@@ -138,8 +138,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c
index d78a28f..c12b8fa 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c
@@ -140,8 +140,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c
index e0d0e16..7d568c4 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c
@@ -141,8 +141,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c
index f4751f4..272f8ec 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c
@@ -143,8 +143,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c
index 493d47c..44e1ee2 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c
@@ -159,8 +159,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c
index 053a146..a8e6704 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c
@@ -161,8 +161,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c
index 15709ee..703fc86 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c
@@ -185,8 +185,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
index 13a1790..94971d7 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
@@ -187,8 +187,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c
index 1f25285..4122f9d 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c
@@ -159,8 +159,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
index 4050bd5..baba16c 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
@@ -161,8 +161,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c
index 1d16fcf..59d73cf 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c
@@ -185,8 +185,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
index c3c8c15..0170812 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
@@ -187,8 +187,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c
index 305c931..f021cf3 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c
@@ -164,8 +164,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c
index e4058c5..8ec30f4 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c
@@ -166,8 +166,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c
index 3b91043..573ab1c 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c
@@ -157,8 +157,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse2.c
index c27f102..31f4048 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse2.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse2.c
@@ -183,8 +183,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse41.c
index 89429b2..a448b3f 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse41.c
@@ -157,8 +157,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-ssse3.c
index cfdd6be..9e275fa 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-ssse3.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-ssse3.c
@@ -183,8 +183,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c
index 55514b6..54a91c4 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c
@@ -162,8 +162,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c
index 7f80b43..5f9b224 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c
@@ -208,9 +208,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c
index 6eecce2..8038a24 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c
@@ -208,9 +208,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
index d5206b1..d1b927a 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
@@ -212,9 +212,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
index 03c5f0e..ef8baa1 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
@@ -212,9 +212,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
index 6ee0312..5ed2f71 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
@@ -208,9 +208,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
index afab4c3..2493c6d 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
@@ -208,9 +208,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c
index 31fbe9b..ed9ae13 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c
@@ -213,9 +213,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c
index b9ca245..cb603fa 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c
@@ -213,9 +213,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c
index 1cec4fd..a94e8ae 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c
@@ -238,9 +238,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c
index a569469..6dc809a 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c
@@ -238,9 +238,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c
index 92eead7..df33351 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -274,9 +274,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c
index d24f297..41fc414 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -274,9 +274,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c
index 16ac9cc..3c17cfd 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -238,9 +238,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c
index 66a2622..642efe5 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -238,9 +238,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c
index 8a87b06..6e43d11 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -274,9 +274,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c
index 11b6d60..9f83b6a 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -274,9 +274,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c
index 3b43e36..85b670f 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c
@@ -243,9 +243,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c
index 4208b98..d876c7e 100644
--- a/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-gemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c
@@ -243,9 +243,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c
index 2f95444..84e896d 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c
@@ -231,9 +231,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c
index 25d9d51..ec7add0 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c
@@ -267,9 +267,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
index 1889b5b..dfdcc16 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
@@ -231,9 +231,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c
index a5e7f12..7146a7f 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c
@@ -267,9 +267,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c
index e9bf737..3312851 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c
@@ -236,9 +236,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c
index 51ced97..8c4e3ba 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c
@@ -165,9 +165,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-fp32-avx-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-fp32-avx-ld64.c
index 14080c3..7f30490 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-fp32-avx-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-fp32-avx-ld64.c
@@ -167,9 +167,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
index 8b48c2f..bf29d4c 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
@@ -169,9 +169,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
index bb50736..adc0ea8 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
@@ -171,9 +171,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
index adf2154..d7d2663 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
@@ -165,9 +165,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
index 914e429..880efc8 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
@@ -167,9 +167,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-fp32-ssse3-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-fp32-ssse3-ld128.c
index 4b4bcce..c264b9a 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-fp32-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-fp32-ssse3-ld128.c
@@ -169,9 +169,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-fp32-ssse3-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-fp32-ssse3-ld64.c
index 48f9034..f570529 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-fp32-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-fp32-ssse3-ld64.c
@@ -171,9 +171,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c
index 64c9089..639fe6b 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c
@@ -170,9 +170,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-fp32-xop-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-fp32-xop-ld64.c
index f658d1c..85683fd 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-fp32-xop-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-fp32-xop-ld64.c
@@ -172,9 +172,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c
index 97b3a83..b9f27ac 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c
@@ -195,9 +195,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c
index 9e3edf6..7385a51 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c
@@ -197,9 +197,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c
index ec25dbd..4d0053e 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c
@@ -231,9 +231,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c
index d8ff958..434f426 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c
@@ -233,9 +233,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c
index 8e503ef..bfa58ce 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c
@@ -195,9 +195,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c
index dfb509a..cf997be 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c
@@ -197,9 +197,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c
index 8cb390d..3b4aafe 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c
@@ -231,9 +231,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c
index 831ea5c..9229723 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c
@@ -233,9 +233,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c
index 7c308da..54fc170 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c
@@ -200,9 +200,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c
index a86ce19..ac8ef87 100644
--- a/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-gemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c
@@ -202,9 +202,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-avx.c
index 07de1be..0e1fd6b 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-avx.c
@@ -193,9 +193,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse2.c
index 4e6fece..2a4c9b1 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse2.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse2.c
@@ -229,9 +229,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c
index 81d37be..fc8a9ec 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c
@@ -193,9 +193,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-ssse3.c
index c173fda..a7d781b 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-ssse3.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-ssse3.c
@@ -229,9 +229,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-xop.c
index 3311ccf..a54da33 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-xop.c
@@ -198,9 +198,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c
index faaf01f..aadd64a 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c
@@ -243,10 +243,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c
index 0543d49..2c797f2 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c
@@ -243,10 +243,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
index 387525d..7d7edf8 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
@@ -248,10 +248,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c3 = (int8_t) _mm_extract_epi16(vout, 6);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
index e175021..7c90352 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
@@ -248,10 +248,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c3 = (int8_t) _mm_extract_epi16(vout, 6);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
index 52ed1b4..6fc8889 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
@@ -243,10 +243,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
index daee210..6d5fa95 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
@@ -243,10 +243,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c
index 484dbfb..891683f 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c
@@ -248,10 +248,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c
index c49adf5..07837d6 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c
@@ -248,10 +248,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c
index 6beac29..1027bec 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c
@@ -280,10 +280,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c
index 21f777b..1f4bf5a 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c
@@ -280,10 +280,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c
index 35fca32..9172eb1 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -325,10 +325,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c3 = (int8_t) _mm_extract_epi16(vout, 6);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
index 5a4a15d..6d84f3f 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -325,10 +325,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c3 = (int8_t) _mm_extract_epi16(vout, 6);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c
index 62b9158..13ae014 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -280,10 +280,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
index 9ea1783..8522e8c 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -280,10 +280,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c
index a16334b..1f4f901 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -325,10 +325,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c3 = (int8_t) _mm_extract_epi16(vout, 6);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
index 83b8adc..474c1dd 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -325,10 +325,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c3 = (int8_t) _mm_extract_epi16(vout, 6);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
index 08a979a..03bd704 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
@@ -285,10 +285,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c
index a1c0442..430d927 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c
@@ -285,10 +285,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
index ffb91cb..c33e288 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
@@ -273,10 +273,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c
index 20c8575..488e22d 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c
@@ -318,10 +318,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c3 = (int8_t) _mm_extract_epi16(vout, 6);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c
index 84c4413..6aaba0d 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c
@@ -273,10 +273,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c
index caf7650..48ac690 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c
@@ -318,10 +318,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c3 = (int8_t) _mm_extract_epi16(vout, 6);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c
index 6c0ead0..462b459 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c
@@ -278,10 +278,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
}
nc = 0;
diff --git a/src/qs8-igemm/MRx4c2-sse.c.in b/src/qs8-igemm/MRx4c2-sse.c.in
index 9a103a2..208a884 100644
--- a/src/qs8-igemm/MRx4c2-sse.c.in
+++ b/src/qs8-igemm/MRx4c2-sse.c.in
@@ -7,7 +7,7 @@
$assert not XOP or AVX
$assert not AVX or SSE == 4
$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
-$assert DATATYPE in ["QC8", "QS8"]
+$assert DATATYPE in ["QC8", "QS8", "QU8"]
$assert DATATYPE != "QC8" or REQUANTIZATION == "FP32"
$assert SSE != 3 or REQUANTIZATION != "FP32"
$assert VARIANT in ["LD64", "LD128"]
@@ -29,21 +29,22 @@
#include <xnnpack/math.h>
-$PARAMS_UNION = "xnn_qs8_minmax_params" if DATATYPE == "QC8" else "xnn_qs8_conv_minmax_params"
-$PARAMS_STRUCT = ("" if DATATYPE == "QC8" else REQUANTIZATION.lower() + "_") + ("sse4" if SSE >= 4 else "sse2")
+$PARAMS_UNION = "xnn_qs8_minmax_params" if DATATYPE == "QC8" else "xnn_%s_conv_minmax_params" % DATATYPE.lower()
+$PARAMS_STRUCT = ("" if DATATYPE == "QC8" else REQUANTIZATION.lower() + "_") + ("sse4" if SSE >= 4 and DATATYPE != "QU8" else "sse2")
+$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
$ISA = "xop" if XOP else "avx" if AVX else {2: "sse2", 3: "ssse3", 4: "sse41"}[SSE]
void xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_${MR}x4c2__${ISA}_${VARIANT.lower()}(
size_t mr,
size_t nc,
size_t kc,
size_t ks,
- const int8_t** restrict a,
+ const ${XINT8_T}** restrict a,
const void* restrict w,
- int8_t* restrict c,
+ ${XINT8_T}* restrict c,
size_t cm_stride,
size_t cn_stride,
size_t a_offset,
- const int8_t* zero,
+ const ${XINT8_T}* zero,
const union ${PARAMS_UNION} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
assert(mr != 0);
@@ -52,15 +53,15 @@
assert(kc != 0);
assert(ks != 0);
assert(ks % (${MR} * sizeof(void*)) == 0);
- assert(a_offset % sizeof(int8_t) == 0);
+ assert(a_offset % sizeof(${XINT8_T}) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
kc = round_up_po2(kc, 2);
- int8_t* c0 = c;
+ ${XINT8_T}* c0 = c;
$for M in range(1, MR):
- int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
+ ${XINT8_T}* c${M} = (${XINT8_T}*) ((uintptr_t) c${M-1} + cm_stride);
$if M % 2 == 0:
if XNN_UNPREDICTABLE(mr <= ${M}) {
c${M} = c${M-1};
@@ -83,20 +84,30 @@
size_t p = ks;
do {
$for M in range(MR):
- const int8_t* restrict a${M} = a[${M}];
+ const ${XINT8_T}* restrict a${M} = a[${M}];
if XNN_UNPREDICTABLE(a${M} != zero) {
- a${M} = (const int8_t*) ((uintptr_t) a${M} + a_offset);
+ a${M} = (const ${XINT8_T}*) ((uintptr_t) a${M} + a_offset);
}
a += ${MR};
size_t k = kc;
- while (k >= 8 * sizeof(int8_t)) {
+ $if DATATYPE == "QU8":
+ const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.kernel_zero_point);
+ $if SSE < 4:
+ const __m128i vzero = _mm_setzero_si128();
+ while (k >= 8 * sizeof(${XINT8_T})) {
$for M in range(MR):
const __m128i va${M} = _mm_loadl_epi64((const __m128i*) a${M});
- $if SSE == 4:
- const __m128i vxa${M} = _mm_cvtepi8_epi16(va${M});
+ $if DATATYPE == "QU8":
+ $if SSE == 4:
+ const __m128i vxa${M} = _mm_cvtepu8_epi16(va${M});
+ $else:
+ const __m128i vxa${M} = _mm_unpacklo_epi8(va${M}, vzero);
$else:
- const __m128i vxa${M} = _mm_srai_epi16(_mm_unpacklo_epi8(va${M}, va${M}), 8);
+ $if SSE == 4:
+ const __m128i vxa${M} = _mm_cvtepi8_epi16(va${M});
+ $else:
+ const __m128i vxa${M} = _mm_srai_epi16(_mm_unpacklo_epi8(va${M}, va${M}), 8);
a${M} += 8;
$if VARIANT == "LD128":
@@ -104,10 +115,14 @@
$if K == 0:
const __m128i vb${K}${K+1} = _mm_loadu_si128((const __m128i*) w);
$else:
- const __m128i vb${K}${K+1} = _mm_loadu_si128((const __m128i*) ((const int8_t*) w + ${K * 8}));
- const __m128i vsb${K}${K+1} = _mm_cmpgt_epi8(_mm_setzero_si128(), vb${K}${K+1});
- const __m128i vxb${K} = _mm_unpacklo_epi8(vb${K}${K+1}, vsb${K}${K+1});
- const __m128i vxb${K+1} = _mm_unpackhi_epi8(vb${K}${K+1}, vsb${K}${K+1});
+ const __m128i vb${K}${K+1} = _mm_loadu_si128((const __m128i*) ((const ${XINT8_T}*) w + ${K * 8}));
+ $if DATATYPE == "QU8":
+ const __m128i vxb${K} = _mm_sub_epi16(_mm_unpacklo_epi8(vb${K}${K+1}, vzero), vb_zero_point);
+ const __m128i vxb${K+1} = _mm_sub_epi16(_mm_unpackhi_epi8(vb${K}${K+1}, vzero), vb_zero_point);
+ $else:
+ const __m128i vsb${K}${K+1} = _mm_cmpgt_epi8(_mm_setzero_si128(), vb${K}${K+1});
+ const __m128i vxb${K} = _mm_unpacklo_epi8(vb${K}${K+1}, vsb${K}${K+1});
+ const __m128i vxb${K+1} = _mm_unpackhi_epi8(vb${K}${K+1}, vsb${K}${K+1});
$for M in range(MR):
$if XOP:
@@ -129,11 +144,17 @@
$if K == 0:
const __m128i vb${K} = _mm_loadl_epi64((const __m128i*) w);
$else:
- const __m128i vb${K} = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + ${K * 8}));
- $if SSE == 4:
- const __m128i vxb${K} = _mm_cvtepi8_epi16(vb${K});
+ const __m128i vb${K} = _mm_loadl_epi64((const __m128i*) ((const ${XINT8_T}*) w + ${K * 8}));
+ $if DATATYPE == "QU8":
+ $if SSE == 4:
+ const __m128i vxb${K} = _mm_sub_epi16(_mm_cvtepu8_epi16(vb${K}), vb_zero_point);
+ $else:
+ const __m128i vxb${K} = _mm_sub_epi16(_mm_unpacklo_epi8(vb${K}, vzero), vb_zero_point);
$else:
- const __m128i vxb${K} = _mm_srai_epi16(_mm_unpacklo_epi8(vb${K}, vb${K}), 8);
+ $if SSE == 4:
+ const __m128i vxb${K} = _mm_cvtepi8_epi16(vb${K});
+ $else:
+ const __m128i vxb${K} = _mm_srai_epi16(_mm_unpacklo_epi8(vb${K}, vb${K}), 8);
$for M in range(MR):
$if XOP:
@@ -143,21 +164,36 @@
vacc${M}x0123 = _mm_add_epi32(vacc${M}x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(${K}, ${K}, ${K}, ${K})), vxb${K}));
- w = (const void*) ((const int8_t*) w + 32);
- k -= 8 * sizeof(int8_t);
+ w = (const void*) ((const ${XINT8_T}*) w + 32);
+ k -= 8 * sizeof(${XINT8_T});
}
if (k != 0) {
$for M in range(MR):
const __m128i va${M} = _mm_loadl_epi64((const __m128i*) a${M});
- $if SSE == 4:
- const __m128i vxa${M} = _mm_cvtepi8_epi16(va${M});
+ $if DATATYPE == "QU8":
+ $if SSE == 4:
+ const __m128i vxa${M} = _mm_cvtepu8_epi16(va${M});
+ $else:
+ const __m128i vxa${M} = _mm_unpacklo_epi8(va${M}, vzero);
$else:
- const __m128i vxa${M} = _mm_srai_epi16(_mm_unpacklo_epi8(va${M}, va${M}), 8);
- a${M} = (const int8_t*) ((uintptr_t) a${M} + k);
+ $if SSE == 4:
+ const __m128i vxa${M} = _mm_cvtepi8_epi16(va${M});
+ $else:
+ const __m128i vxa${M} = _mm_srai_epi16(_mm_unpacklo_epi8(va${M}, va${M}), 8);
+ a${M} = (const ${XINT8_T}*) ((uintptr_t) a${M} + k);
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ w = (const void*) ((const ${XINT8_T}*) w + 8);
+ $if DATATYPE == "QU8":
+ $if SSE == 4:
+ const __m128i vxb0 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb0), vb_zero_point);
+ $else:
+ const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);
+ $else:
+ $if SSE == 4:
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
+ $else:
+ const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
$for M in range(MR):
$if XOP:
@@ -167,10 +203,19 @@
vacc${M}x0123 = _mm_add_epi32(vacc${M}x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- if (k > 2 * sizeof(int8_t)) {
+ if (k > 2 * sizeof(${XINT8_T})) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ w = (const void*) ((const ${XINT8_T}*) w + 8);
+ $if DATATYPE == "QU8":
+ $if SSE == 4:
+ const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point);
+ $else:
+ const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
+ $else:
+ $if SSE == 4:
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
+ $else:
+ const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
$for M in range(MR):
$if XOP:
@@ -180,10 +225,19 @@
vacc${M}x0123 = _mm_add_epi32(vacc${M}x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- if (k > 4 * sizeof(int8_t)) {
+ if (k > 4 * sizeof(${XINT8_T})) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ w = (const void*) ((const ${XINT8_T}*) w + 8);
+ $if DATATYPE == "QU8":
+ $if SSE == 4:
+ const __m128i vxb2 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb2), vb_zero_point);
+ $else:
+ const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
+ $else:
+ $if SSE == 4:
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
+ $else:
+ const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
$for M in range(MR):
$if XOP:
@@ -296,20 +350,29 @@
$for M in range(0, MR, 2):
__m128i vacc${M}${min(M+1, MR-1)}x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc${M}x0123, vacc${min(M+1, MR-1)}x0123), voutput_zero_point);
- $if SSE < 4:
- const __m128i voutput_min = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min);
- const __m128i voutput_max = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max);
- $for M in range(0, MR, 2):
- vacc${M}${min(M+1, MR-1)}x0123 = _mm_min_epi16(_mm_max_epi16(vacc${M}${min(M+1, MR-1)}x0123, voutput_min), voutput_max);
+ $if DATATYPE == "QU8":
+ $if MR > 2:
+ __m128i vout = _mm_packus_epi16(vacc0${min(1, MR-1)}x0123, vacc${min(2, MR-1)}${min(3, MR-1)}x0123);
+ $else:
+ __m128i vout = _mm_packus_epi16(vacc0${min(1, MR-1)}x0123, vacc0${min(1, MR-1)}x0123);
- $if MR > 2:
- __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc${min(2, MR-1)}${min(3, MR-1)}x0123);
+ vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min));
+ vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max));
$else:
- __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc0${min(1, MR-1)}x0123);
+ $if SSE < 4:
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min);
+ const __m128i voutput_max = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max);
+ $for M in range(0, MR, 2):
+ vacc${M}${min(M+1, MR-1)}x0123 = _mm_min_epi16(_mm_max_epi16(vacc${M}${min(M+1, MR-1)}x0123, voutput_min), voutput_max);
- $if SSE == 4:
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max));
+ $if MR > 2:
+ __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc${min(2, MR-1)}${min(3, MR-1)}x0123);
+ $else:
+ __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc0${min(1, MR-1)}x0123);
+
+ $if SSE == 4:
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max));
if (nc >= 4) {
$for M in reversed(range(1, MR)):
@@ -317,11 +380,11 @@
*((uint32_t*) c${M}) = (uint32_t) _mm_extract_epi32(vout, ${M});
$else:
*((uint32_t*) c${M}) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(${M}, ${M}, ${M}, ${M})));
- c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+ c${M} = (${XINT8_T}*) ((uintptr_t) c${M} + cn_stride);
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
- c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+ c0 = (${XINT8_T}*) ((uintptr_t) c0 + cn_stride);
- a = (const int8_t**restrict) ((uintptr_t) a - ks);
+ a = (const ${XINT8_T}**restrict) ((uintptr_t) a - ks);
nc -= 4;
} else {
@@ -334,11 +397,11 @@
if (nc & 1) {
$if SSE == 4:
$for M in reversed(range(MR)):
- *((int8_t*) c${M}) = (int8_t) _mm_extract_epi8(vout, ${M * 4});
+ *c${M} = (${XINT8_T}) _mm_extract_epi8(vout, ${M * 4});
$else:
$for M in reversed(range(1, MR)):
- *((int8_t*) c${M}) = (int8_t) _mm_extract_epi16(vout, ${M * 2});
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c${M} = (${XINT8_T}) _mm_extract_epi16(vout, ${M * 2});
+ *c0 = (${XINT8_T}) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/MRx4c8-sse.c.in b/src/qs8-igemm/MRx4c8-sse.c.in
index 1364e26..2a72587 100644
--- a/src/qs8-igemm/MRx4c8-sse.c.in
+++ b/src/qs8-igemm/MRx4c8-sse.c.in
@@ -7,7 +7,7 @@
$assert not XOP or AVX
$assert not AVX or SSE == 4
$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
-$assert DATATYPE in ["QC8", "QS8"]
+$assert DATATYPE in ["QC8", "QS8", "QU8"]
$assert DATATYPE != "QC8" or REQUANTIZATION == "FP32"
$assert VARIANT in ["LD64", "LD128"]
$assert MR <= 4
@@ -28,21 +28,22 @@
#include <xnnpack/math.h>
-$PARAMS_UNION = "xnn_qs8_minmax_params" if DATATYPE == "QC8" else "xnn_qs8_conv_minmax_params"
-$PARAMS_STRUCT = ("" if DATATYPE == "QC8" else REQUANTIZATION.lower() + "_") + ("sse4" if SSE >= 4 else "sse2")
+$PARAMS_UNION = "xnn_qs8_minmax_params" if DATATYPE == "QC8" else "xnn_%s_conv_minmax_params" % DATATYPE.lower()
+$PARAMS_STRUCT = ("" if DATATYPE == "QC8" else REQUANTIZATION.lower() + "_") + ("sse4" if SSE >= 4 and DATATYPE != "QU8" else "sse2")
+$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
$ISA = "xop" if XOP else "avx" if AVX else {2: "sse2", 3: "ssse3", 4: "sse41"}[SSE]
void xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_${MR}x4c8__${ISA}_${VARIANT.lower()}(
size_t mr,
size_t nc,
size_t kc,
size_t ks,
- const int8_t** restrict a,
+ const ${XINT8_T}** restrict a,
const void* restrict w,
- int8_t* restrict c,
+ ${XINT8_T}* restrict c,
size_t cm_stride,
size_t cn_stride,
size_t a_offset,
- const int8_t* zero,
+ const ${XINT8_T}* zero,
const union ${PARAMS_UNION} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
assert(mr != 0);
@@ -51,15 +52,15 @@
assert(kc != 0);
assert(ks != 0);
assert(ks % (${MR} * sizeof(void*)) == 0);
- assert(a_offset % sizeof(int8_t) == 0);
+ assert(a_offset % sizeof(${XINT8_T}) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
kc = round_up_po2(kc, 8);
- int8_t* c0 = c;
+ ${XINT8_T}* c0 = c;
$for M in range(1, MR):
- int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
+ ${XINT8_T}* c${M} = (${XINT8_T}*) ((uintptr_t) c${M-1} + cm_stride);
$if M % 2 == 0:
if XNN_UNPREDICTABLE(mr <= ${M}) {
c${M} = c${M-1};
@@ -84,20 +85,30 @@
size_t p = ks;
do {
$for M in range(MR):
- const int8_t* restrict a${M} = a[${M}];
+ const ${XINT8_T}* restrict a${M} = a[${M}];
if XNN_UNPREDICTABLE(a${M} != zero) {
- a${M} = (const int8_t*) ((uintptr_t) a${M} + a_offset);
+ a${M} = (const ${XINT8_T}*) ((uintptr_t) a${M} + a_offset);
}
a += ${MR};
size_t k = 0;
+ $if DATATYPE == "QU8":
+ const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.kernel_zero_point);
+ $if SSE < 4:
+ const __m128i vzero = _mm_setzero_si128();
while (k < kc) {
$for M in range(MR):
const __m128i va${M} = _mm_loadl_epi64((const __m128i*) a${M});
- $if SSE == 4:
- const __m128i vxa${M} = _mm_cvtepi8_epi16(va${M});
+ $if DATATYPE == "QU8":
+ $if SSE == 4:
+ const __m128i vxa${M} = _mm_cvtepu8_epi16(va${M});
+ $else:
+ const __m128i vxa${M} = _mm_unpacklo_epi8(va${M}, vzero);
$else:
- const __m128i vxa${M} = _mm_srai_epi16(_mm_unpacklo_epi8(va${M}, va${M}), 8);
+ $if SSE == 4:
+ const __m128i vxa${M} = _mm_cvtepi8_epi16(va${M});
+ $else:
+ const __m128i vxa${M} = _mm_srai_epi16(_mm_unpacklo_epi8(va${M}, va${M}), 8);
a${M} += 8;
$if VARIANT == "LD128":
@@ -105,10 +116,14 @@
$if N == 0:
const __m128i vb${N}${N+1} = _mm_load_si128((const __m128i*) w);
$else:
- const __m128i vb${N}${N+1} = _mm_load_si128((const __m128i*) ((const int8_t*) w + ${N * 8}));
- const __m128i vsb${N}${N+1} = _mm_cmpgt_epi8(_mm_setzero_si128(), vb${N}${N+1});
- const __m128i vxb${N} = _mm_unpacklo_epi8(vb${N}${N+1}, vsb${N}${N+1});
- const __m128i vxb${N+1} = _mm_unpackhi_epi8(vb${N}${N+1}, vsb${N}${N+1});
+ const __m128i vb${N}${N+1} = _mm_load_si128((const __m128i*) ((const ${XINT8_T}*) w + ${N * 8}));
+ $if DATATYPE == "QU8":
+ const __m128i vxb${N} = _mm_sub_epi16(_mm_unpacklo_epi8(vb${N}${N+1}, vzero), vb_zero_point);
+ const __m128i vxb${N+1} = _mm_sub_epi16(_mm_unpackhi_epi8(vb${N}${N+1}, vzero), vb_zero_point);
+ $else:
+ const __m128i vsb${N}${N+1} = _mm_cmpgt_epi8(_mm_setzero_si128(), vb${N}${N+1});
+ const __m128i vxb${N} = _mm_unpacklo_epi8(vb${N}${N+1}, vsb${N}${N+1});
+ const __m128i vxb${N+1} = _mm_unpackhi_epi8(vb${N}${N+1}, vsb${N}${N+1});
$for M in range(MR):
$if XOP:
@@ -122,11 +137,17 @@
$if N == 0:
const __m128i vb${N} = _mm_loadl_epi64((const __m128i*) w);
$else:
- const __m128i vb${N} = _mm_loadl_epi64((const __m128i*) ((const int8_t*) w + ${N * 8}));
- $if SSE == 4:
- const __m128i vxb${N} = _mm_cvtepi8_epi16(vb${N});
+ const __m128i vb${N} = _mm_loadl_epi64((const __m128i*) ((const ${XINT8_T}*) w + ${N * 8}));
+ $if DATATYPE == "QU8":
+ $if SSE == 4:
+ const __m128i vxb${N} = _mm_sub_epi16(_mm_cvtepu8_epi16(vb${N}), vb_zero_point);
+ $else:
+ const __m128i vxb${N} = _mm_sub_epi16(_mm_unpacklo_epi8(vb${N}, vzero), vb_zero_point);
$else:
- const __m128i vxb${N} = _mm_srai_epi16(_mm_unpacklo_epi8(vb${N}, vb${N}), 8);
+ $if SSE == 4:
+ const __m128i vxb${N} = _mm_cvtepi8_epi16(vb${N});
+ $else:
+ const __m128i vxb${N} = _mm_srai_epi16(_mm_unpacklo_epi8(vb${N}, vb${N}), 8);
$for M in range(MR):
$if XOP:
@@ -134,8 +155,8 @@
$else:
vacc${M}x${N} = _mm_add_epi32(vacc${M}x${N}, _mm_madd_epi16(vxa${M}, vxb${N}));
- w = (const void*) ((const int8_t*) w + 32);
- k += 8 * sizeof(int8_t);
+ w = (const void*) ((const ${XINT8_T}*) w + 32);
+ k += 8 * sizeof(${XINT8_T});
}
p -= ${MR} * sizeof(void*);
} while (p != 0);
@@ -253,20 +274,29 @@
$for M in range(0, MR, 2):
__m128i vacc${M}${min(M+1, MR-1)}x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc${M}x0123, vacc${min(M+1, MR-1)}x0123), voutput_zero_point);
- $if SSE < 4:
- const __m128i voutput_min = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min);
- const __m128i voutput_max = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max);
- $for M in range(0, MR, 2):
- vacc${M}${min(M+1, MR-1)}x0123 = _mm_min_epi16(_mm_max_epi16(vacc${M}${min(M+1, MR-1)}x0123, voutput_min), voutput_max);
+ $if DATATYPE == "QU8":
+ $if MR > 2:
+ __m128i vout = _mm_packus_epi16(vacc0${min(1, MR-1)}x0123, vacc${min(2, MR-1)}${min(3, MR-1)}x0123);
+ $else:
+ __m128i vout = _mm_packus_epi16(vacc0${min(1, MR-1)}x0123, vacc0${min(1, MR-1)}x0123);
- $if MR > 2:
- __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc${min(2, MR-1)}${min(3, MR-1)}x0123);
+ vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min));
+ vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max));
$else:
- __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc0${min(1, MR-1)}x0123);
+ $if SSE < 4:
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min);
+ const __m128i voutput_max = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max);
+ $for M in range(0, MR, 2):
+ vacc${M}${min(M+1, MR-1)}x0123 = _mm_min_epi16(_mm_max_epi16(vacc${M}${min(M+1, MR-1)}x0123, voutput_min), voutput_max);
- $if SSE == 4:
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max));
+ $if MR > 2:
+ __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc${min(2, MR-1)}${min(3, MR-1)}x0123);
+ $else:
+ __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc0${min(1, MR-1)}x0123);
+
+ $if SSE == 4:
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_max));
if (nc >= 4) {
$for M in reversed(range(1, MR)):
@@ -274,11 +304,11 @@
*((uint32_t*) c${M}) = (uint32_t) _mm_extract_epi32(vout, ${M});
$else:
*((uint32_t*) c${M}) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(${M}, ${M}, ${M}, ${M})));
- c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+ c${M} = (${XINT8_T}*) ((uintptr_t) c${M} + cn_stride);
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
- c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+ c0 = (${XINT8_T}*) ((uintptr_t) c0 + cn_stride);
- a = (const int8_t**restrict) ((uintptr_t) a - ks);
+ a = (const ${XINT8_T}**restrict) ((uintptr_t) a - ks);
nc -= 4;
} else {
@@ -291,11 +321,11 @@
if (nc & 1) {
$if SSE == 4:
$for M in reversed(range(MR)):
- *((int8_t*) c${M}) = (int8_t) _mm_extract_epi8(vout, ${M * 4});
+ *c${M} = (${XINT8_T}) _mm_extract_epi8(vout, ${M * 4});
$else:
$for M in reversed(range(1, MR)):
- *((int8_t*) c${M}) = (int8_t) _mm_extract_epi16(vout, ${M * 2});
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c${M} = (${XINT8_T}) _mm_extract_epi16(vout, ${M * 2});
+ *c0 = (${XINT8_T}) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c
index 2a2ede8..cb6b952 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c
@@ -92,7 +92,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -100,7 +100,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -108,7 +108,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -148,7 +148,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c
index 4a815b9..3a6470c 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c
@@ -92,7 +92,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -100,7 +100,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -108,7 +108,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -148,7 +148,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
index 925cfa5..02159fa 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
@@ -149,7 +149,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
index d8f5d52..99a6611 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
@@ -149,7 +149,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
index d363292..8003b56 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
@@ -92,7 +92,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -100,7 +100,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -108,7 +108,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -148,7 +148,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
index 43fdb55..bbce5a9 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
@@ -92,7 +92,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -100,7 +100,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -108,7 +108,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -148,7 +148,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c
index 1e3d6a8..dd3c288 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c
@@ -97,7 +97,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -105,7 +105,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -113,7 +113,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -153,7 +153,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c
index c9cd4c0..0499b23 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c
@@ -97,7 +97,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -105,7 +105,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -113,7 +113,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -153,7 +153,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c
index d84e43a..fbe2503 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld128.c
@@ -92,7 +92,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -100,7 +100,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -108,7 +108,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -164,7 +164,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c
index 0c41602..3c7cc20 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-avx-ld64.c
@@ -92,7 +92,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -100,7 +100,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -108,7 +108,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -164,7 +164,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c
index 28b48ca..e426312 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -181,7 +181,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
index 2b65df2..7835c4d 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -181,7 +181,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c
index 1f8b868..d1ead47 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -92,7 +92,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -100,7 +100,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -108,7 +108,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -164,7 +164,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c
index 6f97176..c6f5af2 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -92,7 +92,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -100,7 +100,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -108,7 +108,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -164,7 +164,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c
index d006ea0..ec18bb0 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -181,7 +181,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c
index 2d323e1..0b57f70 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -181,7 +181,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c
index a8fb8da..734dca9 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld128.c
@@ -97,7 +97,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -105,7 +105,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -113,7 +113,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -169,7 +169,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c
index 7cdc634..26b8ceb 100644
--- a/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-igemm/gen/1x4c2-minmax-gemmlowp-xop-ld64.c
@@ -97,7 +97,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -105,7 +105,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -113,7 +113,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -169,7 +169,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c
index c9be575..22ad639 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld128.c
@@ -120,7 +120,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c
index 0568e12..2fd9087 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c
@@ -122,7 +122,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
index 6711743..6cc6702 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
@@ -121,7 +121,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
index f16ea89..d62f339 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
@@ -123,7 +123,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
index 4bd1f41..c68b7cc 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
@@ -120,7 +120,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
index 2a31258..23fd0eb 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
@@ -122,7 +122,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c
index 6f85867..a53437f 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c
@@ -121,7 +121,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c
index db23866..e61ae95 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c
@@ -123,7 +123,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld128.c
index 3fdbf3a..f4aa440 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld128.c
@@ -125,7 +125,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c
index 6e56155..8226e3b 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c
@@ -127,7 +127,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c
index 17b8f27..74f69f7 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld128.c
@@ -136,7 +136,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c
index 69009c7..ca23fea 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-avx-ld64.c
@@ -138,7 +138,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c
index ec61f42..1d309a0 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld128.c
@@ -153,7 +153,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c
index 50c2d49..66beb10 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse2-ld64.c
@@ -155,7 +155,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c
index d406f4e..4cfa48f 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld128.c
@@ -136,7 +136,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c
index e2b9bef..43a3161 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-sse41-ld64.c
@@ -138,7 +138,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c
index faaf3f5..c8ce0c8 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld128.c
@@ -153,7 +153,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c
index b8bd4c5..2601753 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-ssse3-ld64.c
@@ -155,7 +155,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c
index c8d1f8e..e05b5f6 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld128.c
@@ -141,7 +141,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c
index 0d1cb95..9922c8b 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-gemmlowp-xop-ld64.c
@@ -143,7 +143,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c
index 6d12bd6..9a2b742 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld128.c
@@ -115,7 +115,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -125,7 +125,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -135,7 +135,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -184,8 +184,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c
index e89696a..a2800d9 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-avx-ld64.c
@@ -115,7 +115,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -125,7 +125,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -135,7 +135,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -184,8 +184,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
index b7f7f8d..3de5bc2 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
@@ -185,8 +185,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
index 398ee81..666d22b 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
@@ -185,8 +185,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
index 791ba21..dd270a9 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
@@ -115,7 +115,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -125,7 +125,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -135,7 +135,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -184,8 +184,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
index 1b839e6..56b1069 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
@@ -115,7 +115,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -125,7 +125,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -135,7 +135,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -184,8 +184,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c
index d80dda7..47ec5ca 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld128.c
@@ -120,7 +120,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -130,7 +130,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -140,7 +140,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -189,8 +189,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c
index f4f8b9a..24fa887 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-fp32-xop-ld64.c
@@ -120,7 +120,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -130,7 +130,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -140,7 +140,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -189,8 +189,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c
index 81f4c32..3f24371 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld128.c
@@ -115,7 +115,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -125,7 +125,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -135,7 +135,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -207,8 +207,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c
index 9d164eb..cd7dd70 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-avx-ld64.c
@@ -115,7 +115,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -125,7 +125,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -135,7 +135,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -207,8 +207,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c
index 39aea8e..f4f6283 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -232,8 +232,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c
index 128ac34..5e5e497 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -232,8 +232,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c
index 9245e7e..abd2c8e 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -115,7 +115,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -125,7 +125,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -135,7 +135,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -207,8 +207,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c
index f97b73c..65cf168 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -115,7 +115,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -125,7 +125,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -135,7 +135,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -207,8 +207,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c
index d3c9714..f744b3a 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -232,8 +232,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c
index bf47976..7e9f2a7 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -232,8 +232,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c
index aefa9c1..6291abf 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld128.c
@@ -120,7 +120,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -130,7 +130,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -140,7 +140,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -212,8 +212,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c
index f88b0f1..80929e3 100644
--- a/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-igemm/gen/2x4c2-minmax-gemmlowp-xop-ld64.c
@@ -120,7 +120,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -130,7 +130,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -140,7 +140,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -212,8 +212,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c
index aa4745d..6a24f81 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld128.c
@@ -149,8 +149,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld64.c
index 26f50da..4ca40e1 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-fp32-avx-ld64.c
@@ -151,8 +151,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse2-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
index e46e6ed..17d7560 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
@@ -150,8 +150,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse2-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
index 82e5115..49e69bf 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
@@ -152,8 +152,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse41-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
index 0cacb43..5f0e258 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
@@ -149,8 +149,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse41-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
index 69e9ef8..e6c77fd 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
@@ -151,8 +151,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld128.c
index dc0a620..59090cb 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld128.c
@@ -150,8 +150,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c
index e8e086e..f6d6210 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c
@@ -152,8 +152,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld128.c
index ada4f99..187b8bd 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld128.c
@@ -154,8 +154,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c
index 0f3b693..c82921f 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-fp32-xop-ld64.c
@@ -156,8 +156,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c
index 987dcb0..d6477d3 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld128.c
@@ -172,8 +172,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c
index 3206354..337df9a 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-avx-ld64.c
@@ -174,8 +174,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c
index 4f88e7c..f24fc1a 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld128.c
@@ -197,8 +197,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
index 29f6bb9..04fec0c 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
@@ -199,8 +199,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c
index fc1ce4d..a999775 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld128.c
@@ -172,8 +172,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
index 696aea2..b30c7eb 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
@@ -174,8 +174,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c
index ce4dbf1..630f5c0 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld128.c
@@ -197,8 +197,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
index e7c1e34..2ccb6b0 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
@@ -199,8 +199,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c
index c690081..14be109 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld128.c
@@ -177,8 +177,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c
index f69db77..e2e3ed8 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-gemmlowp-xop-ld64.c
@@ -179,8 +179,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c
index 39e1fc4..304a4f7 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld128.c
@@ -138,7 +138,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -150,7 +150,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -162,7 +162,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -221,9 +221,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c
index d939ab6..c1cec78 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-avx-ld64.c
@@ -138,7 +138,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -150,7 +150,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -162,7 +162,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -221,9 +221,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
index a2ecb0f..59ba39d 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
@@ -223,9 +223,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
index 4642e48..83ee714 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
@@ -223,9 +223,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
index 020f028..210f54a 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
@@ -138,7 +138,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -150,7 +150,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -162,7 +162,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -221,9 +221,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
index 9e3ff00..fbac5d1 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
@@ -138,7 +138,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -150,7 +150,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -162,7 +162,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -221,9 +221,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c
index d14a6c9..6190ade 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld128.c
@@ -143,7 +143,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -155,7 +155,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -167,7 +167,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -226,9 +226,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c
index a2b6c23..31575f4 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-fp32-xop-ld64.c
@@ -143,7 +143,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -155,7 +155,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -167,7 +167,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -226,9 +226,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c
index abed9b6..fef31cf 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld128.c
@@ -138,7 +138,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -150,7 +150,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -162,7 +162,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -251,9 +251,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c
index 37cc36d..bf89939 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-avx-ld64.c
@@ -138,7 +138,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -150,7 +150,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -162,7 +162,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -251,9 +251,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c
index 3836866..35b26a3 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -285,9 +285,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c
index 5d1886d..7d345ea 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -285,9 +285,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c
index 1ffbff8..fc27bef 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -138,7 +138,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -150,7 +150,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -162,7 +162,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -251,9 +251,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c
index 84d280b..975ce10 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -138,7 +138,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -150,7 +150,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -162,7 +162,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -251,9 +251,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c
index 92e9447..8074fcb 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -285,9 +285,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c
index 75700c3..ff2e060 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -285,9 +285,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c
index 6356e76..91e7c30 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld128.c
@@ -143,7 +143,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -155,7 +155,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -167,7 +167,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -256,9 +256,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c
index 4912f55..983aa01 100644
--- a/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-igemm/gen/3x4c2-minmax-gemmlowp-xop-ld64.c
@@ -143,7 +143,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -155,7 +155,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -167,7 +167,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -256,9 +256,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-fp32-avx-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-fp32-avx-ld128.c
index f198a7d..b4a8597 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-fp32-avx-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-fp32-avx-ld128.c
@@ -179,9 +179,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-fp32-avx-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-fp32-avx-ld64.c
index fb028a4..9cc0420 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-fp32-avx-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-fp32-avx-ld64.c
@@ -181,9 +181,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
index 8dc7fdb..b1095e5 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
@@ -181,9 +181,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
index 079ee69..4dfa033 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
@@ -183,9 +183,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
index d262753..5a51a7c 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
@@ -179,9 +179,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
index a3f8f88..303b7f8 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
@@ -181,9 +181,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-fp32-ssse3-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-fp32-ssse3-ld128.c
index 8ffc6fc..118cdd9 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-fp32-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-fp32-ssse3-ld128.c
@@ -181,9 +181,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-fp32-ssse3-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-fp32-ssse3-ld64.c
index 6c712bf..e9bf06c 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-fp32-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-fp32-ssse3-ld64.c
@@ -183,9 +183,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-fp32-xop-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-fp32-xop-ld128.c
index 55639ef..1a4e7db 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-fp32-xop-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-fp32-xop-ld128.c
@@ -184,9 +184,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-fp32-xop-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-fp32-xop-ld64.c
index 19f5904..1314632 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-fp32-xop-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-fp32-xop-ld64.c
@@ -186,9 +186,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c
index a13dd17..e3cf40a 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld128.c
@@ -209,9 +209,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c
index fbad091..dc1751e 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-avx-ld64.c
@@ -211,9 +211,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c
index e4200cd..8b65958 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld128.c
@@ -243,9 +243,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c
index a5f3732..b26e36d 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse2-ld64.c
@@ -245,9 +245,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c
index 15de5b5..6627108 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld128.c
@@ -209,9 +209,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c
index 610c200..7217bd9 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-sse41-ld64.c
@@ -211,9 +211,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c
index b5919d7..0adb317 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld128.c
@@ -243,9 +243,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c
index 3f6204f..00811cf 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-ssse3-ld64.c
@@ -245,9 +245,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c
index a866141..2c25542 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld128.c
@@ -214,9 +214,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c
index c297cc2..f2f08b8 100644
--- a/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-xop-ld64.c
@@ -216,9 +216,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c
index 9575adb..96dbae3 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld128.c
@@ -161,7 +161,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -175,7 +175,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -189,7 +189,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -257,10 +257,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c
index 86dbc98..e0fd57e 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-avx-ld64.c
@@ -161,7 +161,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -175,7 +175,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -189,7 +189,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -257,10 +257,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
index 31c84c2..06be3e4 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
@@ -259,10 +259,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c3 = (int8_t) _mm_extract_epi16(vout, 6);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
index 0d6a7da..958b7a9 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
@@ -259,10 +259,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c3 = (int8_t) _mm_extract_epi16(vout, 6);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
index 112e243..06d0d8a 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
@@ -161,7 +161,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -175,7 +175,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -189,7 +189,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -257,10 +257,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
index c8a3cca..ae535c4 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
@@ -161,7 +161,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -175,7 +175,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -189,7 +189,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -257,10 +257,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c
index efa3ada..ab8e9bb 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld128.c
@@ -166,7 +166,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -180,7 +180,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -194,7 +194,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -262,10 +262,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c
index 9f9ad98..b1f20e1 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-fp32-xop-ld64.c
@@ -166,7 +166,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -180,7 +180,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -194,7 +194,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -262,10 +262,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c
index 681ea5f..a5f5764 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld128.c
@@ -161,7 +161,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -175,7 +175,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -189,7 +189,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -294,10 +294,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c
index 7e66cc1..9a4da26 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-avx-ld64.c
@@ -161,7 +161,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -175,7 +175,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -189,7 +189,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -294,10 +294,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c
index 5875e0e..0c0d4d9 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld128.c
@@ -336,10 +336,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c3 = (int8_t) _mm_extract_epi16(vout, 6);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
index 89ba44a..b586305 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -336,10 +336,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c3 = (int8_t) _mm_extract_epi16(vout, 6);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c
index 64e1ff5..3d24064 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld128.c
@@ -161,7 +161,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -175,7 +175,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -189,7 +189,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -294,10 +294,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
index 8571512..0585add 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -161,7 +161,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -175,7 +175,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -189,7 +189,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -294,10 +294,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c
index 35d07a0..6f78980 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld128.c
@@ -336,10 +336,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c3 = (int8_t) _mm_extract_epi16(vout, 6);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
index 5d141e0..341314f 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -336,10 +336,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi16(vout, 6);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi16(vout, 4);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi16(vout, 2);
- *((int8_t*) c0) = (int8_t) _mm_cvtsi128_si32(vout);
+ *c3 = (int8_t) _mm_extract_epi16(vout, 6);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
index 49717e2..1d28198 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld128.c
@@ -166,7 +166,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -180,7 +180,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -194,7 +194,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -299,10 +299,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c
index d4b022a..61e46f2 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-gemmlowp-xop-ld64.c
@@ -166,7 +166,7 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb0 = _mm_srai_epi16(_mm_unpacklo_epi8(vb0, vb0), 8);
+ const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
@@ -180,7 +180,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb1 = _mm_srai_epi16(_mm_unpacklo_epi8(vb1, vb1), 8);
+ const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
@@ -194,7 +194,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
w = (const void*) ((const int8_t*) w + 8);
- const __m128i vxb2 = _mm_srai_epi16(_mm_unpacklo_epi8(vb2, vb2), 8);
+ const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
@@ -299,10 +299,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((int8_t*) c3) = (int8_t) _mm_extract_epi8(vout, 12);
- *((int8_t*) c2) = (int8_t) _mm_extract_epi8(vout, 8);
- *((int8_t*) c1) = (int8_t) _mm_extract_epi8(vout, 4);
- *((int8_t*) c0) = (int8_t) _mm_extract_epi8(vout, 0);
+ *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_extract_epi8(vout, 0);
}
nc = 0;
diff --git a/src/qu8-gemm/2x4c8-minmax-gemmlowp-sse2.c b/src/qu8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
similarity index 62%
rename from src/qu8-gemm/2x4c8-minmax-gemmlowp-sse2.c
rename to src/qu8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
index c3ea059..f158c2d 100644
--- a/src/qu8-gemm/2x4c8-minmax-gemmlowp-sse2.c
+++ b/src/qu8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
@@ -1,38 +1,21 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/MRx4c8-sse.c.in
+// Generator: tools/xngen
//
-// Copyright 2019 Google LLC
+// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
#include <assert.h>
-#include <immintrin.h>
+#include <emmintrin.h>
#include <xnnpack/gemm.h>
#include <xnnpack/math.h>
-static inline __m128i sse_reduce4_i32(__m128i x, __m128i y, __m128i z, __m128i w) {
-#if defined(__SSSE3__) && !defined(__ANDROID__)
- // xxyy = ( y2 + y3, y0 + y1, x2 + x3, x0 + x1 )
- const __m128i xxyy = _mm_hadd_epi32(x, y);
- // zzww = ( w2 + w3, w0 + w1, z2 + z3, z0 + z1 )
- const __m128i zzww = _mm_hadd_epi32(z, w);
- // xyzw = ( w0 + w1 + w2 + w3, y0 + y1 + y2 + y3, z0 + z1 + z2 + z3, x0 + x1 + x2 + x3 )
- return _mm_hadd_epi32(xxyy, zzww);
-#else
- // xzxz = ( z1 + z3, x1 + x3, z0 + z2, x0 + x2 )
- const __m128i xzxz = _mm_add_epi32(_mm_unpacklo_epi32(x, z), _mm_unpackhi_epi32(x, z));
- // ywyw = ( w1 + w3, y1 + y3, w0 + w2, y0 + y2 )
- const __m128i ywyw = _mm_add_epi32(_mm_unpacklo_epi32(y, w), _mm_unpackhi_epi32(y, w));
- // xyzw = ( w0 + w2 + w1 + w3, y0 + y2 + y1 + y3, z0 + z2 + z1 + z3, x0 + x2 + x1 + x3 )
- return _mm_add_epi32(_mm_unpacklo_epi32(xzxz, ywyw), _mm_unpackhi_epi32(xzxz, ywyw));
-#endif
-}
-
-void xnn_qu8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse2(
+void xnn_qu8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse2_ld64(
size_t mr,
size_t nc,
size_t kc,
@@ -42,13 +25,13 @@
uint8_t* restrict c,
size_t cm_stride,
size_t cn_stride,
- const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+ const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
assert(mr != 0);
assert(mr <= 2);
assert(nc != 0);
assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
+ assert(kc % sizeof(uint8_t) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
@@ -58,26 +41,26 @@
uint8_t* c0 = c;
const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
- if (mr != 2) {
+ if XNN_UNPREDICTABLE(mr != 2) {
a1 = a0;
c1 = c0;
}
- const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.kernel_zero_point);
-
do {
- __m128i vacc00 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
- __m128i vacc01 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
- __m128i vacc02 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
- __m128i vacc03 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
- __m128i vacc10 = vacc00;
- __m128i vacc11 = vacc01;
- __m128i vacc12 = vacc02;
- __m128i vacc13 = vacc03;
- w = (const void*) ((uintptr_t) w + 16);
+ __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+ __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+ __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+ __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+ __m128i vacc1x0 = vacc0x0;
+ __m128i vacc1x1 = vacc0x1;
+ __m128i vacc1x2 = vacc0x2;
+ __m128i vacc1x3 = vacc0x3;
+ w = (const void*) ((const int32_t*) w + 4);
+ size_t k = 0;
+ const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.kernel_zero_point);
const __m128i vzero = _mm_setzero_si128();
- for (size_t k = 0; k < kc; k += 8 * sizeof(uint8_t)) {
+ while (k < kc) {
const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);
a0 += 8;
@@ -87,26 +70,36 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);
- const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
- const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
- const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
- const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
- const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
- const __m128i vxb3 = _mm_sub_epi16(_mm_unpacklo_epi8(vb3, vzero), vb_zero_point);
- w = (const void*) ((uintptr_t) w + 32);
- vacc00 = _mm_add_epi32(vacc00, _mm_madd_epi16(vxa0, vxb0));
- vacc01 = _mm_add_epi32(vacc01, _mm_madd_epi16(vxa0, vxb1));
- vacc02 = _mm_add_epi32(vacc02, _mm_madd_epi16(vxa0, vxb2));
- vacc03 = _mm_add_epi32(vacc03, _mm_madd_epi16(vxa0, vxb3));
- vacc10 = _mm_add_epi32(vacc10, _mm_madd_epi16(vxa1, vxb0));
- vacc11 = _mm_add_epi32(vacc11, _mm_madd_epi16(vxa1, vxb1));
- vacc12 = _mm_add_epi32(vacc12, _mm_madd_epi16(vxa1, vxb2));
- vacc13 = _mm_add_epi32(vacc13, _mm_madd_epi16(vxa1, vxb3));
+ vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+ vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
+ const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
+
+ vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+ vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
+ const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
+
+ vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+ vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
+ const __m128i vxb3 = _mm_sub_epi16(_mm_unpacklo_epi8(vb3, vzero), vb_zero_point);
+
+ vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+ vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+
+ w = (const void*) ((const uint8_t*) w + 32);
+ k += 8 * sizeof(uint8_t);
}
- __m128i vacc0x0123 = sse_reduce4_i32(vacc00, vacc01, vacc02, vacc03);
- __m128i vacc1x0123 = sse_reduce4_i32(vacc10, vacc11, vacc12, vacc13);
+ const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
+ const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
+ const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
+ const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
+
+ __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
+ __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
@@ -117,8 +110,8 @@
const __m128i vabsacc0x0123 = _mm_sub_epi32(_mm_xor_si128(vacc0x0123, vnmask0x0123), vnmask0x0123);
const __m128i vabsacc1x0123 = _mm_sub_epi32(_mm_xor_si128(vacc1x0123, vnmask1x0123), vnmask1x0123);
- const __m128i vabsacc0x1032 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(2, 3, 0, 1));
- const __m128i vabsacc1x1032 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i vabsacc1x1133 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
const __m128i vabsprod1x02 = _mm_mul_epu32(vabsacc1x0123, vmultiplier);
@@ -132,8 +125,8 @@
const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
const __m128i vq31prod1x02 = _mm_srli_epi64(_mm_add_epi64(vprod1x02, vrounding), 31);
- const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1032, vmultiplier);
- const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1032, vmultiplier);
+ const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
+ const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1133, vmultiplier);
const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
const __m128i vnmask1x13 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -153,7 +146,6 @@
const __m128i vq31prod1x0123 = _mm_shuffle_epi32(vq31prod1x0213, _MM_SHUFFLE(3, 1, 2, 0));
const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
-
const __m128i vrem0x0123 =
_mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
const __m128i vrem1x0123 =
@@ -161,28 +153,30 @@
const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.shift);
-
vacc0x0123 =
_mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
vacc1x0123 =
_mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
- const __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+ __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
__m128i vout = _mm_packus_epi16(vacc01x0123, vacc01x0123);
- vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max));
+
vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min));
+ vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
- *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_srli_epi64(vout, 32));
-
- a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
- a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
+ vout = _mm_srli_si128(vout, 4);
+ *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
+ a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
+ a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
+
nc -= 4;
} else {
if (nc & 2) {
@@ -193,8 +187,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((uint8_t*) c0) = (uint8_t) _mm_cvtsi128_si32(vout);
- *((uint8_t*) c1) = (uint8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (uint8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (uint8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qu8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c b/src/qu8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
new file mode 100644
index 0000000..b1d7194
--- /dev/null
+++ b/src/qu8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
@@ -0,0 +1,171 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/MRx4c8-sse.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse41_ld64(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const uint8_t* restrict a,
+ size_t a_stride,
+ const void* restrict w,
+ uint8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 2);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(uint8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 8);
+ const uint8_t* a0 = a;
+ uint8_t* c0 = c;
+ const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
+ uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+
+ do {
+ __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+ __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+ __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+ __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+ __m128i vacc1x0 = vacc0x0;
+ __m128i vacc1x1 = vacc0x1;
+ __m128i vacc1x2 = vacc0x2;
+ __m128i vacc1x3 = vacc0x3;
+ w = (const void*) ((const int32_t*) w + 4);
+
+ size_t k = 0;
+ const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.kernel_zero_point);
+ while (k < kc) {
+ const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+ const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
+ a0 += 8;
+ const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+ const __m128i vxa1 = _mm_cvtepu8_epi16(va1);
+ a1 += 8;
+
+ const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+ const __m128i vxb0 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb0), vb_zero_point);
+
+ vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+ vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
+ const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point);
+
+ vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+ vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
+ const __m128i vxb2 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb2), vb_zero_point);
+
+ vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+ vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
+ const __m128i vxb3 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb3), vb_zero_point);
+
+ vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+ vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+
+ w = (const void*) ((const uint8_t*) w + 32);
+ k += 8 * sizeof(uint8_t);
+ }
+
+ const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+ const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+ const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+ const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+
+ __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+ __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+
+ const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
+ const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
+
+ const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+
+ const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
+ const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
+
+ const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
+ const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
+
+ const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
+ const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
+ const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
+ const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
+
+ const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
+ const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
+
+ const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
+ const __m128i vrem0x0123 =
+ _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
+ const __m128i vrem1x0123 =
+ _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
+
+ const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
+ const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.shift);
+ vacc0x0123 =
+ _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
+ vacc1x0123 =
+ _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
+
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
+ __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+ __m128i vout = _mm_packus_epi16(vacc01x0123, vacc01x0123);
+
+ vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min));
+ vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max));
+
+ if (nc >= 4) {
+ *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+ *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+
+ c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
+ c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
+
+ a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
+ a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
+
+ nc -= 4;
+ } else {
+ if (nc & 2) {
+ *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+ c0 += 2;
+ *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+ c1 += 2;
+ vout = _mm_srli_epi32(vout, 16);
+ }
+ if (nc & 1) {
+ *c0 = (uint8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (uint8_t) _mm_extract_epi8(vout, 4);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qu8-gemm/2x4c8-minmax-gemmlowp-sse2.c b/src/qu8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
similarity index 60%
copy from src/qu8-gemm/2x4c8-minmax-gemmlowp-sse2.c
copy to src/qu8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
index c3ea059..c814be4 100644
--- a/src/qu8-gemm/2x4c8-minmax-gemmlowp-sse2.c
+++ b/src/qu8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
@@ -1,38 +1,21 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/MRx4c8-sse.c.in
+// Generator: tools/xngen
//
-// Copyright 2019 Google LLC
+// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
#include <assert.h>
-#include <immintrin.h>
+#include <tmmintrin.h>
#include <xnnpack/gemm.h>
#include <xnnpack/math.h>
-static inline __m128i sse_reduce4_i32(__m128i x, __m128i y, __m128i z, __m128i w) {
-#if defined(__SSSE3__) && !defined(__ANDROID__)
- // xxyy = ( y2 + y3, y0 + y1, x2 + x3, x0 + x1 )
- const __m128i xxyy = _mm_hadd_epi32(x, y);
- // zzww = ( w2 + w3, w0 + w1, z2 + z3, z0 + z1 )
- const __m128i zzww = _mm_hadd_epi32(z, w);
- // xyzw = ( w0 + w1 + w2 + w3, y0 + y1 + y2 + y3, z0 + z1 + z2 + z3, x0 + x1 + x2 + x3 )
- return _mm_hadd_epi32(xxyy, zzww);
-#else
- // xzxz = ( z1 + z3, x1 + x3, z0 + z2, x0 + x2 )
- const __m128i xzxz = _mm_add_epi32(_mm_unpacklo_epi32(x, z), _mm_unpackhi_epi32(x, z));
- // ywyw = ( w1 + w3, y1 + y3, w0 + w2, y0 + y2 )
- const __m128i ywyw = _mm_add_epi32(_mm_unpacklo_epi32(y, w), _mm_unpackhi_epi32(y, w));
- // xyzw = ( w0 + w2 + w1 + w3, y0 + y2 + y1 + y3, z0 + z2 + z1 + z3, x0 + x2 + x1 + x3 )
- return _mm_add_epi32(_mm_unpacklo_epi32(xzxz, ywyw), _mm_unpackhi_epi32(xzxz, ywyw));
-#endif
-}
-
-void xnn_qu8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse2(
+void xnn_qu8_gemm_minmax_gemmlowp_ukernel_2x4c8__ssse3_ld64(
size_t mr,
size_t nc,
size_t kc,
@@ -42,13 +25,13 @@
uint8_t* restrict c,
size_t cm_stride,
size_t cn_stride,
- const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+ const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
assert(mr != 0);
assert(mr <= 2);
assert(nc != 0);
assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
+ assert(kc % sizeof(uint8_t) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
@@ -58,26 +41,26 @@
uint8_t* c0 = c;
const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
- if (mr != 2) {
+ if XNN_UNPREDICTABLE(mr != 2) {
a1 = a0;
c1 = c0;
}
- const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.kernel_zero_point);
-
do {
- __m128i vacc00 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
- __m128i vacc01 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
- __m128i vacc02 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
- __m128i vacc03 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
- __m128i vacc10 = vacc00;
- __m128i vacc11 = vacc01;
- __m128i vacc12 = vacc02;
- __m128i vacc13 = vacc03;
- w = (const void*) ((uintptr_t) w + 16);
+ __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+ __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+ __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+ __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+ __m128i vacc1x0 = vacc0x0;
+ __m128i vacc1x1 = vacc0x1;
+ __m128i vacc1x2 = vacc0x2;
+ __m128i vacc1x3 = vacc0x3;
+ w = (const void*) ((const int32_t*) w + 4);
+ size_t k = 0;
+ const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.kernel_zero_point);
const __m128i vzero = _mm_setzero_si128();
- for (size_t k = 0; k < kc; k += 8 * sizeof(uint8_t)) {
+ while (k < kc) {
const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);
a0 += 8;
@@ -87,26 +70,36 @@
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);
- const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
- const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
- const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
- const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
- const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
- const __m128i vxb3 = _mm_sub_epi16(_mm_unpacklo_epi8(vb3, vzero), vb_zero_point);
- w = (const void*) ((uintptr_t) w + 32);
- vacc00 = _mm_add_epi32(vacc00, _mm_madd_epi16(vxa0, vxb0));
- vacc01 = _mm_add_epi32(vacc01, _mm_madd_epi16(vxa0, vxb1));
- vacc02 = _mm_add_epi32(vacc02, _mm_madd_epi16(vxa0, vxb2));
- vacc03 = _mm_add_epi32(vacc03, _mm_madd_epi16(vxa0, vxb3));
- vacc10 = _mm_add_epi32(vacc10, _mm_madd_epi16(vxa1, vxb0));
- vacc11 = _mm_add_epi32(vacc11, _mm_madd_epi16(vxa1, vxb1));
- vacc12 = _mm_add_epi32(vacc12, _mm_madd_epi16(vxa1, vxb2));
- vacc13 = _mm_add_epi32(vacc13, _mm_madd_epi16(vxa1, vxb3));
+ vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+ vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
+ const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
+
+ vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+ vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
+ const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
+
+ vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+ vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
+ const __m128i vxb3 = _mm_sub_epi16(_mm_unpacklo_epi8(vb3, vzero), vb_zero_point);
+
+ vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+ vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+
+ w = (const void*) ((const uint8_t*) w + 32);
+ k += 8 * sizeof(uint8_t);
}
- __m128i vacc0x0123 = sse_reduce4_i32(vacc00, vacc01, vacc02, vacc03);
- __m128i vacc1x0123 = sse_reduce4_i32(vacc10, vacc11, vacc12, vacc13);
+ const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+ const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+ const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+ const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+
+ __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+ __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
@@ -114,11 +107,11 @@
const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);
const __m128i vnmask1x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc1x0123);
- const __m128i vabsacc0x0123 = _mm_sub_epi32(_mm_xor_si128(vacc0x0123, vnmask0x0123), vnmask0x0123);
- const __m128i vabsacc1x0123 = _mm_sub_epi32(_mm_xor_si128(vacc1x0123, vnmask1x0123), vnmask1x0123);
+ const __m128i vabsacc0x0123 = _mm_abs_epi32(vacc0x0123);
+ const __m128i vabsacc1x0123 = _mm_abs_epi32(vacc1x0123);
- const __m128i vabsacc0x1032 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(2, 3, 0, 1));
- const __m128i vabsacc1x1032 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i vabsacc1x1133 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
const __m128i vabsprod1x02 = _mm_mul_epu32(vabsacc1x0123, vmultiplier);
@@ -132,8 +125,8 @@
const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
const __m128i vq31prod1x02 = _mm_srli_epi64(_mm_add_epi64(vprod1x02, vrounding), 31);
- const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1032, vmultiplier);
- const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1032, vmultiplier);
+ const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
+ const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1133, vmultiplier);
const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
const __m128i vnmask1x13 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -153,7 +146,6 @@
const __m128i vq31prod1x0123 = _mm_shuffle_epi32(vq31prod1x0213, _MM_SHUFFLE(3, 1, 2, 0));
const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
-
const __m128i vrem0x0123 =
_mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
const __m128i vrem1x0123 =
@@ -161,28 +153,30 @@
const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.shift);
-
vacc0x0123 =
_mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
vacc1x0123 =
_mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
- const __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+ __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
__m128i vout = _mm_packus_epi16(vacc01x0123, vacc01x0123);
- vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max));
+
vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min));
+ vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
- *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_srli_epi64(vout, 32));
-
- a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
- a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
+ vout = _mm_srli_si128(vout, 4);
+ *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
+ a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
+ a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
+
nc -= 4;
} else {
if (nc & 2) {
@@ -193,8 +187,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((uint8_t*) c0) = (uint8_t) _mm_cvtsi128_si32(vout);
- *((uint8_t*) c1) = (uint8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (uint8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (uint8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qu8-gemm/4x4c2-minmax-gemmlowp-sse2.c b/src/qu8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
similarity index 85%
rename from src/qu8-gemm/4x4c2-minmax-gemmlowp-sse2.c
rename to src/qu8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
index 98b483f..720379d 100644
--- a/src/qu8-gemm/4x4c2-minmax-gemmlowp-sse2.c
+++ b/src/qu8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -1,20 +1,22 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/MRx4c2-sse.c.in
+// Generator: tools/xngen
//
-// Copyright 2019 Google LLC
+// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
#include <assert.h>
-#include <immintrin.h>
+#include <emmintrin.h>
#include <xnnpack/gemm.h>
#include <xnnpack/math.h>
-void xnn_qu8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse2(
+
+void xnn_qu8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse2_ld64(
size_t mr,
size_t nc,
size_t kc,
@@ -24,13 +26,13 @@
uint8_t* restrict c,
size_t cm_stride,
size_t cn_stride,
- const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+ const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
assert(mr != 0);
assert(mr <= 4);
assert(nc != 0);
assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
+ assert(kc % sizeof(uint8_t) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
@@ -52,22 +54,21 @@
}
const uint8_t* a3 = (const uint8_t*) ((uintptr_t) a2 + a_stride);
uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride);
- if (mr != 4) {
+ if XNN_UNPREDICTABLE(mr != 4) {
a3 = a2;
c3 = c2;
}
- const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.kernel_zero_point);
-
do {
__m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
__m128i vacc1x0123 = vacc0x0123;
__m128i vacc2x0123 = vacc0x0123;
__m128i vacc3x0123 = vacc0x0123;
- w = (const void*) ((uintptr_t) w + 16);
+ w = (const void*) ((const int32_t*) w + 4);
- const __m128i vzero = _mm_setzero_si128();
size_t k = kc;
+ const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.kernel_zero_point);
+ const __m128i vzero = _mm_setzero_si128();
while (k >= 8 * sizeof(uint8_t)) {
const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);
@@ -93,8 +94,7 @@
_mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-
- const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -105,8 +105,7 @@
_mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-
- const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -117,10 +116,8 @@
_mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
- const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
const __m128i vxb3 = _mm_sub_epi16(_mm_unpacklo_epi8(vb3, vzero), vb_zero_point);
- w = (const void*) ((uintptr_t) w + 32);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
@@ -131,25 +128,26 @@
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+ w = (const void*) ((const uint8_t*) w + 32);
k -= 8 * sizeof(uint8_t);
}
if (k != 0) {
const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- a0 = (const uint8_t*) ((uintptr_t) a0 + k);
const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);
+ a0 = (const uint8_t*) ((uintptr_t) a0 + k);
const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- a1 = (const uint8_t*) ((uintptr_t) a1 + k);
const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero);
+ a1 = (const uint8_t*) ((uintptr_t) a1 + k);
const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
- a2 = (const uint8_t*) ((uintptr_t) a2 + k);
const __m128i vxa2 = _mm_unpacklo_epi8(va2, vzero);
+ a2 = (const uint8_t*) ((uintptr_t) a2 + k);
const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
- a3 = (const uint8_t*) ((uintptr_t) a3 + k);
const __m128i vxa3 = _mm_unpacklo_epi8(va3, vzero);
+ a3 = (const uint8_t*) ((uintptr_t) a3 + k);
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);
+ w = (const void*) ((const uint8_t*) w + 8);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -162,8 +160,8 @@
if (k > 2 * sizeof(uint8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
+ w = (const void*) ((const uint8_t*) w + 8);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -176,8 +174,8 @@
if (k > 4 * sizeof(uint8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
+ w = (const void*) ((const uint8_t*) w + 8);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -204,10 +202,10 @@
const __m128i vabsacc2x0123 = _mm_sub_epi32(_mm_xor_si128(vacc2x0123, vnmask2x0123), vnmask2x0123);
const __m128i vabsacc3x0123 = _mm_sub_epi32(_mm_xor_si128(vacc3x0123, vnmask3x0123), vnmask3x0123);
- const __m128i vabsacc0x1032 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(2, 3, 0, 1));
- const __m128i vabsacc1x1032 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(2, 3, 0, 1));
- const __m128i vabsacc2x1032 = _mm_shuffle_epi32(vabsacc2x0123, _MM_SHUFFLE(2, 3, 0, 1));
- const __m128i vabsacc3x1032 = _mm_shuffle_epi32(vabsacc3x0123, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i vabsacc1x1133 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i vabsacc2x1133 = _mm_shuffle_epi32(vabsacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i vabsacc3x1133 = _mm_shuffle_epi32(vabsacc3x0123, _MM_SHUFFLE(3, 3, 1, 1));
const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
const __m128i vabsprod1x02 = _mm_mul_epu32(vabsacc1x0123, vmultiplier);
@@ -229,10 +227,10 @@
const __m128i vq31prod2x02 = _mm_srli_epi64(_mm_add_epi64(vprod2x02, vrounding), 31);
const __m128i vq31prod3x02 = _mm_srli_epi64(_mm_add_epi64(vprod3x02, vrounding), 31);
- const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1032, vmultiplier);
- const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1032, vmultiplier);
- const __m128i vabsprod2x13 = _mm_mul_epu32(vabsacc2x1032, vmultiplier);
- const __m128i vabsprod3x13 = _mm_mul_epu32(vabsacc3x1032, vmultiplier);
+ const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
+ const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1133, vmultiplier);
+ const __m128i vabsprod2x13 = _mm_mul_epu32(vabsacc2x1133, vmultiplier);
+ const __m128i vabsprod3x13 = _mm_mul_epu32(vabsacc3x1133, vmultiplier);
const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
const __m128i vnmask1x13 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -264,7 +262,6 @@
const __m128i vq31prod3x0123 = _mm_shuffle_epi32(vq31prod3x0213, _MM_SHUFFLE(3, 1, 2, 0));
const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
-
const __m128i vrem0x0123 =
_mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
const __m128i vrem1x0123 =
@@ -275,8 +272,7 @@
_mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.shift);
-
+ const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse2.shift);
vacc0x0123 =
_mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
vacc1x0123 =
@@ -287,28 +283,33 @@
_mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
- const __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
- const __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+ __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+ __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+
__m128i vout = _mm_packus_epi16(vacc01x0123, vacc23x0123);
- vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max));
+
vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min));
+ vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
- *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_srli_epi64(vout, 32));
- *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(_mm_unpackhi_epi32(vout, vout));
- *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(_mm_srli_si128(vout, 12));
-
- a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
- a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
- a2 = (const uint8_t*) ((uintptr_t) a2 - kc);
- a3 = (const uint8_t*) ((uintptr_t) a3 - kc);
+ vout = _mm_srli_si128(vout, 4);
+ *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+ vout = _mm_srli_si128(vout, 4);
+ *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
+ vout = _mm_srli_si128(vout, 4);
+ *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(vout);
c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
c3 = (uint8_t*) ((uintptr_t) c3 + cn_stride);
+ a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
+ a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
+ a2 = (const uint8_t*) ((uintptr_t) a2 - kc);
+ a3 = (const uint8_t*) ((uintptr_t) a3 - kc);
+
nc -= 4;
} else {
if (nc & 2) {
@@ -323,10 +324,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((uint8_t*) c0) = (uint8_t) _mm_cvtsi128_si32(vout);
- *((uint8_t*) c1) = (uint8_t) _mm_extract_epi16(vout, 2);
- *((uint8_t*) c2) = (uint8_t) _mm_extract_epi16(vout, 4);
- *((uint8_t*) c3) = (uint8_t) _mm_extract_epi16(vout, 6);
+ *c0 = (uint8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (uint8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (uint8_t) _mm_extract_epi16(vout, 4);
+ *c3 = (uint8_t) _mm_extract_epi16(vout, 6);
}
nc = 0;
diff --git a/src/qu8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qu8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
new file mode 100644
index 0000000..b9a2cb3
--- /dev/null
+++ b/src/qu8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -0,0 +1,292 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/MRx4c2-sse.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+
+void xnn_qu8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse41_ld64(  // 4-row x 4-column QU8 GEMM microkernel: SSE4.1, 64-bit weight loads, gemmlowp-style requantization
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const uint8_t* restrict a,
+ size_t a_stride,
+ const void* restrict w,
+ uint8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(uint8_t) == 0);  // kc is measured in bytes of uint8 input
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 2);  // weights are packed in pairs along K ("c2" layout), so round K up to a multiple of 2
+ const uint8_t* a0 = a;
+ uint8_t* c0 = c;
+ const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
+ uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {  // when fewer than 4 rows are valid, alias the extra row pointers to the last valid row
+ a1 = a0;
+ c1 = c0;
+ }
+ const uint8_t* a2 = (const uint8_t*) ((uintptr_t) a1 + a_stride);
+ uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const uint8_t* a3 = (const uint8_t*) ((uintptr_t) a2 + a_stride);
+ uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+
+ do {
+ __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);  // initialize all 4 row accumulators from the packed bias (first 4 int32 of w)
+ __m128i vacc1x0123 = vacc0x0123;
+ __m128i vacc2x0123 = vacc0x0123;
+ __m128i vacc3x0123 = vacc0x0123;
+ w = (const void*) ((const int32_t*) w + 4);
+
+ size_t k = kc;
+ const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.kernel_zero_point);  // kernel (weight) zero point; params layout is shared with the SSE2 variant
+ while (k >= 8 * sizeof(uint8_t)) {  // main loop: 8 K elements (4 pairs) per iteration
+ const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+ const __m128i vxa0 = _mm_cvtepu8_epi16(va0);  // zero-extend 8 uint8 A values to 16-bit lanes (SSE4.1)
+ a0 += 8;
+ const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+ const __m128i vxa1 = _mm_cvtepu8_epi16(va1);
+ a1 += 8;
+ const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+ const __m128i vxa2 = _mm_cvtepu8_epi16(va2);
+ a2 += 8;
+ const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+ const __m128i vxa3 = _mm_cvtepu8_epi16(va3);
+ a3 += 8;
+
+ const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);  // 8 packed weights: one K-pair for each of the 4 output columns
+ const __m128i vxb0 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb0), vb_zero_point);  // widen to int16 and subtract the kernel zero point
+
+ vacc0x0123 = _mm_add_epi32(vacc0x0123,  // madd multiplies int16 pairs and sums adjacent products into int32: one K-pair contribution per column
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+ vacc1x0123 = _mm_add_epi32(vacc1x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+ vacc2x0123 = _mm_add_epi32(vacc2x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+ vacc3x0123 = _mm_add_epi32(vacc3x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
+ const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point);
+
+ vacc0x0123 = _mm_add_epi32(vacc0x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+ vacc1x0123 = _mm_add_epi32(vacc1x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+ vacc2x0123 = _mm_add_epi32(vacc2x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+ vacc3x0123 = _mm_add_epi32(vacc3x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
+ const __m128i vxb2 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb2), vb_zero_point);
+
+ vacc0x0123 = _mm_add_epi32(vacc0x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ vacc1x0123 = _mm_add_epi32(vacc1x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ vacc2x0123 = _mm_add_epi32(vacc2x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ vacc3x0123 = _mm_add_epi32(vacc3x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
+ const __m128i vxb3 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb3), vb_zero_point);
+
+ vacc0x0123 = _mm_add_epi32(vacc0x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+ vacc1x0123 = _mm_add_epi32(vacc1x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+ vacc2x0123 = _mm_add_epi32(vacc2x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+ vacc3x0123 = _mm_add_epi32(vacc3x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+ w = (const void*) ((const uint8_t*) w + 32);  // 4 groups of 8 weight bytes consumed this iteration
+ k -= 8 * sizeof(uint8_t);
+ }
+ if (k != 0) {  // K remainder: 2, 4, or 6 elements (kc was rounded up to a multiple of 2)
+ const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);  // NOTE(review): loads 8 bytes even when k < 8 — presumably A rows are padded; confirm against the packing code
+ const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
+ a0 = (const uint8_t*) ((uintptr_t) a0 + k);
+ const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+ const __m128i vxa1 = _mm_cvtepu8_epi16(va1);
+ a1 = (const uint8_t*) ((uintptr_t) a1 + k);
+ const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+ const __m128i vxa2 = _mm_cvtepu8_epi16(va2);
+ a2 = (const uint8_t*) ((uintptr_t) a2 + k);
+ const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+ const __m128i vxa3 = _mm_cvtepu8_epi16(va3);
+ a3 = (const uint8_t*) ((uintptr_t) a3 + k);
+
+ const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+ const __m128i vxb0 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb0), vb_zero_point);
+ w = (const void*) ((const uint8_t*) w + 8);
+
+ vacc0x0123 = _mm_add_epi32(vacc0x0123,  // first remaining K-pair
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+ vacc1x0123 = _mm_add_epi32(vacc1x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+ vacc2x0123 = _mm_add_epi32(vacc2x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+ vacc3x0123 = _mm_add_epi32(vacc3x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+ if (k > 2 * sizeof(uint8_t)) {  // second remaining K-pair
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+ const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point);
+ w = (const void*) ((const uint8_t*) w + 8);
+
+ vacc0x0123 = _mm_add_epi32(vacc0x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+ vacc1x0123 = _mm_add_epi32(vacc1x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+ vacc2x0123 = _mm_add_epi32(vacc2x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+ vacc3x0123 = _mm_add_epi32(vacc3x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+ if (k > 4 * sizeof(uint8_t)) {  // third remaining K-pair
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+ const __m128i vxb2 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb2), vb_zero_point);
+ w = (const void*) ((const uint8_t*) w + 8);
+
+ vacc0x0123 = _mm_add_epi32(vacc0x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ vacc1x0123 = _mm_add_epi32(vacc1x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ vacc2x0123 = _mm_add_epi32(vacc2x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ vacc3x0123 = _mm_add_epi32(vacc3x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ }
+ }
+ }
+
+ const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);  // gemmlowp-style requantization: fixed-point multiply, then rounding right shift
+ const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
+
+ const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));  // duplicate odd lanes into even positions so _mm_mul_epi32 (even-lane multiply) can process lanes 1 and 3
+ const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i vacc3x1133 = _mm_shuffle_epi32(vacc3x0123, _MM_SHUFFLE(3, 3, 1, 1));
+
+ const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);  // 64-bit signed products of even lanes (0, 2) plus rounding term from params
+ const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
+ const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
+ const __m128i vprod3x02 = _mm_add_epi64(_mm_mul_epi32(vacc3x0123, vmultiplier), vrounding);
+
+ const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
+ const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
+ const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
+ const __m128i vprod3x13 = _mm_add_epi64(_mm_mul_epi32(vacc3x1133, vmultiplier), vrounding);
+
+ const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);  // even-lane results: bits 31..62 moved into the low 32 bits of each 64-bit slot
+ const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);  // odd-lane results: doubling (<< 1) leaves bits 31..62 in the high 32 bits of each 64-bit slot
+ const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
+ const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
+ const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
+ const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
+ const __m128i vq31prod3x02 = _mm_srli_epi64(vprod3x02, 31);
+ const __m128i vq31prod3x13 = _mm_add_epi64(vprod3x13, vprod3x13);
+
+ const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);  // mask 0xCC picks 16-bit words 2-3 and 6-7 (the odd-lane results) from the second operand, restoring lane order 0,1,2,3
+ const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
+ const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
+ const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
+
+ const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);  // remainder/threshold implement a gemmlowp-style rounding arithmetic right shift
+ const __m128i vrem0x0123 =
+ _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
+ const __m128i vrem1x0123 =
+ _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
+ const __m128i vrem2x0123 =
+ _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
+ const __m128i vrem3x0123 =
+ _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
+
+ const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
+ const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse2.shift);  // _mm_sra_epi32 takes the shift count from the low 64 bits, so a 64-bit load suffices
+ vacc0x0123 =
+ _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));  // shift, then add 1 (subtract all-ones mask) where the remainder rounds up
+ vacc1x0123 =
+ _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
+ vacc2x0123 =
+ _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
+ vacc3x0123 =
+ _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
+
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
+ __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);  // pack 32->16 with signed saturation, then add the output zero point with saturation
+ __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+
+ __m128i vout = _mm_packus_epi16(vacc01x0123, vacc23x0123);  // pack 16->8 with unsigned saturation: rows 0..3 occupy 4-byte groups
+
+ vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min));  // clamp to the operator's output range
+ vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max));
+
+ if (nc >= 4) {  // full 4-column store: one 32-bit lane per row
+ *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+ *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+ *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+ *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
+
+ c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
+ c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
+ c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
+ c3 = (uint8_t*) ((uintptr_t) c3 + cn_stride);
+
+ a0 = (const uint8_t*) ((uintptr_t) a0 - kc);  // rewind A pointers for the next block of output columns
+ a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
+ a2 = (const uint8_t*) ((uintptr_t) a2 - kc);
+ a3 = (const uint8_t*) ((uintptr_t) a3 - kc);
+
+ nc -= 4;
+ } else {  // partial store: write 2 bytes, then 1 byte, per row as needed
+ if (nc & 2) {
+ *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+ c0 += 2;
+ *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+ c1 += 2;
+ *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+ c2 += 2;
+ *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
+ c3 += 2;
+ vout = _mm_srli_epi32(vout, 16);  // move the remaining byte of each row's 32-bit group into the low position
+ }
+ if (nc & 1) {
+ *c0 = (uint8_t) _mm_extract_epi8(vout, 0);
+ *c1 = (uint8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (uint8_t) _mm_extract_epi8(vout, 8);
+ *c3 = (uint8_t) _mm_extract_epi8(vout, 12);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qu8-gemm/4x4c2-minmax-gemmlowp-sse2.c b/src/qu8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
similarity index 83%
copy from src/qu8-gemm/4x4c2-minmax-gemmlowp-sse2.c
copy to src/qu8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
index 98b483f..12b5a9c 100644
--- a/src/qu8-gemm/4x4c2-minmax-gemmlowp-sse2.c
+++ b/src/qu8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -1,20 +1,22 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/MRx4c2-sse.c.in
+// Generator: tools/xngen
//
-// Copyright 2019 Google LLC
+// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
#include <assert.h>
-#include <immintrin.h>
+#include <tmmintrin.h>
#include <xnnpack/gemm.h>
#include <xnnpack/math.h>
-void xnn_qu8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse2(
+
+void xnn_qu8_gemm_minmax_gemmlowp_ukernel_4x4c2__ssse3_ld64(
size_t mr,
size_t nc,
size_t kc,
@@ -24,13 +26,13 @@
uint8_t* restrict c,
size_t cm_stride,
size_t cn_stride,
- const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+ const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
assert(mr != 0);
assert(mr <= 4);
assert(nc != 0);
assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
+ assert(kc % sizeof(uint8_t) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
@@ -52,22 +54,21 @@
}
const uint8_t* a3 = (const uint8_t*) ((uintptr_t) a2 + a_stride);
uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride);
- if (mr != 4) {
+ if XNN_UNPREDICTABLE(mr != 4) {
a3 = a2;
c3 = c2;
}
- const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.kernel_zero_point);
-
do {
__m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
__m128i vacc1x0123 = vacc0x0123;
__m128i vacc2x0123 = vacc0x0123;
__m128i vacc3x0123 = vacc0x0123;
- w = (const void*) ((uintptr_t) w + 16);
+ w = (const void*) ((const int32_t*) w + 4);
- const __m128i vzero = _mm_setzero_si128();
size_t k = kc;
+ const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.kernel_zero_point);
+ const __m128i vzero = _mm_setzero_si128();
while (k >= 8 * sizeof(uint8_t)) {
const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);
@@ -93,8 +94,7 @@
_mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-
- const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -105,8 +105,7 @@
_mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-
- const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -117,10 +116,8 @@
_mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
- const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
const __m128i vxb3 = _mm_sub_epi16(_mm_unpacklo_epi8(vb3, vzero), vb_zero_point);
- w = (const void*) ((uintptr_t) w + 32);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
@@ -131,25 +128,26 @@
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+ w = (const void*) ((const uint8_t*) w + 32);
k -= 8 * sizeof(uint8_t);
}
if (k != 0) {
const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- a0 = (const uint8_t*) ((uintptr_t) a0 + k);
const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);
+ a0 = (const uint8_t*) ((uintptr_t) a0 + k);
const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- a1 = (const uint8_t*) ((uintptr_t) a1 + k);
const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero);
+ a1 = (const uint8_t*) ((uintptr_t) a1 + k);
const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
- a2 = (const uint8_t*) ((uintptr_t) a2 + k);
const __m128i vxa2 = _mm_unpacklo_epi8(va2, vzero);
+ a2 = (const uint8_t*) ((uintptr_t) a2 + k);
const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
- a3 = (const uint8_t*) ((uintptr_t) a3 + k);
const __m128i vxa3 = _mm_unpacklo_epi8(va3, vzero);
+ a3 = (const uint8_t*) ((uintptr_t) a3 + k);
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);
+ w = (const void*) ((const uint8_t*) w + 8);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
@@ -162,8 +160,8 @@
if (k > 2 * sizeof(uint8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
+ w = (const void*) ((const uint8_t*) w + 8);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
@@ -176,8 +174,8 @@
if (k > 4 * sizeof(uint8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
+ w = (const void*) ((const uint8_t*) w + 8);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
@@ -199,15 +197,15 @@
const __m128i vnmask2x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc2x0123);
const __m128i vnmask3x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc3x0123);
- const __m128i vabsacc0x0123 = _mm_sub_epi32(_mm_xor_si128(vacc0x0123, vnmask0x0123), vnmask0x0123);
- const __m128i vabsacc1x0123 = _mm_sub_epi32(_mm_xor_si128(vacc1x0123, vnmask1x0123), vnmask1x0123);
- const __m128i vabsacc2x0123 = _mm_sub_epi32(_mm_xor_si128(vacc2x0123, vnmask2x0123), vnmask2x0123);
- const __m128i vabsacc3x0123 = _mm_sub_epi32(_mm_xor_si128(vacc3x0123, vnmask3x0123), vnmask3x0123);
+ const __m128i vabsacc0x0123 = _mm_abs_epi32(vacc0x0123);
+ const __m128i vabsacc1x0123 = _mm_abs_epi32(vacc1x0123);
+ const __m128i vabsacc2x0123 = _mm_abs_epi32(vacc2x0123);
+ const __m128i vabsacc3x0123 = _mm_abs_epi32(vacc3x0123);
- const __m128i vabsacc0x1032 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(2, 3, 0, 1));
- const __m128i vabsacc1x1032 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(2, 3, 0, 1));
- const __m128i vabsacc2x1032 = _mm_shuffle_epi32(vabsacc2x0123, _MM_SHUFFLE(2, 3, 0, 1));
- const __m128i vabsacc3x1032 = _mm_shuffle_epi32(vabsacc3x0123, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i vabsacc1x1133 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i vabsacc2x1133 = _mm_shuffle_epi32(vabsacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i vabsacc3x1133 = _mm_shuffle_epi32(vabsacc3x0123, _MM_SHUFFLE(3, 3, 1, 1));
const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
const __m128i vabsprod1x02 = _mm_mul_epu32(vabsacc1x0123, vmultiplier);
@@ -229,10 +227,10 @@
const __m128i vq31prod2x02 = _mm_srli_epi64(_mm_add_epi64(vprod2x02, vrounding), 31);
const __m128i vq31prod3x02 = _mm_srli_epi64(_mm_add_epi64(vprod3x02, vrounding), 31);
- const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1032, vmultiplier);
- const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1032, vmultiplier);
- const __m128i vabsprod2x13 = _mm_mul_epu32(vabsacc2x1032, vmultiplier);
- const __m128i vabsprod3x13 = _mm_mul_epu32(vabsacc3x1032, vmultiplier);
+ const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
+ const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1133, vmultiplier);
+ const __m128i vabsprod2x13 = _mm_mul_epu32(vabsacc2x1133, vmultiplier);
+ const __m128i vabsprod3x13 = _mm_mul_epu32(vabsacc3x1133, vmultiplier);
const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
const __m128i vnmask1x13 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(3, 3, 1, 1));
@@ -264,7 +262,6 @@
const __m128i vq31prod3x0123 = _mm_shuffle_epi32(vq31prod3x0213, _MM_SHUFFLE(3, 1, 2, 0));
const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
-
const __m128i vrem0x0123 =
_mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
const __m128i vrem1x0123 =
@@ -275,8 +272,7 @@
_mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.shift);
-
+ const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse2.shift);
vacc0x0123 =
_mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
vacc1x0123 =
@@ -287,28 +283,33 @@
_mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
- const __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
- const __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+ __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+ __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+
__m128i vout = _mm_packus_epi16(vacc01x0123, vacc23x0123);
- vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max));
+
vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min));
+ vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
- *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_srli_epi64(vout, 32));
- *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(_mm_unpackhi_epi32(vout, vout));
- *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(_mm_srli_si128(vout, 12));
-
- a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
- a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
- a2 = (const uint8_t*) ((uintptr_t) a2 - kc);
- a3 = (const uint8_t*) ((uintptr_t) a3 - kc);
+ vout = _mm_srli_si128(vout, 4);
+ *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+ vout = _mm_srli_si128(vout, 4);
+ *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
+ vout = _mm_srli_si128(vout, 4);
+ *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(vout);
c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
c3 = (uint8_t*) ((uintptr_t) c3 + cn_stride);
+ a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
+ a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
+ a2 = (const uint8_t*) ((uintptr_t) a2 - kc);
+ a3 = (const uint8_t*) ((uintptr_t) a3 - kc);
+
nc -= 4;
} else {
if (nc & 2) {
@@ -323,10 +324,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *((uint8_t*) c0) = (uint8_t) _mm_cvtsi128_si32(vout);
- *((uint8_t*) c1) = (uint8_t) _mm_extract_epi16(vout, 2);
- *((uint8_t*) c2) = (uint8_t) _mm_extract_epi16(vout, 4);
- *((uint8_t*) c3) = (uint8_t) _mm_extract_epi16(vout, 6);
+ *c0 = (uint8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (uint8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (uint8_t) _mm_extract_epi16(vout, 4);
+ *c3 = (uint8_t) _mm_extract_epi16(vout, 6);
}
nc = 0;
diff --git a/src/qu8-igemm/4x4c2-minmax-gemmlowp-sse2.c b/src/qu8-igemm/4x4c2-minmax-gemmlowp-sse2.c
deleted file mode 100644
index ec29e4b..0000000
--- a/src/qu8-igemm/4x4c2-minmax-gemmlowp-sse2.c
+++ /dev/null
@@ -1,307 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <immintrin.h>
-
-#include <xnnpack/igemm.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qu8_igemm_minmax_gemmlowp_ukernel_4x4c2__sse2(
- size_t mr,
- size_t nc,
- size_t kc,
- size_t ks,
- const uint8_t** restrict a,
- const void* restrict w,
- uint8_t* restrict c,
- size_t cm_stride,
- size_t cn_stride,
- size_t a_offset,
- const uint8_t* zero,
- const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
- assert(mr != 0);
- assert(mr <= 4);
- assert(nc != 0);
- assert(kc != 0);
- assert(ks != 0);
- assert(ks % (4 * sizeof(void*)) == 0);
- assert(a_offset % sizeof(int8_t) == 0);
- assert(a != NULL);
- assert(w != NULL);
- assert(c != NULL);
-
- kc = round_up_po2(kc, 2);
- uint8_t* c0 = c;
- uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
- if XNN_UNPREDICTABLE(mr < 2) {
- c1 = c0;
- }
- uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
- if XNN_UNPREDICTABLE(mr <= 2) {
- c2 = c1;
- }
- uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride);
- if (mr != 4) {
- c3 = c2;
- }
-
- const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.kernel_zero_point);
-
- do {
- __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
- __m128i vacc1x0123 = vacc0x0123;
- __m128i vacc2x0123 = vacc0x0123;
- __m128i vacc3x0123 = vacc0x0123;
- w = (const void*) ((uintptr_t) w + 16);
-
- const __m128i vzero = _mm_setzero_si128();
- size_t p = ks;
- do {
- const uint8_t* restrict a0 = a[0];
- if XNN_UNPREDICTABLE(a0 != zero) {
- a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
- }
- const uint8_t* restrict a1 = a[1];
- if XNN_UNPREDICTABLE(a1 != zero) {
- a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
- }
- const uint8_t* restrict a2 = a[2];
- if XNN_UNPREDICTABLE(a2 != zero) {
- a2 = (const uint8_t*) ((uintptr_t) a2 + a_offset);
- }
- const uint8_t* restrict a3 = a[3];
- if XNN_UNPREDICTABLE(a3 != zero) {
- a3 = (const uint8_t*) ((uintptr_t) a3 + a_offset);
- }
- a += 4;
-
- size_t k = kc;
- while (k >= 8 * sizeof(uint8_t)) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);
- a0 += 8;
- const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero);
- a1 += 8;
- const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
- const __m128i vxa2 = _mm_unpacklo_epi8(va2, vzero);
- a2 += 8;
- const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
- const __m128i vxa3 = _mm_unpacklo_epi8(va3, vzero);
- a3 += 8;
-
- const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
- const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);
- vacc0x0123 = _mm_add_epi32(vacc0x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc2x0123 = _mm_add_epi32(vacc2x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc3x0123 = _mm_add_epi32(vacc3x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-
- const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
- const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
- vacc0x0123 = _mm_add_epi32(vacc0x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc2x0123 = _mm_add_epi32(vacc2x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc3x0123 = _mm_add_epi32(vacc3x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-
- const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
- const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
- vacc0x0123 = _mm_add_epi32(vacc0x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc2x0123 = _mm_add_epi32(vacc2x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc3x0123 = _mm_add_epi32(vacc3x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-
- const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
- const __m128i vxb3 = _mm_sub_epi16(_mm_unpacklo_epi8(vb3, vzero), vb_zero_point);
- vacc0x0123 = _mm_add_epi32(vacc0x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- vacc2x0123 = _mm_add_epi32(vacc2x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- vacc3x0123 = _mm_add_epi32(vacc3x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-
- w = (void*) ((uintptr_t) w + 32);
-
- k -= 8 * sizeof(uint8_t);
- }
- if (k != 0) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);
- const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero);
- const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
- const __m128i vxa2 = _mm_unpacklo_epi8(va2, vzero);
- const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
- const __m128i vxa3 = _mm_unpacklo_epi8(va3, vzero);
-
- const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
- const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);
- w = (void*) ((uintptr_t) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc2x0123 = _mm_add_epi32(vacc2x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc3x0123 = _mm_add_epi32(vacc3x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-
- if (k > 2 * sizeof(uint8_t)) {
- const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
- const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
- w = (void*) ((uintptr_t) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc2x0123 = _mm_add_epi32(vacc2x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc3x0123 = _mm_add_epi32(vacc3x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-
- if (k > 4 * sizeof(uint8_t)) {
- const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
- const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
- w = (void*) ((uintptr_t) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc1x0123 = _mm_add_epi32(vacc1x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc2x0123 = _mm_add_epi32(vacc2x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc3x0123 = _mm_add_epi32(vacc3x0123, _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- }
- }
- }
- p -= 4 * sizeof(void*);
- } while (p != 0);
-
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
-
- const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);
- const __m128i vnmask1x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc1x0123);
- const __m128i vnmask2x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc2x0123);
- const __m128i vnmask3x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc3x0123);
-
- const __m128i vabsacc0x0123 = _mm_sub_epi32(_mm_xor_si128(vacc0x0123, vnmask0x0123), vnmask0x0123);
- const __m128i vabsacc1x0123 = _mm_sub_epi32(_mm_xor_si128(vacc1x0123, vnmask1x0123), vnmask1x0123);
- const __m128i vabsacc2x0123 = _mm_sub_epi32(_mm_xor_si128(vacc2x0123, vnmask2x0123), vnmask2x0123);
- const __m128i vabsacc3x0123 = _mm_sub_epi32(_mm_xor_si128(vacc3x0123, vnmask3x0123), vnmask3x0123);
-
- const __m128i vabsacc0x1032 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(2, 3, 0, 1));
- const __m128i vabsacc1x1032 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(2, 3, 0, 1));
- const __m128i vabsacc2x1032 = _mm_shuffle_epi32(vabsacc2x0123, _MM_SHUFFLE(2, 3, 0, 1));
- const __m128i vabsacc3x1032 = _mm_shuffle_epi32(vabsacc3x0123, _MM_SHUFFLE(2, 3, 0, 1));
-
- const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
- const __m128i vabsprod1x02 = _mm_mul_epu32(vabsacc1x0123, vmultiplier);
- const __m128i vabsprod2x02 = _mm_mul_epu32(vabsacc2x0123, vmultiplier);
- const __m128i vabsprod3x02 = _mm_mul_epu32(vabsacc3x0123, vmultiplier);
-
- const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
- const __m128i vnmask1x02 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(2, 2, 0, 0));
- const __m128i vnmask2x02 = _mm_shuffle_epi32(vnmask2x0123, _MM_SHUFFLE(2, 2, 0, 0));
- const __m128i vnmask3x02 = _mm_shuffle_epi32(vnmask3x0123, _MM_SHUFFLE(2, 2, 0, 0));
-
- const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);
- const __m128i vprod1x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x02, vnmask1x02), vnmask1x02);
- const __m128i vprod2x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod2x02, vnmask2x02), vnmask2x02);
- const __m128i vprod3x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod3x02, vnmask3x02), vnmask3x02);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
- const __m128i vq31prod1x02 = _mm_srli_epi64(_mm_add_epi64(vprod1x02, vrounding), 31);
- const __m128i vq31prod2x02 = _mm_srli_epi64(_mm_add_epi64(vprod2x02, vrounding), 31);
- const __m128i vq31prod3x02 = _mm_srli_epi64(_mm_add_epi64(vprod3x02, vrounding), 31);
-
- const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1032, vmultiplier);
- const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1032, vmultiplier);
- const __m128i vabsprod2x13 = _mm_mul_epu32(vabsacc2x1032, vmultiplier);
- const __m128i vabsprod3x13 = _mm_mul_epu32(vabsacc3x1032, vmultiplier);
-
- const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vnmask1x13 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vnmask2x13 = _mm_shuffle_epi32(vnmask2x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vnmask3x13 = _mm_shuffle_epi32(vnmask3x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
- const __m128i vprod1x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x13, vnmask1x13), vnmask1x13);
- const __m128i vprod2x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod2x13, vnmask2x13), vnmask2x13);
- const __m128i vprod3x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod3x13, vnmask3x13), vnmask3x13);
-
- const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
- const __m128i vq31prod1x13 = _mm_srli_epi64(_mm_add_epi64(vprod1x13, vrounding), 31);
- const __m128i vq31prod2x13 = _mm_srli_epi64(_mm_add_epi64(vprod2x13, vrounding), 31);
- const __m128i vq31prod3x13 = _mm_srli_epi64(_mm_add_epi64(vprod3x13, vrounding), 31);
-
- const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
- const __m128i vq31prod1x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod1x02), _mm_castsi128_ps(vq31prod1x13), _MM_SHUFFLE(2, 0, 2, 0)));
- const __m128i vq31prod2x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod2x02), _mm_castsi128_ps(vq31prod2x13), _MM_SHUFFLE(2, 0, 2, 0)));
- const __m128i vq31prod3x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod3x02), _mm_castsi128_ps(vq31prod3x13), _MM_SHUFFLE(2, 0, 2, 0)));
-
- const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));
- const __m128i vq31prod1x0123 = _mm_shuffle_epi32(vq31prod1x0213, _MM_SHUFFLE(3, 1, 2, 0));
- const __m128i vq31prod2x0123 = _mm_shuffle_epi32(vq31prod2x0213, _MM_SHUFFLE(3, 1, 2, 0));
- const __m128i vq31prod3x0123 = _mm_shuffle_epi32(vq31prod3x0213, _MM_SHUFFLE(3, 1, 2, 0));
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
-
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
- const __m128i vrem2x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
- const __m128i vrem3x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.shift);
-
- vacc0x0123 = _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 = _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
- vacc2x0123 = _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
- vacc3x0123 = _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
- const __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
- const __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
- __m128i vout = _mm_packus_epi16(vacc01x0123, vacc23x0123);
- vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max));
- vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min));
-
- if XNN_LIKELY(nc >= 4) {
- *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(_mm_srli_si128(vout, 12));
- c3 += cn_stride;
- *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(_mm_unpackhi_epi32(vout, vout));
- c2 += cn_stride;
- *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_srli_epi64(vout, 32));
- c1 += cn_stride;
- *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
- c0 += cn_stride;
-
- a = (const uint8_t**restrict) ((uintptr_t) a - ks);
-
- nc -= 4;
- } else {
- if (nc & 2) {
- *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6); c3 += 2;
- *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4); c2 += 2;
- *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2); c1 += 2;
- *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0); c0 += 2;
- vout = _mm_srli_epi32(vout, 16);
- }
- if (nc & 1) {
- *((uint8_t*) c3) = (uint8_t) _mm_extract_epi16(vout, 6);
- *((uint8_t*) c2) = (uint8_t) _mm_extract_epi16(vout, 4);
- *((uint8_t*) c1) = (uint8_t) _mm_extract_epi16(vout, 2);
- *((uint8_t*) c0) = (uint8_t) _mm_cvtsi128_si32(vout);
- }
-
- nc = 0;
- }
- } while (nc != 0);
-}
diff --git a/src/qu8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c b/src/qu8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
new file mode 100644
index 0000000..bf90382
--- /dev/null
+++ b/src/qu8-igemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
@@ -0,0 +1,209 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/MRx4c8-sse.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_igemm_minmax_gemmlowp_ukernel_2x4c8__sse2_ld64(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const uint8_t** restrict a,
+ const void* restrict w,
+ uint8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const uint8_t* zero,
+ const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 2);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (2 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(uint8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 8);
+ uint8_t* c0 = c;
+ uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 2) {
+ c1 = c0;
+ }
+
+ do {
+ __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+ __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+ __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+ __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+ __m128i vacc1x0 = vacc0x0;
+ __m128i vacc1x1 = vacc0x1;
+ __m128i vacc1x2 = vacc0x2;
+ __m128i vacc1x3 = vacc0x3;
+ w = (const void*) ((const int32_t*) w + 4);
+
+ size_t p = ks;
+ do {
+ const uint8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ const uint8_t* restrict a1 = a[1];
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
+ }
+ a += 2;
+
+ size_t k = 0;
+ const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.kernel_zero_point);
+ const __m128i vzero = _mm_setzero_si128();
+ while (k < kc) {
+ const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+ const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);
+ a0 += 8;
+ const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+ const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero);
+ a1 += 8;
+
+ const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+ const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);
+
+ vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+ vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
+ const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
+
+ vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+ vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
+ const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
+
+ vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+ vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
+ const __m128i vxb3 = _mm_sub_epi16(_mm_unpacklo_epi8(vb3, vzero), vb_zero_point);
+
+ vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+ vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+
+ w = (const void*) ((const uint8_t*) w + 32);
+ k += 8 * sizeof(uint8_t);
+ }
+ p -= 2 * sizeof(void*);
+ } while (p != 0);
+
+ const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
+ const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
+ const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
+ const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
+
+ __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
+ __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
+
+ const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
+ const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
+
+ const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);
+ const __m128i vnmask1x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc1x0123);
+
+ const __m128i vabsacc0x0123 = _mm_sub_epi32(_mm_xor_si128(vacc0x0123, vnmask0x0123), vnmask0x0123);
+ const __m128i vabsacc1x0123 = _mm_sub_epi32(_mm_xor_si128(vacc1x0123, vnmask1x0123), vnmask1x0123);
+
+ const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i vabsacc1x1133 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+
+ const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
+ const __m128i vabsprod1x02 = _mm_mul_epu32(vabsacc1x0123, vmultiplier);
+
+ const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
+ const __m128i vnmask1x02 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(2, 2, 0, 0));
+
+ const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);
+ const __m128i vprod1x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x02, vnmask1x02), vnmask1x02);
+
+ const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
+ const __m128i vq31prod1x02 = _mm_srli_epi64(_mm_add_epi64(vprod1x02, vrounding), 31);
+
+ const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
+ const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1133, vmultiplier);
+
+ const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i vnmask1x13 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+
+ const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
+ const __m128i vprod1x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x13, vnmask1x13), vnmask1x13);
+
+ const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
+ const __m128i vq31prod1x13 = _mm_srli_epi64(_mm_add_epi64(vprod1x13, vrounding), 31);
+
+ const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(
+ _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
+ const __m128i vq31prod1x0213 = _mm_castps_si128(_mm_shuffle_ps(
+ _mm_castsi128_ps(vq31prod1x02), _mm_castsi128_ps(vq31prod1x13), _MM_SHUFFLE(2, 0, 2, 0)));
+
+ const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));
+ const __m128i vq31prod1x0123 = _mm_shuffle_epi32(vq31prod1x0213, _MM_SHUFFLE(3, 1, 2, 0));
+
+ const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
+ const __m128i vrem0x0123 =
+ _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
+ const __m128i vrem1x0123 =
+ _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
+
+ const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
+ const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.shift);
+ vacc0x0123 =
+ _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
+ vacc1x0123 =
+ _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
+
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
+ __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+ __m128i vout = _mm_packus_epi16(vacc01x0123, vacc01x0123);
+
+ vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min));
+ vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max));
+
+ if (nc >= 4) {
+ *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1)));
+ c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
+ *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+ c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const uint8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 4;
+ } else {
+ if (nc & 2) {
+ *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+ c1 += 2;
+ *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+ c0 += 2;
+ vout = _mm_srli_epi32(vout, 16);
+ }
+ if (nc & 1) {
+ *c1 = (uint8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (uint8_t) _mm_cvtsi128_si32(vout);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qu8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c b/src/qu8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
new file mode 100644
index 0000000..2973cc8
--- /dev/null
+++ b/src/qu8-igemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
@@ -0,0 +1,184 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/MRx4c8-sse.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_igemm_minmax_gemmlowp_ukernel_2x4c8__sse41_ld64(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const uint8_t** restrict a,
+ const void* restrict w,
+ uint8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const uint8_t* zero,
+ const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 2);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (2 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(uint8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 8);
+ uint8_t* c0 = c;
+ uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 2) {
+ c1 = c0;
+ }
+
+ do {
+ __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+ __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+ __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+ __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+ __m128i vacc1x0 = vacc0x0;
+ __m128i vacc1x1 = vacc0x1;
+ __m128i vacc1x2 = vacc0x2;
+ __m128i vacc1x3 = vacc0x3;
+ w = (const void*) ((const int32_t*) w + 4);
+
+ size_t p = ks;
+ do {
+ const uint8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ const uint8_t* restrict a1 = a[1];
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
+ }
+ a += 2;
+
+ size_t k = 0;
+ const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.kernel_zero_point);
+ while (k < kc) {
+ const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+ const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
+ a0 += 8;
+ const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+ const __m128i vxa1 = _mm_cvtepu8_epi16(va1);
+ a1 += 8;
+
+ const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+ const __m128i vxb0 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb0), vb_zero_point);
+
+ vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+ vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
+ const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point);
+
+ vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+ vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
+ const __m128i vxb2 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb2), vb_zero_point);
+
+ vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+ vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
+ const __m128i vxb3 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb3), vb_zero_point);
+
+ vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+ vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+
+ w = (const void*) ((const uint8_t*) w + 32);
+ k += 8 * sizeof(uint8_t);
+ }
+ p -= 2 * sizeof(void*);
+ } while (p != 0);
+
+ const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+ const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+ const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+ const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+
+ __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+ __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+
+ const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
+ const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
+
+ const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+
+ const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
+ const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
+
+ const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
+ const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
+
+ const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
+ const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
+ const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
+ const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
+
+ const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
+ const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
+
+ const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
+ const __m128i vrem0x0123 =
+ _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
+ const __m128i vrem1x0123 =
+ _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
+
+ const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
+ const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.shift);
+ vacc0x0123 =
+ _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
+ vacc1x0123 =
+ _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
+
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
+ __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+ __m128i vout = _mm_packus_epi16(vacc01x0123, vacc01x0123);
+
+ vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min));
+ vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max));
+
+ if (nc >= 4) {
+ *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+ c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
+ *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+ c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const uint8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 4;
+ } else {
+ if (nc & 2) {
+ *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+ c1 += 2;
+ *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+ c0 += 2;
+ vout = _mm_srli_epi32(vout, 16);
+ }
+ if (nc & 1) {
+ *c1 = (uint8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (uint8_t) _mm_extract_epi8(vout, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qu8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c b/src/qu8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
new file mode 100644
index 0000000..041ce3e
--- /dev/null
+++ b/src/qu8-igemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
@@ -0,0 +1,209 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/MRx4c8-sse.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <tmmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_igemm_minmax_gemmlowp_ukernel_2x4c8__ssse3_ld64(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const uint8_t** restrict a,
+ const void* restrict w,
+ uint8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const uint8_t* zero,
+ const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 2);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (2 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(uint8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 8);  // c8 layout: weights are packed 8-deep per output channel, so the K loop runs in whole groups of 8
+ uint8_t* c0 = c;
+ uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 2) {
+ c1 = c0;  // single-row case: alias c1 to c0 so row-1 stores become harmless duplicates
+ }
+
+ do {
+ __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);  // per-output-channel initializers (bias) are the first 4 int32s of the packed weights
+ __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+ __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+ __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+ __m128i vacc1x0 = vacc0x0;  // row 1 starts from the same initializers
+ __m128i vacc1x1 = vacc0x1;
+ __m128i vacc1x2 = vacc0x2;
+ __m128i vacc1x3 = vacc0x3;
+ w = (const void*) ((const int32_t*) w + 4);
+
+ size_t p = ks;
+ do {
+ const uint8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);  // the shared `zero` buffer is used as-is; real rows are rebased by a_offset
+ }
+ const uint8_t* restrict a1 = a[1];
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
+ }
+ a += 2;
+
+ size_t k = 0;
+ const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.kernel_zero_point);
+ const __m128i vzero = _mm_setzero_si128();
+ while (k < kc) {  // kc is a multiple of 8, so no remainder handling is needed
+ const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+ const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);  // zero-extend u8 -> u16
+ a0 += 8;
+ const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+ const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero);
+ a1 += 8;
+
+ const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+ const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);  // widen b to s16 and subtract the kernel zero point
+
+ vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+ vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
+ const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
+
+ vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+ vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
+ const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
+
+ vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+ vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
+ const __m128i vxb3 = _mm_sub_epi16(_mm_unpacklo_epi8(vb3, vzero), vb_zero_point);
+
+ vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+ vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+
+ w = (const void*) ((const uint8_t*) w + 32);
+ k += 8 * sizeof(uint8_t);
+ }
+ p -= 2 * sizeof(void*);  // two indirection pointers consumed per iteration
+ } while (p != 0);
+
+ const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);  // SSSE3 hadd: reduce the four per-channel accumulators of each row into one vector
+ const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+ const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+ const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+
+ __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+ __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+
+ const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
+ const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
+
+ const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);  // sign masks, used to restore the product sign below
+ const __m128i vnmask1x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc1x0123);
+
+ const __m128i vabsacc0x0123 = _mm_abs_epi32(vacc0x0123);  // gemmlowp requantization works on magnitudes; SSSE3 provides abs directly
+ const __m128i vabsacc1x0123 = _mm_abs_epi32(vacc1x0123);
+
+ const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));  // odd lanes moved into even slots for the 64-bit multiply
+ const __m128i vabsacc1x1133 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+
+ const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);  // 32x32->64 products of lanes 0 and 2
+ const __m128i vabsprod1x02 = _mm_mul_epu32(vabsacc1x0123, vmultiplier);
+
+ const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
+ const __m128i vnmask1x02 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(2, 2, 0, 0));
+
+ const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);  // re-apply the sign (two's-complement negate via xor/sub)
+ const __m128i vprod1x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x02, vnmask1x02), vnmask1x02);
+
+ const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);  // rounding Q31 scaling: add rounding term, shift right by 31
+ const __m128i vq31prod1x02 = _mm_srli_epi64(_mm_add_epi64(vprod1x02, vrounding), 31);
+
+ const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);  // same computation for lanes 1 and 3
+ const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1133, vmultiplier);
+
+ const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i vnmask1x13 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+
+ const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
+ const __m128i vprod1x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x13, vnmask1x13), vnmask1x13);
+
+ const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
+ const __m128i vq31prod1x13 = _mm_srli_epi64(_mm_add_epi64(vprod1x13, vrounding), 31);
+
+ const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(  // gather the four 32-bit results (order 0,2,1,3)...
+ _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
+ const __m128i vq31prod1x0213 = _mm_castps_si128(_mm_shuffle_ps(
+ _mm_castsi128_ps(vq31prod1x02), _mm_castsi128_ps(vq31prod1x13), _MM_SHUFFLE(2, 0, 2, 0)));
+
+ const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));  // ...then restore lane order 0,1,2,3
+ const __m128i vq31prod1x0123 = _mm_shuffle_epi32(vq31prod1x0213, _MM_SHUFFLE(3, 1, 2, 0));
+
+ const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
+ const __m128i vrem0x0123 =
+ _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
+ const __m128i vrem1x0123 =
+ _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
+
+ const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
+ const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.shift);
+ vacc0x0123 =
+ _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));  // arithmetic shift with round-to-nearest correction from the remainder
+ vacc1x0123 =
+ _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
+
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
+ __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);  // pack both rows to s16 and add the output zero point with saturation
+
+ __m128i vout = _mm_packus_epi16(vacc01x0123, vacc01x0123);  // saturate to u8: row 0 in bytes 0-3, row 1 in bytes 4-7
+
+ vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min));  // clamp to the requested output range
+ vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max));
+
+ if (nc >= 4) {
+ *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1)));  // row 1 = 32-bit lane 1
+ c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
+ *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);  // row 0 = 32-bit lane 0
+ c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const uint8_t**restrict) ((uintptr_t) a - ks);  // rewind the indirection buffer for the next column tile
+
+ nc -= 4;
+ } else {
+ if (nc & 2) {
+ *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+ c1 += 2;
+ *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+ c0 += 2;
+ vout = _mm_srli_epi32(vout, 16);  // shift the remaining byte of each row into position 0 of its lane
+ }
+ if (nc & 1) {
+ *c1 = (uint8_t) _mm_extract_epi16(vout, 2);  // byte extraction via 16-bit extract: _mm_extract_epi8 requires SSE4.1
+ *c0 = (uint8_t) _mm_cvtsi128_si32(vout);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qu8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c b/src/qu8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
new file mode 100644
index 0000000..64e9094
--- /dev/null
+++ b/src/qu8-igemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
@@ -0,0 +1,347 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/MRx4c2-sse.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_igemm_minmax_gemmlowp_ukernel_4x4c2__sse2_ld64(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const uint8_t** restrict a,
+ const void* restrict w,
+ uint8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const uint8_t* zero,
+ const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (4 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(uint8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 2);  // c2 layout: weights are packed in pairs along K
+ uint8_t* c0 = c;
+ uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;  // rows beyond mr alias the previous row, so their stores are harmless duplicates
+ }
+ uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ c3 = c2;
+ }
+
+ do {
+ __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);  // all four rows start from the same per-channel initializers (bias) at the head of w
+ __m128i vacc1x0123 = vacc0x0123;
+ __m128i vacc2x0123 = vacc0x0123;
+ __m128i vacc3x0123 = vacc0x0123;
+ w = (const void*) ((const int32_t*) w + 4);
+
+ size_t p = ks;
+ do {
+ const uint8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);  // the shared `zero` buffer is used as-is; real rows are rebased by a_offset
+ }
+ const uint8_t* restrict a1 = a[1];
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
+ }
+ const uint8_t* restrict a2 = a[2];
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const uint8_t*) ((uintptr_t) a2 + a_offset);
+ }
+ const uint8_t* restrict a3 = a[3];
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const uint8_t*) ((uintptr_t) a3 + a_offset);
+ }
+ a += 4;
+
+ size_t k = kc;
+ const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.kernel_zero_point);
+ const __m128i vzero = _mm_setzero_si128();
+ while (k >= 8 * sizeof(uint8_t)) {  // main loop: 8 K elements (four k-pairs) per iteration
+ const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+ const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);  // zero-extend u8 -> u16
+ a0 += 8;
+ const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+ const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero);
+ a1 += 8;
+ const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+ const __m128i vxa2 = _mm_unpacklo_epi8(va2, vzero);
+ a2 += 8;
+ const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+ const __m128i vxa3 = _mm_unpacklo_epi8(va3, vzero);
+ a3 += 8;
+
+ const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+ const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);  // widen b to s16 and subtract the kernel zero point
+
+ vacc0x0123 = _mm_add_epi32(vacc0x0123,  // broadcast k-pair 0 of each row and multiply-accumulate against the 4x2 weight block
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+ vacc1x0123 = _mm_add_epi32(vacc1x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+ vacc2x0123 = _mm_add_epi32(vacc2x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+ vacc3x0123 = _mm_add_epi32(vacc3x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
+ const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
+
+ vacc0x0123 = _mm_add_epi32(vacc0x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+ vacc1x0123 = _mm_add_epi32(vacc1x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+ vacc2x0123 = _mm_add_epi32(vacc2x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+ vacc3x0123 = _mm_add_epi32(vacc3x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
+ const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
+
+ vacc0x0123 = _mm_add_epi32(vacc0x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ vacc1x0123 = _mm_add_epi32(vacc1x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ vacc2x0123 = _mm_add_epi32(vacc2x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ vacc3x0123 = _mm_add_epi32(vacc3x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
+ const __m128i vxb3 = _mm_sub_epi16(_mm_unpacklo_epi8(vb3, vzero), vb_zero_point);
+
+ vacc0x0123 = _mm_add_epi32(vacc0x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+ vacc1x0123 = _mm_add_epi32(vacc1x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+ vacc2x0123 = _mm_add_epi32(vacc2x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+ vacc3x0123 = _mm_add_epi32(vacc3x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+ w = (const void*) ((const uint8_t*) w + 32);
+ k -= 8 * sizeof(uint8_t);
+ }
+ if (k != 0) {  // remainder of 2, 4, or 6 K elements (kc is rounded to a multiple of 2)
+ const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+ const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);
+ a0 = (const uint8_t*) ((uintptr_t) a0 + k);  // advance by the exact remainder
+ const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+ const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero);
+ a1 = (const uint8_t*) ((uintptr_t) a1 + k);
+ const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+ const __m128i vxa2 = _mm_unpacklo_epi8(va2, vzero);
+ a2 = (const uint8_t*) ((uintptr_t) a2 + k);
+ const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+ const __m128i vxa3 = _mm_unpacklo_epi8(va3, vzero);
+ a3 = (const uint8_t*) ((uintptr_t) a3 + k);
+
+ const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+ w = (const void*) ((const uint8_t*) w + 8);
+ const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);
+
+ vacc0x0123 = _mm_add_epi32(vacc0x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+ vacc1x0123 = _mm_add_epi32(vacc1x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+ vacc2x0123 = _mm_add_epi32(vacc2x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+ vacc3x0123 = _mm_add_epi32(vacc3x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+ if (k > 2 * sizeof(uint8_t)) {  // at least 4 elements remained
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+ w = (const void*) ((const uint8_t*) w + 8);
+ const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
+
+ vacc0x0123 = _mm_add_epi32(vacc0x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+ vacc1x0123 = _mm_add_epi32(vacc1x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+ vacc2x0123 = _mm_add_epi32(vacc2x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+ vacc3x0123 = _mm_add_epi32(vacc3x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+ if (k > 4 * sizeof(uint8_t)) {  // exactly 6 elements remained
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+ w = (const void*) ((const uint8_t*) w + 8);
+ const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
+
+ vacc0x0123 = _mm_add_epi32(vacc0x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ vacc1x0123 = _mm_add_epi32(vacc1x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ vacc2x0123 = _mm_add_epi32(vacc2x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ vacc3x0123 = _mm_add_epi32(vacc3x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ }
+ }
+ }
+ p -= 4 * sizeof(void*);  // four indirection pointers consumed per iteration
+ } while (p != 0);
+
+ const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
+ const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
+
+ const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);  // sign masks, used to restore the product sign below
+ const __m128i vnmask1x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc1x0123);
+ const __m128i vnmask2x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc2x0123);
+ const __m128i vnmask3x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc3x0123);
+
+ const __m128i vabsacc0x0123 = _mm_sub_epi32(_mm_xor_si128(vacc0x0123, vnmask0x0123), vnmask0x0123);  // abs emulated via xor/sub: SSE2 has no _mm_abs_epi32
+ const __m128i vabsacc1x0123 = _mm_sub_epi32(_mm_xor_si128(vacc1x0123, vnmask1x0123), vnmask1x0123);
+ const __m128i vabsacc2x0123 = _mm_sub_epi32(_mm_xor_si128(vacc2x0123, vnmask2x0123), vnmask2x0123);
+ const __m128i vabsacc3x0123 = _mm_sub_epi32(_mm_xor_si128(vacc3x0123, vnmask3x0123), vnmask3x0123);
+
+ const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));  // odd lanes moved into even slots for the 64-bit multiply
+ const __m128i vabsacc1x1133 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i vabsacc2x1133 = _mm_shuffle_epi32(vabsacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i vabsacc3x1133 = _mm_shuffle_epi32(vabsacc3x0123, _MM_SHUFFLE(3, 3, 1, 1));
+
+ const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);  // 32x32->64 products of lanes 0 and 2
+ const __m128i vabsprod1x02 = _mm_mul_epu32(vabsacc1x0123, vmultiplier);
+ const __m128i vabsprod2x02 = _mm_mul_epu32(vabsacc2x0123, vmultiplier);
+ const __m128i vabsprod3x02 = _mm_mul_epu32(vabsacc3x0123, vmultiplier);
+
+ const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
+ const __m128i vnmask1x02 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(2, 2, 0, 0));
+ const __m128i vnmask2x02 = _mm_shuffle_epi32(vnmask2x0123, _MM_SHUFFLE(2, 2, 0, 0));
+ const __m128i vnmask3x02 = _mm_shuffle_epi32(vnmask3x0123, _MM_SHUFFLE(2, 2, 0, 0));
+
+ const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);  // re-apply the sign (two's-complement negate via xor/sub)
+ const __m128i vprod1x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x02, vnmask1x02), vnmask1x02);
+ const __m128i vprod2x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod2x02, vnmask2x02), vnmask2x02);
+ const __m128i vprod3x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod3x02, vnmask3x02), vnmask3x02);
+
+ const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);  // rounding Q31 scaling: add rounding term, shift right by 31
+ const __m128i vq31prod1x02 = _mm_srli_epi64(_mm_add_epi64(vprod1x02, vrounding), 31);
+ const __m128i vq31prod2x02 = _mm_srli_epi64(_mm_add_epi64(vprod2x02, vrounding), 31);
+ const __m128i vq31prod3x02 = _mm_srli_epi64(_mm_add_epi64(vprod3x02, vrounding), 31);
+
+ const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);  // same computation for lanes 1 and 3
+ const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1133, vmultiplier);
+ const __m128i vabsprod2x13 = _mm_mul_epu32(vabsacc2x1133, vmultiplier);
+ const __m128i vabsprod3x13 = _mm_mul_epu32(vabsacc3x1133, vmultiplier);
+
+ const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i vnmask1x13 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i vnmask2x13 = _mm_shuffle_epi32(vnmask2x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i vnmask3x13 = _mm_shuffle_epi32(vnmask3x0123, _MM_SHUFFLE(3, 3, 1, 1));
+
+ const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
+ const __m128i vprod1x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x13, vnmask1x13), vnmask1x13);
+ const __m128i vprod2x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod2x13, vnmask2x13), vnmask2x13);
+ const __m128i vprod3x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod3x13, vnmask3x13), vnmask3x13);
+
+ const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
+ const __m128i vq31prod1x13 = _mm_srli_epi64(_mm_add_epi64(vprod1x13, vrounding), 31);
+ const __m128i vq31prod2x13 = _mm_srli_epi64(_mm_add_epi64(vprod2x13, vrounding), 31);
+ const __m128i vq31prod3x13 = _mm_srli_epi64(_mm_add_epi64(vprod3x13, vrounding), 31);
+
+ const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(  // gather the four 32-bit results (order 0,2,1,3)...
+ _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
+ const __m128i vq31prod1x0213 = _mm_castps_si128(_mm_shuffle_ps(
+ _mm_castsi128_ps(vq31prod1x02), _mm_castsi128_ps(vq31prod1x13), _MM_SHUFFLE(2, 0, 2, 0)));
+ const __m128i vq31prod2x0213 = _mm_castps_si128(_mm_shuffle_ps(
+ _mm_castsi128_ps(vq31prod2x02), _mm_castsi128_ps(vq31prod2x13), _MM_SHUFFLE(2, 0, 2, 0)));
+ const __m128i vq31prod3x0213 = _mm_castps_si128(_mm_shuffle_ps(
+ _mm_castsi128_ps(vq31prod3x02), _mm_castsi128_ps(vq31prod3x13), _MM_SHUFFLE(2, 0, 2, 0)));
+
+ const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));  // ...then restore lane order 0,1,2,3
+ const __m128i vq31prod1x0123 = _mm_shuffle_epi32(vq31prod1x0213, _MM_SHUFFLE(3, 1, 2, 0));
+ const __m128i vq31prod2x0123 = _mm_shuffle_epi32(vq31prod2x0213, _MM_SHUFFLE(3, 1, 2, 0));
+ const __m128i vq31prod3x0123 = _mm_shuffle_epi32(vq31prod3x0213, _MM_SHUFFLE(3, 1, 2, 0));
+
+ const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
+ const __m128i vrem0x0123 =
+ _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
+ const __m128i vrem1x0123 =
+ _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
+ const __m128i vrem2x0123 =
+ _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
+ const __m128i vrem3x0123 =
+ _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
+
+ const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
+ const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse2.shift);  // 64-bit load suffices: _mm_sra_epi32 reads only the low 64 bits of the count
+ vacc0x0123 =
+ _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));  // arithmetic shift with round-to-nearest correction from the remainder
+ vacc1x0123 =
+ _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
+ vacc2x0123 =
+ _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
+ vacc3x0123 =
+ _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
+
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
+ __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);  // pack to s16 and add the output zero point with saturation
+ __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+
+ __m128i vout = _mm_packus_epi16(vacc01x0123, vacc23x0123);  // saturate to u8: one 32-bit lane per row
+
+ vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min));  // clamp to the requested output range
+ vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max));
+
+ if (nc >= 4) {
+ *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(3, 3, 3, 3)));  // SSE2 has no _mm_extract_epi32: shuffle the wanted lane down, then cvtsi128_si32
+ c3 = (uint8_t*) ((uintptr_t) c3 + cn_stride);
+ *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(2, 2, 2, 2)));
+ c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
+ *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1)));
+ c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
+ *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+ c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const uint8_t**restrict) ((uintptr_t) a - ks);  // rewind the indirection buffer for the next column tile
+
+ nc -= 4;
+ } else {
+ if (nc & 2) {
+ *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
+ c3 += 2;
+ *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+ c2 += 2;
+ *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+ c1 += 2;
+ *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+ c0 += 2;
+ vout = _mm_srli_epi32(vout, 16);  // shift the remaining byte of each row into position 0 of its lane
+ }
+ if (nc & 1) {
+ *c3 = (uint8_t) _mm_extract_epi16(vout, 6);  // byte extraction via 16-bit extract: _mm_extract_epi8 requires SSE4.1
+ *c2 = (uint8_t) _mm_extract_epi16(vout, 4);
+ *c1 = (uint8_t) _mm_extract_epi16(vout, 2);
+ *c0 = (uint8_t) _mm_cvtsi128_si32(vout);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qu8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c b/src/qu8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
new file mode 100644
index 0000000..cc53eaf
--- /dev/null
+++ b/src/qu8-igemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
@@ -0,0 +1,306 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/MRx4c2-sse.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_igemm_minmax_gemmlowp_ukernel_4x4c2__sse41_ld64(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const uint8_t** restrict a,
+ const void* restrict w,
+ uint8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const uint8_t* zero,
+ const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (4 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(uint8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 2);
+ uint8_t* c0 = c;
+ uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ c3 = c2;
+ }
+
+ do {
+ __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
+ __m128i vacc1x0123 = vacc0x0123;
+ __m128i vacc2x0123 = vacc0x0123;
+ __m128i vacc3x0123 = vacc0x0123;
+ w = (const void*) ((const int32_t*) w + 4);
+
+ size_t p = ks;
+ do {
+ const uint8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ const uint8_t* restrict a1 = a[1];
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
+ }
+ const uint8_t* restrict a2 = a[2];
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const uint8_t*) ((uintptr_t) a2 + a_offset);
+ }
+ const uint8_t* restrict a3 = a[3];
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const uint8_t*) ((uintptr_t) a3 + a_offset);
+ }
+ a += 4;
+
+ size_t k = kc;
+ const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.kernel_zero_point);
+ while (k >= 8 * sizeof(uint8_t)) {
+ const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+ const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
+ a0 += 8;
+ const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+ const __m128i vxa1 = _mm_cvtepu8_epi16(va1);
+ a1 += 8;
+ const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+ const __m128i vxa2 = _mm_cvtepu8_epi16(va2);
+ a2 += 8;
+ const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+ const __m128i vxa3 = _mm_cvtepu8_epi16(va3);
+ a3 += 8;
+
+ const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+ const __m128i vxb0 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb0), vb_zero_point);
+
+ vacc0x0123 = _mm_add_epi32(vacc0x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+ vacc1x0123 = _mm_add_epi32(vacc1x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+ vacc2x0123 = _mm_add_epi32(vacc2x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+ vacc3x0123 = _mm_add_epi32(vacc3x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
+ const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point);
+
+ vacc0x0123 = _mm_add_epi32(vacc0x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+ vacc1x0123 = _mm_add_epi32(vacc1x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+ vacc2x0123 = _mm_add_epi32(vacc2x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+ vacc3x0123 = _mm_add_epi32(vacc3x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
+ const __m128i vxb2 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb2), vb_zero_point);
+
+ vacc0x0123 = _mm_add_epi32(vacc0x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ vacc1x0123 = _mm_add_epi32(vacc1x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ vacc2x0123 = _mm_add_epi32(vacc2x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ vacc3x0123 = _mm_add_epi32(vacc3x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
+ const __m128i vxb3 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb3), vb_zero_point);
+
+ vacc0x0123 = _mm_add_epi32(vacc0x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+ vacc1x0123 = _mm_add_epi32(vacc1x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+ vacc2x0123 = _mm_add_epi32(vacc2x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+ vacc3x0123 = _mm_add_epi32(vacc3x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+ w = (const void*) ((const uint8_t*) w + 32);
+ k -= 8 * sizeof(uint8_t);
+ }
+ if (k != 0) {
+ const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+ const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
+ a0 = (const uint8_t*) ((uintptr_t) a0 + k);
+ const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+ const __m128i vxa1 = _mm_cvtepu8_epi16(va1);
+ a1 = (const uint8_t*) ((uintptr_t) a1 + k);
+ const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+ const __m128i vxa2 = _mm_cvtepu8_epi16(va2);
+ a2 = (const uint8_t*) ((uintptr_t) a2 + k);
+ const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+ const __m128i vxa3 = _mm_cvtepu8_epi16(va3);
+ a3 = (const uint8_t*) ((uintptr_t) a3 + k);
+
+ const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+ w = (const void*) ((const uint8_t*) w + 8);
+ const __m128i vxb0 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb0), vb_zero_point);
+
+ vacc0x0123 = _mm_add_epi32(vacc0x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+ vacc1x0123 = _mm_add_epi32(vacc1x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+ vacc2x0123 = _mm_add_epi32(vacc2x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+ vacc3x0123 = _mm_add_epi32(vacc3x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+ if (k > 2 * sizeof(uint8_t)) {
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+ w = (const void*) ((const uint8_t*) w + 8);
+ const __m128i vxb1 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb1), vb_zero_point);
+
+ vacc0x0123 = _mm_add_epi32(vacc0x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+ vacc1x0123 = _mm_add_epi32(vacc1x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+ vacc2x0123 = _mm_add_epi32(vacc2x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+ vacc3x0123 = _mm_add_epi32(vacc3x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+ if (k > 4 * sizeof(uint8_t)) {
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+ w = (const void*) ((const uint8_t*) w + 8);
+ const __m128i vxb2 = _mm_sub_epi16(_mm_cvtepu8_epi16(vb2), vb_zero_point);
+
+ vacc0x0123 = _mm_add_epi32(vacc0x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ vacc1x0123 = _mm_add_epi32(vacc1x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ vacc2x0123 = _mm_add_epi32(vacc2x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ vacc3x0123 = _mm_add_epi32(vacc3x0123,
+ _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+ }
+ }
+ }
+ p -= 4 * sizeof(void*);
+ } while (p != 0);
+
+ const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
+ const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
+
+ const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i vacc3x1133 = _mm_shuffle_epi32(vacc3x0123, _MM_SHUFFLE(3, 3, 1, 1));
+
+ const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
+ const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
+ const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
+ const __m128i vprod3x02 = _mm_add_epi64(_mm_mul_epi32(vacc3x0123, vmultiplier), vrounding);
+
+ const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
+ const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
+ const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
+ const __m128i vprod3x13 = _mm_add_epi64(_mm_mul_epi32(vacc3x1133, vmultiplier), vrounding);
+
+ const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
+ const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
+ const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
+ const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
+ const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
+ const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
+ const __m128i vq31prod3x02 = _mm_srli_epi64(vprod3x02, 31);
+ const __m128i vq31prod3x13 = _mm_add_epi64(vprod3x13, vprod3x13);
+
+ const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
+ const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
+ const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
+ const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
+
+ const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
+ const __m128i vrem0x0123 =
+ _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
+ const __m128i vrem1x0123 =
+ _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
+ const __m128i vrem2x0123 =
+ _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
+ const __m128i vrem3x0123 =
+ _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
+
+ const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
+ const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse2.shift);
+ vacc0x0123 =
+ _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
+ vacc1x0123 =
+ _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
+ vacc2x0123 =
+ _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
+ vacc3x0123 =
+ _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
+
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
+ __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+ __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+
+ __m128i vout = _mm_packus_epi16(vacc01x0123, vacc23x0123);
+
+ vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min));
+ vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max));
+
+ if (nc >= 4) {
+ *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
+ c3 = (uint8_t*) ((uintptr_t) c3 + cn_stride);
+ *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+ c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
+ *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+ c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
+ *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+ c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const uint8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 4;
+ } else {
+ if (nc & 2) {
+ *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
+ c3 += 2;
+ *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+ c2 += 2;
+ *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+ c1 += 2;
+ *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+ c0 += 2;
+ vout = _mm_srli_epi32(vout, 16);
+ }
+ if (nc & 1) {
+ *c3 = (uint8_t) _mm_extract_epi8(vout, 12);
+ *c2 = (uint8_t) _mm_extract_epi8(vout, 8);
+ *c1 = (uint8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (uint8_t) _mm_extract_epi8(vout, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qu8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c b/src/qu8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
new file mode 100644
index 0000000..901475e
--- /dev/null
+++ b/src/qu8-igemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
@@ -0,0 +1,347 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/MRx4c2-sse.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <tmmintrin.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_igemm_minmax_gemmlowp_ukernel_4x4c2__ssse3_ld64(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const uint8_t** restrict a,
+    const void* restrict w,
+    uint8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const uint8_t* zero,
+    const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(uint8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  // Weights are packed in pairs of bytes ("c2"), so kc is rounded up to 2.
+  kc = round_up_po2(kc, 2);
+  uint8_t* c0 = c;
+  uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;  // fewer rows than MR: alias the unused row pointers downward
+  }
+  uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+  // Outer loop: one iteration per group of 4 output columns.
+  do {
+    __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);  // 4 int32 biases; every row starts from the same bias
+    __m128i vacc1x0123 = vacc0x0123;
+    __m128i vacc2x0123 = vacc0x0123;
+    __m128i vacc3x0123 = vacc0x0123;
+    w = (const void*) ((const int32_t*) w + 4);
+    // Walk the indirection buffer, 4 input-row pointers at a time.
+    size_t p = ks;
+    do {
+      const uint8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);  // apply batch offset only to real rows, not the `zero` padding row
+      }
+      const uint8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const uint8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const uint8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const uint8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const uint8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+      // Main K loop: consume 8 uint8 activations per row per iteration.
+      size_t k = kc;
+      const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.kernel_zero_point);  // weight zero point, replicated across lanes
+      const __m128i vzero = _mm_setzero_si128();
+      while (k >= 8 * sizeof(uint8_t)) {
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);  // zero-extend u8 -> u16 (SSSE3 has no pmovzx)
+        a0 += 8;
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero);
+        a1 += 8;
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_unpacklo_epi8(va2, vzero);
+        a2 += 8;
+        const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+        const __m128i vxa3 = _mm_unpacklo_epi8(va3, vzero);
+        a3 += 8;
+        // k-pair 0: broadcast activation pair 0 of each row, madd against 4x2 weight panel.
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 8));
+        const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+        const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 16));
+        const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+        const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((const uint8_t*) w + 24));
+        const __m128i vxb3 = _mm_sub_epi16(_mm_unpacklo_epi8(vb3, vzero), vb_zero_point);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
+
+        w = (const void*) ((const uint8_t*) w + 32);
+        k -= 8 * sizeof(uint8_t);
+      }
+      if (k != 0) {  // remainder of 2, 4, or 6 elements (kc was rounded up to 2)
+        const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+        const __m128i vxa0 = _mm_unpacklo_epi8(va0, vzero);
+        a0 = (const uint8_t*) ((uintptr_t) a0 + k);
+        const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+        const __m128i vxa1 = _mm_unpacklo_epi8(va1, vzero);
+        a1 = (const uint8_t*) ((uintptr_t) a1 + k);
+        const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+        const __m128i vxa2 = _mm_unpacklo_epi8(va2, vzero);
+        a2 = (const uint8_t*) ((uintptr_t) a2 + k);
+        const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
+        const __m128i vxa3 = _mm_unpacklo_epi8(va3, vzero);
+        a3 = (const uint8_t*) ((uintptr_t) a3 + k);
+
+        const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
+        w = (const void*) ((const uint8_t*) w + 8);
+        const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb0, vzero), vb_zero_point);
+
+        vacc0x0123 = _mm_add_epi32(vacc0x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc1x0123 = _mm_add_epi32(vacc1x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc2x0123 = _mm_add_epi32(vacc2x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+        vacc3x0123 = _mm_add_epi32(vacc3x0123,
+          _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
+
+        if (k > 2 * sizeof(uint8_t)) {
+          const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
+          w = (const void*) ((const uint8_t*) w + 8);
+          const __m128i vxb1 = _mm_sub_epi16(_mm_unpacklo_epi8(vb1, vzero), vb_zero_point);
+
+          vacc0x0123 = _mm_add_epi32(vacc0x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc1x0123 = _mm_add_epi32(vacc1x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc2x0123 = _mm_add_epi32(vacc2x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+          vacc3x0123 = _mm_add_epi32(vacc3x0123,
+            _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
+
+          if (k > 4 * sizeof(uint8_t)) {
+            const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
+            w = (const void*) ((const uint8_t*) w + 8);
+            const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb2, vzero), vb_zero_point);
+
+            vacc0x0123 = _mm_add_epi32(vacc0x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc1x0123 = _mm_add_epi32(vacc1x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc2x0123 = _mm_add_epi32(vacc2x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+            vacc3x0123 = _mm_add_epi32(vacc3x0123,
+              _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
+          }
+        }
+      }
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+    // Requantize (gemmlowp): Q31 rounding-doubling high multiply, then rounding arithmetic right shift.
+    const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
+    const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
+
+    const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);  // negative-lane masks: pmuludq is unsigned, so multiply |acc| and restore sign below
+    const __m128i vnmask1x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc1x0123);
+    const __m128i vnmask2x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc2x0123);
+    const __m128i vnmask3x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc3x0123);
+
+    const __m128i vabsacc0x0123 = _mm_abs_epi32(vacc0x0123);
+    const __m128i vabsacc1x0123 = _mm_abs_epi32(vacc1x0123);
+    const __m128i vabsacc2x0123 = _mm_abs_epi32(vacc2x0123);
+    const __m128i vabsacc3x0123 = _mm_abs_epi32(vacc3x0123);
+
+    const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));  // move odd lanes into even positions for pmuludq
+    const __m128i vabsacc1x1133 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128i vabsacc2x1133 = _mm_shuffle_epi32(vabsacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128i vabsacc3x1133 = _mm_shuffle_epi32(vabsacc3x0123, _MM_SHUFFLE(3, 3, 1, 1));
+
+    const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
+    const __m128i vabsprod1x02 = _mm_mul_epu32(vabsacc1x0123, vmultiplier);
+    const __m128i vabsprod2x02 = _mm_mul_epu32(vabsacc2x0123, vmultiplier);
+    const __m128i vabsprod3x02 = _mm_mul_epu32(vabsacc3x0123, vmultiplier);
+
+    const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
+    const __m128i vnmask1x02 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(2, 2, 0, 0));
+    const __m128i vnmask2x02 = _mm_shuffle_epi32(vnmask2x0123, _MM_SHUFFLE(2, 2, 0, 0));
+    const __m128i vnmask3x02 = _mm_shuffle_epi32(vnmask3x0123, _MM_SHUFFLE(2, 2, 0, 0));
+
+    const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);  // two's-complement negate (xor+sub) where acc was negative
+    const __m128i vprod1x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x02, vnmask1x02), vnmask1x02);
+    const __m128i vprod2x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod2x02, vnmask2x02), vnmask2x02);
+    const __m128i vprod3x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod3x02, vnmask3x02), vnmask3x02);
+
+    const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);  // add rounding term, keep the Q31 high part
+    const __m128i vq31prod1x02 = _mm_srli_epi64(_mm_add_epi64(vprod1x02, vrounding), 31);
+    const __m128i vq31prod2x02 = _mm_srli_epi64(_mm_add_epi64(vprod2x02, vrounding), 31);
+    const __m128i vq31prod3x02 = _mm_srli_epi64(_mm_add_epi64(vprod3x02, vrounding), 31);
+
+    const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
+    const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1133, vmultiplier);
+    const __m128i vabsprod2x13 = _mm_mul_epu32(vabsacc2x1133, vmultiplier);
+    const __m128i vabsprod3x13 = _mm_mul_epu32(vabsacc3x1133, vmultiplier);
+
+    const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128i vnmask1x13 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128i vnmask2x13 = _mm_shuffle_epi32(vnmask2x0123, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128i vnmask3x13 = _mm_shuffle_epi32(vnmask3x0123, _MM_SHUFFLE(3, 3, 1, 1));
+
+    const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
+    const __m128i vprod1x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x13, vnmask1x13), vnmask1x13);
+    const __m128i vprod2x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod2x13, vnmask2x13), vnmask2x13);
+    const __m128i vprod3x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod3x13, vnmask3x13), vnmask3x13);
+
+    const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
+    const __m128i vq31prod1x13 = _mm_srli_epi64(_mm_add_epi64(vprod1x13, vrounding), 31);
+    const __m128i vq31prod2x13 = _mm_srli_epi64(_mm_add_epi64(vprod2x13, vrounding), 31);
+    const __m128i vq31prod3x13 = _mm_srli_epi64(_mm_add_epi64(vprod3x13, vrounding), 31);
+
+    const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(  // interleave even/odd Q31 results (lane order 0,2,1,3)
+      _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
+    const __m128i vq31prod1x0213 = _mm_castps_si128(_mm_shuffle_ps(
+      _mm_castsi128_ps(vq31prod1x02), _mm_castsi128_ps(vq31prod1x13), _MM_SHUFFLE(2, 0, 2, 0)));
+    const __m128i vq31prod2x0213 = _mm_castps_si128(_mm_shuffle_ps(
+      _mm_castsi128_ps(vq31prod2x02), _mm_castsi128_ps(vq31prod2x13), _MM_SHUFFLE(2, 0, 2, 0)));
+    const __m128i vq31prod3x0213 = _mm_castps_si128(_mm_shuffle_ps(
+      _mm_castsi128_ps(vq31prod3x02), _mm_castsi128_ps(vq31prod3x13), _MM_SHUFFLE(2, 0, 2, 0)));
+
+    const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));  // restore natural lane order 0,1,2,3
+    const __m128i vq31prod1x0123 = _mm_shuffle_epi32(vq31prod1x0213, _MM_SHUFFLE(3, 1, 2, 0));
+    const __m128i vq31prod2x0123 = _mm_shuffle_epi32(vq31prod2x0213, _MM_SHUFFLE(3, 1, 2, 0));
+    const __m128i vq31prod3x0123 = _mm_shuffle_epi32(vq31prod3x0213, _MM_SHUFFLE(3, 1, 2, 0));
+    // Rounding arithmetic right shift with remainder-based correction for negative values.
+    const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
+    const __m128i vrem0x0123 =
+      _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
+    const __m128i vrem1x0123 =
+      _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
+    const __m128i vrem2x0123 =
+      _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
+    const __m128i vrem3x0123 =
+      _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
+
+    const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
+    const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse2.shift);
+    vacc0x0123 =
+      _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
+    vacc1x0123 =
+      _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
+    vacc2x0123 =
+      _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
+    vacc3x0123 =
+      _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
+    // Convert to uint8: add output zero point, pack with saturation, then clamp to [output_min, output_max].
+    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
+    __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+    __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+
+    __m128i vout = _mm_packus_epi16(vacc01x0123, vacc23x0123);
+
+    vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min));
+    vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max));
+
+    if (nc >= 4) {
+      *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(3, 3, 3, 3)));  // 32-bit lane 3 holds row 3 (pre-SSE4.1 extract)
+      c3 = (uint8_t*) ((uintptr_t) c3 + cn_stride);
+      *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(2, 2, 2, 2)));
+      c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
+      *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(_mm_shuffle_epi32(vout, _MM_SHUFFLE(1, 1, 1, 1)));
+      c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
+      *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+      c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
+      // Rewind the indirection pointer for the next group of columns.
+      a = (const uint8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 4;
+    } else {
+      if (nc & 2) {  // partial tile: store 2 bytes per row, then shift the next byte into position
+        *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
+        c3 += 2;
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+        c2 += 2;
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+        c1 += 2;
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+        c0 += 2;
+        vout = _mm_srli_epi32(vout, 16);
+      }
+      if (nc & 1) {
+        *c3 = (uint8_t) _mm_extract_epi16(vout, 6);  // low byte of each row's 32-bit lane
+        *c2 = (uint8_t) _mm_extract_epi16(vout, 4);
+        *c1 = (uint8_t) _mm_extract_epi16(vout, 2);
+        *c0 = (uint8_t) _mm_cvtsi128_si32(vout);
+      }
+      // Tail handled; terminate the column loop.
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index 64fe19e..207326c 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -502,8 +502,12 @@
DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_gemmlowp_ukernel_4x8__neon)
DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_gemmlowp_ukernel_8x8__neon)
-DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse2)
-DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse2)
+DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse2_ld64)
+DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_gemmlowp_ukernel_2x4c8__ssse3_ld64)
+DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse41_ld64)
+DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse2_ld64)
+DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_gemmlowp_ukernel_4x4c2__ssse3_ld64)
+DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse41_ld64)
DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_gemmlowp_ukernel_2x2__scalar)
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index 726452b..6c9fdf4 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -311,7 +311,12 @@
DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_gemmlowp_ukernel_4x8__neon)
DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_gemmlowp_ukernel_8x8__neon)
-DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_gemmlowp_ukernel_4x4c2__sse2)
+DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_gemmlowp_ukernel_2x4c8__sse2_ld64)
+DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_gemmlowp_ukernel_2x4c8__ssse3_ld64)
+DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_gemmlowp_ukernel_2x4c8__sse41_ld64)
+DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_gemmlowp_ukernel_4x4c2__sse2_ld64)
+DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_gemmlowp_ukernel_4x4c2__ssse3_ld64)
+DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_gemmlowp_ukernel_4x4c2__sse41_ld64)
DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_gemmlowp_ukernel_2x2__scalar)