Fuse rounding term into bias in QS8 & QU8 VADD[C] microkernels
PiperOrigin-RevId: 395397695
diff --git a/src/params-init.c b/src/params-init.c
index 29de601..d7b5c7c 100644
--- a/src/params-init.c
+++ b/src/params-init.c
@@ -1973,7 +1973,7 @@
const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
const int32_t rounding = INT32_C(1) << (shift - 1);
- const int32_t bias = (int32_t) -(a_multiplier * (int32_t) a_zero_point + b_multiplier * (int32_t) b_zero_point);
+ const int32_t bias = rounding - a_multiplier * (int32_t) a_zero_point - b_multiplier * (int32_t) b_zero_point;
for (uint32_t i = 0; i < 4; i++) {
params->sse2.bias[i] = bias;
}
@@ -1989,9 +1989,6 @@
}
params->sse2.shift = shift;
params->sse2.b_multiplier = (uint32_t) b_multiplier;
- for (uint32_t i = 0; i < 4; i++) {
- params->sse2.rounding[i] = rounding;
- }
for (uint32_t i = 0; i < 8; i++) {
params->sse2.output_zero_point[i] = (int16_t) (uint16_t) output_zero_point;
}
@@ -2041,12 +2038,11 @@
const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
const int32_t rounding = INT32_C(1) << (shift - 1);
- const int32_t bias = (int32_t) -(a_multiplier * (int32_t) (uint32_t) a_zero_point + b_multiplier * (int32_t) (uint32_t) b_zero_point);
+ const int32_t bias = rounding - a_multiplier * (int32_t) (uint32_t) a_zero_point - b_multiplier * (int32_t) (uint32_t) b_zero_point;
for (uint32_t i = 0; i < 4; i++) {
params->sse4.bias[i] = bias;
params->sse4.a_multiplier[i] = a_multiplier;
params->sse4.b_multiplier[i] = b_multiplier;
- params->sse4.rounding[i] = rounding;
params->sse4.shift[i] = shift;
}
for (uint32_t i = 0; i < 8; i++) {
@@ -2098,12 +2094,11 @@
const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
const int32_t rounding = INT32_C(1) << (shift - 1);
- const int32_t bias = (int32_t) -(a_multiplier * (int32_t) (uint32_t) a_zero_point + b_multiplier * (int32_t) (uint32_t) b_zero_point);
+ const int32_t bias = rounding - a_multiplier * (int32_t) (uint32_t) a_zero_point - b_multiplier * (int32_t) (uint32_t) b_zero_point;
for (uint32_t i = 0; i < 8; i++) {
params->avx2.bias[i] = bias;
params->avx2.a_multiplier[i] = a_multiplier;
params->avx2.b_multiplier[i] = b_multiplier;
- params->avx2.rounding[i] = rounding;
params->avx2.shift[i] = shift;
}
for (uint32_t i = 0; i < 16; i++) {
@@ -2153,12 +2148,11 @@
const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
const int32_t rounding = INT32_C(1) << (shift - 1);
- const int32_t bias = (int32_t) -(a_multiplier * (int32_t) (uint32_t) a_zero_point + b_multiplier * (int32_t) (uint32_t) b_zero_point);
+ const int32_t bias = rounding - a_multiplier * (int32_t) (uint32_t) a_zero_point - b_multiplier * (int32_t) (uint32_t) b_zero_point;
for (uint32_t i = 0; i < 16; i++) {
params->avx512.bias[i] = bias;
params->avx512.a_multiplier[i] = a_multiplier;
params->avx512.b_multiplier[i] = b_multiplier;
- params->avx512.rounding[i] = rounding;
params->avx512.shift[i] = shift;
}
for (uint32_t i = 0; i < 32; i++) {
@@ -2261,12 +2255,11 @@
const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
const int32_t rounding = INT32_C(1) << (shift - 1);
- const int32_t bias = (int32_t) -(a_multiplier * (int32_t) (uint32_t) a_zero_point + b_multiplier * (int32_t) (uint32_t) b_zero_point);
+ const int32_t bias = rounding - a_multiplier * (int32_t) (uint32_t) a_zero_point - b_multiplier * (int32_t) (uint32_t) b_zero_point;
for (uint32_t i = 0; i < 4; i++) {
params->wasmsimd.bias[i] = bias;
params->wasmsimd.a_multiplier[i] = a_multiplier;
params->wasmsimd.b_multiplier[i] = b_multiplier;
- params->wasmsimd.rounding[i] = rounding;
}
params->wasmsimd.shift = shift;
for (uint32_t i = 0; i < 8; i++) {
@@ -2319,10 +2312,9 @@
const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
const int32_t rounding = INT32_C(1) << (shift - 1);
- params->scalar.bias = (int32_t) -(a_multiplier * (int32_t) (uint32_t) a_zero_point + b_multiplier * (int32_t) (uint32_t) b_zero_point);
+ params->scalar.bias = rounding - a_multiplier * (int32_t) (uint32_t) a_zero_point - b_multiplier * (int32_t) (uint32_t) b_zero_point;
params->scalar.a_multiplier = a_multiplier;
params->scalar.b_multiplier = b_multiplier;
- params->scalar.rounding = rounding;
params->scalar.shift = shift;
params->scalar.output_min_less_zero_point = (int32_t) (uint32_t) output_min - (int32_t) (uint32_t) output_zero_point;
params->scalar.output_max_less_zero_point = (int32_t) (uint32_t) output_max - (int32_t) (uint32_t) output_zero_point;
@@ -2370,7 +2362,7 @@
const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
const int32_t rounding = INT32_C(1) << (shift - 1);
- const int32_t bias = (int32_t) -(a_multiplier * (int32_t) a_zero_point + b_multiplier * (int32_t) b_zero_point);
+ const int32_t bias = rounding - a_multiplier * (int32_t) a_zero_point - b_multiplier * (int32_t) b_zero_point;
for (uint32_t i = 0; i < 4; i++) {
params->sse2.bias[i] = bias;
}
@@ -2386,9 +2378,6 @@
}
params->sse2.shift = shift;
params->sse2.b_multiplier = (uint32_t) b_multiplier;
- for (uint32_t i = 0; i < 4; i++) {
- params->sse2.rounding[i] = rounding;
- }
for (uint32_t i = 0; i < 8; i++) {
params->sse2.output_zero_point[i] = (int16_t) output_zero_point;
params->sse2.output_min[i] = (int16_t) output_min;
@@ -2436,7 +2425,7 @@
const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
const int32_t rounding = INT32_C(1) << (shift - 1);
- const int32_t bias = (int32_t) -(a_multiplier * (int32_t) a_zero_point + b_multiplier * (int32_t) b_zero_point);
+ const int32_t bias = rounding - a_multiplier * (int32_t) a_zero_point - b_multiplier * (int32_t) b_zero_point;
for (uint32_t i = 0; i < 4; i++) {
params->sse4_mul16.bias[i] = bias;
}
@@ -2452,9 +2441,6 @@
}
params->sse4_mul16.shift = shift;
params->sse4_mul16.b_multiplier = (uint32_t) b_multiplier;
- for (uint32_t i = 0; i < 4; i++) {
- params->sse4_mul16.rounding[i] = rounding;
- }
for (uint32_t i = 0; i < 8; i++) {
params->sse4_mul16.output_zero_point[i] = (int16_t) output_zero_point;
}
@@ -2504,12 +2490,11 @@
const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
const int32_t rounding = INT32_C(1) << (shift - 1);
- const int32_t bias = (int32_t) -(a_multiplier * (int32_t) a_zero_point + b_multiplier * (int32_t) b_zero_point);
+ const int32_t bias = rounding - a_multiplier * (int32_t) a_zero_point - b_multiplier * (int32_t) b_zero_point;
for (uint32_t i = 0; i < 4; i++) {
params->sse4_mul32.bias[i] = bias;
params->sse4_mul32.a_multiplier[i] = a_multiplier;
params->sse4_mul32.b_multiplier[i] = b_multiplier;
- params->sse4_mul32.rounding[i] = rounding;
params->sse4_mul32.shift[i] = shift;
}
for (uint32_t i = 0; i < 8; i++) {
@@ -2561,12 +2546,11 @@
const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
const int32_t rounding = INT32_C(1) << (shift - 1);
- const int32_t bias = (int32_t) -(a_multiplier * (int32_t) a_zero_point + b_multiplier * (int32_t) b_zero_point);
+ const int32_t bias = rounding - a_multiplier * (int32_t) a_zero_point - b_multiplier * (int32_t) b_zero_point;
for (uint32_t i = 0; i < 8; i++) {
params->avx2.bias[i] = bias;
params->avx2.a_multiplier[i] = a_multiplier;
params->avx2.b_multiplier[i] = b_multiplier;
- params->avx2.rounding[i] = rounding;
params->avx2.shift[i] = shift;
}
for (uint32_t i = 0; i < 16; i++) {
@@ -2616,12 +2600,11 @@
const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
const int32_t rounding = INT32_C(1) << (shift - 1);
- const int32_t bias = (int32_t) -(a_multiplier * (int32_t) a_zero_point + b_multiplier * (int32_t) b_zero_point);
+ const int32_t bias = rounding - a_multiplier * (int32_t) a_zero_point - b_multiplier * (int32_t) b_zero_point;
for (uint32_t i = 0; i < 16; i++) {
params->avx512.bias[i] = bias;
params->avx512.a_multiplier[i] = a_multiplier;
params->avx512.b_multiplier[i] = b_multiplier;
- params->avx512.rounding[i] = rounding;
params->avx512.shift[i] = shift;
}
for (uint32_t i = 0; i < 32; i++) {
@@ -2724,12 +2707,11 @@
const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
const int32_t rounding = INT32_C(1) << (shift - 1);
- const int32_t bias = (int32_t) -(a_multiplier * (int32_t) a_zero_point + b_multiplier * (int32_t) b_zero_point);
+ const int32_t bias = rounding - a_multiplier * (int32_t) a_zero_point - b_multiplier * (int32_t) b_zero_point;
for (uint32_t i = 0; i < 4; i++) {
params->wasmsimd.bias[i] = bias;
params->wasmsimd.a_multiplier[i] = a_multiplier;
params->wasmsimd.b_multiplier[i] = b_multiplier;
- params->wasmsimd.rounding[i] = rounding;
}
params->wasmsimd.shift = shift;
for (uint32_t i = 0; i < 8; i++) {
@@ -2782,10 +2764,9 @@
const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;
const int32_t rounding = INT32_C(1) << (shift - 1);
- params->scalar.bias = (int32_t) -(a_multiplier * (int32_t) a_zero_point + b_multiplier * (int32_t) b_zero_point);
+ params->scalar.bias = rounding - a_multiplier * (int32_t) a_zero_point - b_multiplier * (int32_t) b_zero_point;
params->scalar.a_multiplier = a_multiplier;
params->scalar.b_multiplier = b_multiplier;
- params->scalar.rounding = rounding;
params->scalar.shift = shift;
params->scalar.output_min_less_zero_point = (int32_t) output_min - (int32_t) output_zero_point;
params->scalar.output_max_less_zero_point = (int32_t) output_max - (int32_t) output_zero_point;
diff --git a/src/qs8-vadd/avx2-mul32-ld64.c.in b/src/qs8-vadd/avx2-mul32-ld64.c.in
index 59981d6..8e8b4e0 100644
--- a/src/qs8-vadd/avx2-mul32-ld64.c.in
+++ b/src/qs8-vadd/avx2-mul32-ld64.c.in
@@ -30,7 +30,6 @@
const __m256i vbias = _mm256_load_si256((const __m256i*) params->avx2.bias);
const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
const __m256i vb_multiplier = _mm256_load_si256((const __m256i*) params->avx2.b_multiplier);
- const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx2.shift);
$if BATCH_TILE > 8:
const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
@@ -55,7 +54,7 @@
vacc${ABC[N:N+8]} = _mm256_add_epi32(vacc${ABC[N:N+8]}, _mm256_mullo_epi32(vb${ABC[N:N+8]}, vb_multiplier));
$for N in range(0, BATCH_TILE, 8):
- vacc${ABC[N:N+8]} = _mm256_sra_epi32(_mm256_add_epi32(vacc${ABC[N:N+8]}, vrounding), vshift);
+ vacc${ABC[N:N+8]} = _mm256_sra_epi32(vacc${ABC[N:N+8]}, vshift);
$for N in range(0, BATCH_TILE, 16):
$if N + 8 < BATCH_TILE:
@@ -106,7 +105,7 @@
vacc${ABC[0:8]} = _mm256_add_epi32(vacc${ABC[0:8]}, _mm256_mullo_epi32(vb${ABC[0:8]}, vb_multiplier));
- vacc${ABC[0:8]} = _mm256_sra_epi32(_mm256_add_epi32(vacc${ABC[0:8]}, vrounding), vshift);
+ vacc${ABC[0:8]} = _mm256_sra_epi32(vacc${ABC[0:8]}, vshift);
$if BATCH_TILE > 8:
__m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc${ABC[0:8]}), _mm256_extracti128_si256(vacc${ABC[0:8]}, 1)), _mm256_castsi256_si128(voutput_zero_point));
diff --git a/src/qs8-vadd/avx512skx-mul32-ld128.c.in b/src/qs8-vadd/avx512skx-mul32-ld128.c.in
index 43a967d..5053e7d 100644
--- a/src/qs8-vadd/avx512skx-mul32-ld128.c.in
+++ b/src/qs8-vadd/avx512skx-mul32-ld128.c.in
@@ -33,7 +33,6 @@
const __m512i vbias = _mm512_load_si512(params->avx512.bias);
const __m512i va_multiplier = _mm512_load_si512(params->avx512.a_multiplier);
const __m512i vb_multiplier = _mm512_load_si512(params->avx512.b_multiplier);
- const __m512i vrounding = _mm512_load_si512(params->avx512.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx512.shift);
$if BATCH_TILE > 16:
const __m512i voutput_zero_point = _mm512_load_si512(params->avx512.output_zero_point);
@@ -60,7 +59,7 @@
vacc${ABC[N:N+16]} = _mm512_add_epi32(vacc${ABC[N:N+16]}, _mm512_mullo_epi32(vb${ABC[N:N+16]}, vb_multiplier));
$for N in range(0, BATCH_TILE, 16):
- vacc${ABC[N:N+16]} = _mm512_sra_epi32(_mm512_add_epi32(vacc${ABC[N:N+16]}, vrounding), vshift);
+ vacc${ABC[N:N+16]} = _mm512_sra_epi32(vacc${ABC[N:N+16]}, vshift);
$for N in range(0, BATCH_TILE, 32):
$if N + 16 < BATCH_TILE:
@@ -119,7 +118,7 @@
vacc${ABC[0:16]} = _mm512_add_epi32(vacc${ABC[0:16]}, _mm512_mullo_epi32(vb${ABC[0:16]}, vb_multiplier));
- vacc${ABC[0:16]} = _mm512_sra_epi32(_mm512_add_epi32(vacc${ABC[0:16]}, vrounding), vshift);
+ vacc${ABC[0:16]} = _mm512_sra_epi32(vacc${ABC[0:16]}, vshift);
$if BATCH_TILE > 16:
__m256i vout${ABC[0:4]}${ABC[8:12]}${ABC[4:8]}${ABC[12:16]} = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc${ABC[0:16]}), _mm512_extracti32x8_epi32(vacc${ABC[0:16]}, 1)), _mm512_castsi512_si256(voutput_zero_point));
diff --git a/src/qs8-vadd/gen/minmax-avx-mul16-ld64-x16.c b/src/qs8-vadd/gen/minmax-avx-mul16-ld64-x16.c
index 4153780..deb836d 100644
--- a/src/qs8-vadd/gen/minmax-avx-mul16-ld64-x16.c
+++ b/src/qs8-vadd/gen/minmax-avx-mul16-ld64-x16.c
@@ -26,7 +26,6 @@
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_hi);
const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse4_mul16.b_multiplier_lo);
const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.b_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul16.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse4_mul16.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul16.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul16.output_min);
@@ -70,10 +69,10 @@
vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vbprod89ABCDEFlo, vbprod89ABCDEFhi));
vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vbprod89ABCDEFlo, vbprod89ABCDEFhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -113,8 +112,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-avx-mul16-ld64-x24.c b/src/qs8-vadd/gen/minmax-avx-mul16-ld64-x24.c
index 3bf276f..1051c5c 100644
--- a/src/qs8-vadd/gen/minmax-avx-mul16-ld64-x24.c
+++ b/src/qs8-vadd/gen/minmax-avx-mul16-ld64-x24.c
@@ -26,7 +26,6 @@
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_hi);
const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse4_mul16.b_multiplier_lo);
const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.b_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul16.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse4_mul16.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul16.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul16.output_min);
@@ -84,12 +83,12 @@
vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_unpacklo_epi16(vbprodGHIJKLMNlo, vbprodGHIJKLMNhi));
vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_unpackhi_epi16(vbprodGHIJKLMNlo, vbprodGHIJKLMNhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -134,8 +133,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-avx-mul16-ld64-x32.c b/src/qs8-vadd/gen/minmax-avx-mul16-ld64-x32.c
index e765771..9e99933 100644
--- a/src/qs8-vadd/gen/minmax-avx-mul16-ld64-x32.c
+++ b/src/qs8-vadd/gen/minmax-avx-mul16-ld64-x32.c
@@ -26,7 +26,6 @@
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_hi);
const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse4_mul16.b_multiplier_lo);
const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.b_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul16.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse4_mul16.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul16.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul16.output_min);
@@ -98,14 +97,14 @@
vaccOPQR = _mm_add_epi32(vaccOPQR, _mm_unpacklo_epi16(vbprodOPQRSTUVlo, vbprodOPQRSTUVhi));
vaccSTUV = _mm_add_epi32(vaccSTUV, _mm_unpackhi_epi16(vbprodOPQRSTUVlo, vbprodOPQRSTUVhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
- vaccOPQR = _mm_sra_epi32(_mm_add_epi32(vaccOPQR, vrounding), vshift);
- vaccSTUV = _mm_sra_epi32(_mm_add_epi32(vaccSTUV, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
+ vaccOPQR = _mm_sra_epi32(vaccOPQR, vshift);
+ vaccSTUV = _mm_sra_epi32(vaccSTUV, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -151,8 +150,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-avx-mul16-ld64-x8.c b/src/qs8-vadd/gen/minmax-avx-mul16-ld64-x8.c
index 1b9e90d..2391419 100644
--- a/src/qs8-vadd/gen/minmax-avx-mul16-ld64-x8.c
+++ b/src/qs8-vadd/gen/minmax-avx-mul16-ld64-x8.c
@@ -26,7 +26,6 @@
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_hi);
const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse4_mul16.b_multiplier_lo);
const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.b_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul16.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse4_mul16.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul16.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul16.output_min);
@@ -56,8 +55,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -94,8 +93,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-avx-mul32-ld32-x16.c b/src/qs8-vadd/gen/minmax-avx-mul32-ld32-x16.c
index 63ec6be..69f0a24 100644
--- a/src/qs8-vadd/gen/minmax-avx-mul32-ld32-x16.c
+++ b/src/qs8-vadd/gen/minmax-avx-mul32-ld32-x16.c
@@ -25,7 +25,6 @@
const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4_mul32.bias);
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
const __m128i vb_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.b_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -53,10 +52,10 @@
vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vb89AB, vb_multiplier));
vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vbCDEF, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -85,8 +84,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-avx-mul32-ld32-x24.c b/src/qs8-vadd/gen/minmax-avx-mul32-ld32-x24.c
index 1c00a98..ddacd87 100644
--- a/src/qs8-vadd/gen/minmax-avx-mul32-ld32-x24.c
+++ b/src/qs8-vadd/gen/minmax-avx-mul32-ld32-x24.c
@@ -25,7 +25,6 @@
const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4_mul32.bias);
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
const __m128i vb_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.b_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -61,12 +60,12 @@
vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_mullo_epi32(vbGHIJ, vb_multiplier));
vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_mullo_epi32(vbKLMN, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -100,8 +99,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-avx-mul32-ld32-x32.c b/src/qs8-vadd/gen/minmax-avx-mul32-ld32-x32.c
index 270e529..f9658bf 100644
--- a/src/qs8-vadd/gen/minmax-avx-mul32-ld32-x32.c
+++ b/src/qs8-vadd/gen/minmax-avx-mul32-ld32-x32.c
@@ -25,7 +25,6 @@
const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4_mul32.bias);
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
const __m128i vb_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.b_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -69,14 +68,14 @@
vaccOPQR = _mm_add_epi32(vaccOPQR, _mm_mullo_epi32(vbOPQR, vb_multiplier));
vaccSTUV = _mm_add_epi32(vaccSTUV, _mm_mullo_epi32(vbSTUV, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
- vaccOPQR = _mm_sra_epi32(_mm_add_epi32(vaccOPQR, vrounding), vshift);
- vaccSTUV = _mm_sra_epi32(_mm_add_epi32(vaccSTUV, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
+ vaccOPQR = _mm_sra_epi32(vaccOPQR, vshift);
+ vaccSTUV = _mm_sra_epi32(vaccSTUV, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -111,8 +110,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-avx-mul32-ld32-x8.c b/src/qs8-vadd/gen/minmax-avx-mul32-ld32-x8.c
index 4088dae..d5dd4ec 100644
--- a/src/qs8-vadd/gen/minmax-avx-mul32-ld32-x8.c
+++ b/src/qs8-vadd/gen/minmax-avx-mul32-ld32-x8.c
@@ -25,7 +25,6 @@
const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4_mul32.bias);
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
const __m128i vb_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.b_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -45,8 +44,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -72,8 +71,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-avx2-mul32-ld64-x16.c b/src/qs8-vadd/gen/minmax-avx2-mul32-ld64-x16.c
index 2e189d1..680a1ff 100644
--- a/src/qs8-vadd/gen/minmax-avx2-mul32-ld64-x16.c
+++ b/src/qs8-vadd/gen/minmax-avx2-mul32-ld64-x16.c
@@ -25,7 +25,6 @@
const __m256i vbias = _mm256_load_si256((const __m256i*) params->avx2.bias);
const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
const __m256i vb_multiplier = _mm256_load_si256((const __m256i*) params->avx2.b_multiplier);
- const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx2.shift);
const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
@@ -45,8 +44,8 @@
vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vb01234567, vb_multiplier));
vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vb89ABCDEF, vb_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
- vacc89ABCDEF = _mm256_sra_epi32(_mm256_add_epi32(vacc89ABCDEF, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
+ vacc89ABCDEF = _mm256_sra_epi32(vacc89ABCDEF, vshift);
__m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
@@ -70,7 +69,7 @@
vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vb01234567, vb_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), _mm256_castsi256_si128(voutput_zero_point));
__m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
diff --git a/src/qs8-vadd/gen/minmax-avx2-mul32-ld64-x24.c b/src/qs8-vadd/gen/minmax-avx2-mul32-ld64-x24.c
index 983dcbe..4964720 100644
--- a/src/qs8-vadd/gen/minmax-avx2-mul32-ld64-x24.c
+++ b/src/qs8-vadd/gen/minmax-avx2-mul32-ld64-x24.c
@@ -25,7 +25,6 @@
const __m256i vbias = _mm256_load_si256((const __m256i*) params->avx2.bias);
const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
const __m256i vb_multiplier = _mm256_load_si256((const __m256i*) params->avx2.b_multiplier);
- const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx2.shift);
const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
@@ -49,9 +48,9 @@
vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vb89ABCDEF, vb_multiplier));
vaccGHIJKLMN = _mm256_add_epi32(vaccGHIJKLMN, _mm256_mullo_epi32(vbGHIJKLMN, vb_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
- vacc89ABCDEF = _mm256_sra_epi32(_mm256_add_epi32(vacc89ABCDEF, vrounding), vshift);
- vaccGHIJKLMN = _mm256_sra_epi32(_mm256_add_epi32(vaccGHIJKLMN, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
+ vacc89ABCDEF = _mm256_sra_epi32(vacc89ABCDEF, vshift);
+ vaccGHIJKLMN = _mm256_sra_epi32(vaccGHIJKLMN, vshift);
__m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
__m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vaccGHIJKLMN), _mm256_extracti128_si256(vaccGHIJKLMN, 1)), _mm256_castsi256_si128(voutput_zero_point));
@@ -80,7 +79,7 @@
vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vb01234567, vb_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), _mm256_castsi256_si128(voutput_zero_point));
__m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
diff --git a/src/qs8-vadd/gen/minmax-avx2-mul32-ld64-x32.c b/src/qs8-vadd/gen/minmax-avx2-mul32-ld64-x32.c
index 2b7915d..39178d9 100644
--- a/src/qs8-vadd/gen/minmax-avx2-mul32-ld64-x32.c
+++ b/src/qs8-vadd/gen/minmax-avx2-mul32-ld64-x32.c
@@ -25,7 +25,6 @@
const __m256i vbias = _mm256_load_si256((const __m256i*) params->avx2.bias);
const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
const __m256i vb_multiplier = _mm256_load_si256((const __m256i*) params->avx2.b_multiplier);
- const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx2.shift);
const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
@@ -53,10 +52,10 @@
vaccGHIJKLMN = _mm256_add_epi32(vaccGHIJKLMN, _mm256_mullo_epi32(vbGHIJKLMN, vb_multiplier));
vaccOPQRSTUV = _mm256_add_epi32(vaccOPQRSTUV, _mm256_mullo_epi32(vbOPQRSTUV, vb_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
- vacc89ABCDEF = _mm256_sra_epi32(_mm256_add_epi32(vacc89ABCDEF, vrounding), vshift);
- vaccGHIJKLMN = _mm256_sra_epi32(_mm256_add_epi32(vaccGHIJKLMN, vrounding), vshift);
- vaccOPQRSTUV = _mm256_sra_epi32(_mm256_add_epi32(vaccOPQRSTUV, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
+ vacc89ABCDEF = _mm256_sra_epi32(vacc89ABCDEF, vshift);
+ vaccGHIJKLMN = _mm256_sra_epi32(vaccGHIJKLMN, vshift);
+ vaccOPQRSTUV = _mm256_sra_epi32(vaccOPQRSTUV, vshift);
__m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
__m256i voutGHIJOPQRKLMNSTUV = _mm256_adds_epi16(_mm256_packs_epi32(vaccGHIJKLMN, vaccOPQRSTUV), voutput_zero_point);
@@ -85,7 +84,7 @@
vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vb01234567, vb_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), _mm256_castsi256_si128(voutput_zero_point));
__m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
diff --git a/src/qs8-vadd/gen/minmax-avx2-mul32-ld64-x8.c b/src/qs8-vadd/gen/minmax-avx2-mul32-ld64-x8.c
index 44131af..e4f113e 100644
--- a/src/qs8-vadd/gen/minmax-avx2-mul32-ld64-x8.c
+++ b/src/qs8-vadd/gen/minmax-avx2-mul32-ld64-x8.c
@@ -25,7 +25,6 @@
const __m256i vbias = _mm256_load_si256((const __m256i*) params->avx2.bias);
const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
const __m256i vb_multiplier = _mm256_load_si256((const __m256i*) params->avx2.b_multiplier);
- const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
@@ -41,7 +40,7 @@
vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vb01234567, vb_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
@@ -63,7 +62,7 @@
vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vb01234567, vb_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
__m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
diff --git a/src/qs8-vadd/gen/minmax-avx512skx-mul32-ld128-x16.c b/src/qs8-vadd/gen/minmax-avx512skx-mul32-ld128-x16.c
index 304ead7..e276402 100644
--- a/src/qs8-vadd/gen/minmax-avx512skx-mul32-ld128-x16.c
+++ b/src/qs8-vadd/gen/minmax-avx512skx-mul32-ld128-x16.c
@@ -25,7 +25,6 @@
const __m512i vbias = _mm512_load_si512(params->avx512.bias);
const __m512i va_multiplier = _mm512_load_si512(params->avx512.a_multiplier);
const __m512i vb_multiplier = _mm512_load_si512(params->avx512.b_multiplier);
- const __m512i vrounding = _mm512_load_si512(params->avx512.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx512.shift);
const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx512.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx512.output_min);
@@ -41,7 +40,7 @@
vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vb0123456789ABCDEF, vb_multiplier));
- vacc0123456789ABCDEF = _mm512_sra_epi32(_mm512_add_epi32(vacc0123456789ABCDEF, vrounding), vshift);
+ vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
__m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), voutput_zero_point);
@@ -64,7 +63,7 @@
vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vb0123456789ABCDEF, vb_multiplier));
- vacc0123456789ABCDEF = _mm512_sra_epi32(_mm512_add_epi32(vacc0123456789ABCDEF, vrounding), vshift);
+ vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
__m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), voutput_zero_point);
__m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
diff --git a/src/qs8-vadd/gen/minmax-avx512skx-mul32-ld128-x32.c b/src/qs8-vadd/gen/minmax-avx512skx-mul32-ld128-x32.c
index 2cf410c..e194eee 100644
--- a/src/qs8-vadd/gen/minmax-avx512skx-mul32-ld128-x32.c
+++ b/src/qs8-vadd/gen/minmax-avx512skx-mul32-ld128-x32.c
@@ -25,7 +25,6 @@
const __m512i vbias = _mm512_load_si512(params->avx512.bias);
const __m512i va_multiplier = _mm512_load_si512(params->avx512.a_multiplier);
const __m512i vb_multiplier = _mm512_load_si512(params->avx512.b_multiplier);
- const __m512i vrounding = _mm512_load_si512(params->avx512.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx512.shift);
const __m512i voutput_zero_point = _mm512_load_si512(params->avx512.output_zero_point);
const __m256i voutput_min = _mm256_load_si256((const __m256i*) params->avx512.output_min);
@@ -45,8 +44,8 @@
vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vb0123456789ABCDEF, vb_multiplier));
vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vbGHIJKLMNOPQRSTUV, vb_multiplier));
- vacc0123456789ABCDEF = _mm512_sra_epi32(_mm512_add_epi32(vacc0123456789ABCDEF, vrounding), vshift);
- vaccGHIJKLMNOPQRSTUV = _mm512_sra_epi32(_mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, vrounding), vshift);
+ vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
+ vaccGHIJKLMNOPQRSTUV = _mm512_sra_epi32(vaccGHIJKLMNOPQRSTUV, vshift);
__m512i vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV = _mm512_adds_epi16(_mm512_packs_epi32(vacc0123456789ABCDEF, vaccGHIJKLMNOPQRSTUV), voutput_zero_point);
@@ -70,7 +69,7 @@
vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vb0123456789ABCDEF, vb_multiplier));
- vacc0123456789ABCDEF = _mm512_sra_epi32(_mm512_add_epi32(vacc0123456789ABCDEF, vrounding), vshift);
+ vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
__m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), _mm512_castsi512_si256(voutput_zero_point));
__m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
diff --git a/src/qs8-vadd/gen/minmax-scalar-x1.c b/src/qs8-vadd/gen/minmax-scalar-x1.c
index 1ba3eb9..6368e02 100644
--- a/src/qs8-vadd/gen/minmax-scalar-x1.c
+++ b/src/qs8-vadd/gen/minmax-scalar-x1.c
@@ -23,7 +23,6 @@
const int32_t vbias = params->scalar.bias;
const int32_t va_multiplier = params->scalar.a_multiplier;
const int32_t vb_multiplier = params->scalar.b_multiplier;
- const int32_t vrounding = params->scalar.rounding;
const uint32_t vshift = params->scalar.shift;
const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
@@ -34,7 +33,7 @@
const int32_t vb = *input_b++;
const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
- int32_t vout = asr_s32(vacc + vrounding, vshift);
+ int32_t vout = asr_s32(vacc, vshift);
vout = math_max_s32(vout, voutput_min_less_zero_point);
vout = math_min_s32(vout, voutput_max_less_zero_point);
*output++ = (int8_t) (vout + voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-scalar-x2.c b/src/qs8-vadd/gen/minmax-scalar-x2.c
index 9c4e1b1..4ee062d 100644
--- a/src/qs8-vadd/gen/minmax-scalar-x2.c
+++ b/src/qs8-vadd/gen/minmax-scalar-x2.c
@@ -23,7 +23,6 @@
const int32_t vbias = params->scalar.bias;
const int32_t va_multiplier = params->scalar.a_multiplier;
const int32_t vb_multiplier = params->scalar.b_multiplier;
- const int32_t vrounding = params->scalar.rounding;
const uint32_t vshift = params->scalar.shift;
const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
@@ -43,8 +42,8 @@
vacc0 += vb0 * vb_multiplier;
vacc1 += vb1 * vb_multiplier;
- int32_t vout0 = asr_s32(vacc0 + vrounding, vshift);
- int32_t vout1 = asr_s32(vacc1 + vrounding, vshift);
+ int32_t vout0 = asr_s32(vacc0, vshift);
+ int32_t vout1 = asr_s32(vacc1, vshift);
vout0 = math_max_s32(vout0, voutput_min_less_zero_point);
vout1 = math_max_s32(vout1, voutput_min_less_zero_point);
@@ -64,7 +63,7 @@
const int32_t vb = *input_b;
const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
- int32_t vout = asr_s32(vacc + vrounding, vshift);
+ int32_t vout = asr_s32(vacc, vshift);
vout = math_max_s32(vout, voutput_min_less_zero_point);
vout = math_min_s32(vout, voutput_max_less_zero_point);
*output++ = (int8_t) (vout + voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-scalar-x4.c b/src/qs8-vadd/gen/minmax-scalar-x4.c
index dfd231f..f45a809 100644
--- a/src/qs8-vadd/gen/minmax-scalar-x4.c
+++ b/src/qs8-vadd/gen/minmax-scalar-x4.c
@@ -23,7 +23,6 @@
const int32_t vbias = params->scalar.bias;
const int32_t va_multiplier = params->scalar.a_multiplier;
const int32_t vb_multiplier = params->scalar.b_multiplier;
- const int32_t vrounding = params->scalar.rounding;
const uint32_t vshift = params->scalar.shift;
const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
@@ -51,10 +50,10 @@
vacc2 += vb2 * vb_multiplier;
vacc3 += vb3 * vb_multiplier;
- int32_t vout0 = asr_s32(vacc0 + vrounding, vshift);
- int32_t vout1 = asr_s32(vacc1 + vrounding, vshift);
- int32_t vout2 = asr_s32(vacc2 + vrounding, vshift);
- int32_t vout3 = asr_s32(vacc3 + vrounding, vshift);
+ int32_t vout0 = asr_s32(vacc0, vshift);
+ int32_t vout1 = asr_s32(vacc1, vshift);
+ int32_t vout2 = asr_s32(vacc2, vshift);
+ int32_t vout3 = asr_s32(vacc3, vshift);
vout0 = math_max_s32(vout0, voutput_min_less_zero_point);
vout1 = math_max_s32(vout1, voutput_min_less_zero_point);
@@ -83,7 +82,7 @@
const int32_t vb = *input_b++;
const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
- int32_t vout = asr_s32(vacc + vrounding, vshift);
+ int32_t vout = asr_s32(vacc, vshift);
vout = math_max_s32(vout, voutput_min_less_zero_point);
vout = math_min_s32(vout, voutput_max_less_zero_point);
*output++ = (int8_t) (vout + voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x16.c b/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x16.c
index dd507da..c72e4a9 100644
--- a/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x16.c
+++ b/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x16.c
@@ -26,7 +26,6 @@
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_lo);
const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
@@ -74,10 +73,10 @@
vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vbprod89ABCDEFlo, vbprod89ABCDEFhi));
vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vbprod89ABCDEFlo, vbprod89ABCDEFhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -121,8 +120,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
diff --git a/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x24.c b/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x24.c
index f8107c2..678bb17 100644
--- a/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x24.c
+++ b/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x24.c
@@ -26,7 +26,6 @@
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_lo);
const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
@@ -90,12 +89,12 @@
vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_unpacklo_epi16(vbprodGHIJKLMNlo, vbprodGHIJKLMNhi));
vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_unpackhi_epi16(vbprodGHIJKLMNlo, vbprodGHIJKLMNhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -144,8 +143,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
diff --git a/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x32.c b/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x32.c
index 90473a2..8cf5229 100644
--- a/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x32.c
+++ b/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x32.c
@@ -26,7 +26,6 @@
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_lo);
const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
@@ -106,14 +105,14 @@
vaccOPQR = _mm_add_epi32(vaccOPQR, _mm_unpacklo_epi16(vbprodOPQRSTUVlo, vbprodOPQRSTUVhi));
vaccSTUV = _mm_add_epi32(vaccSTUV, _mm_unpackhi_epi16(vbprodOPQRSTUVlo, vbprodOPQRSTUVhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
- vaccOPQR = _mm_sra_epi32(_mm_add_epi32(vaccOPQR, vrounding), vshift);
- vaccSTUV = _mm_sra_epi32(_mm_add_epi32(vaccSTUV, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
+ vaccOPQR = _mm_sra_epi32(vaccOPQR, vshift);
+ vaccSTUV = _mm_sra_epi32(vaccSTUV, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -165,8 +164,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
diff --git a/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x8.c b/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x8.c
index f3eb883..807647c 100644
--- a/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x8.c
+++ b/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x8.c
@@ -26,7 +26,6 @@
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_lo);
const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
@@ -58,8 +57,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -98,8 +97,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
diff --git a/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x16.c b/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x16.c
index fc41ff4..381d7c9 100644
--- a/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x16.c
+++ b/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x16.c
@@ -26,7 +26,6 @@
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_hi);
const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse4_mul16.b_multiplier_lo);
const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.b_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul16.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse4_mul16.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul16.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul16.output_min);
@@ -70,10 +69,10 @@
vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vbprod89ABCDEFlo, vbprod89ABCDEFhi));
vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vbprod89ABCDEFlo, vbprod89ABCDEFhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -113,8 +112,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x24.c b/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x24.c
index 181401e..3dc173f 100644
--- a/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x24.c
+++ b/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x24.c
@@ -26,7 +26,6 @@
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_hi);
const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse4_mul16.b_multiplier_lo);
const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.b_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul16.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse4_mul16.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul16.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul16.output_min);
@@ -84,12 +83,12 @@
vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_unpacklo_epi16(vbprodGHIJKLMNlo, vbprodGHIJKLMNhi));
vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_unpackhi_epi16(vbprodGHIJKLMNlo, vbprodGHIJKLMNhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -134,8 +133,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x32.c b/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x32.c
index 3587ca3..72e58ac 100644
--- a/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x32.c
+++ b/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x32.c
@@ -26,7 +26,6 @@
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_hi);
const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse4_mul16.b_multiplier_lo);
const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.b_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul16.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse4_mul16.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul16.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul16.output_min);
@@ -98,14 +97,14 @@
vaccOPQR = _mm_add_epi32(vaccOPQR, _mm_unpacklo_epi16(vbprodOPQRSTUVlo, vbprodOPQRSTUVhi));
vaccSTUV = _mm_add_epi32(vaccSTUV, _mm_unpackhi_epi16(vbprodOPQRSTUVlo, vbprodOPQRSTUVhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
- vaccOPQR = _mm_sra_epi32(_mm_add_epi32(vaccOPQR, vrounding), vshift);
- vaccSTUV = _mm_sra_epi32(_mm_add_epi32(vaccSTUV, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
+ vaccOPQR = _mm_sra_epi32(vaccOPQR, vshift);
+ vaccSTUV = _mm_sra_epi32(vaccSTUV, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -151,8 +150,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x8.c b/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x8.c
index a0e0671..e3d3a34 100644
--- a/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x8.c
+++ b/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x8.c
@@ -26,7 +26,6 @@
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_hi);
const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse4_mul16.b_multiplier_lo);
const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.b_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul16.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse4_mul16.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul16.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul16.output_min);
@@ -56,8 +55,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -94,8 +93,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-sse41-mul32-ld32-x16.c b/src/qs8-vadd/gen/minmax-sse41-mul32-ld32-x16.c
index e984174..c9959f3 100644
--- a/src/qs8-vadd/gen/minmax-sse41-mul32-ld32-x16.c
+++ b/src/qs8-vadd/gen/minmax-sse41-mul32-ld32-x16.c
@@ -25,7 +25,6 @@
const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4_mul32.bias);
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
const __m128i vb_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.b_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -53,10 +52,10 @@
vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vb89AB, vb_multiplier));
vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vbCDEF, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -85,8 +84,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-sse41-mul32-ld32-x24.c b/src/qs8-vadd/gen/minmax-sse41-mul32-ld32-x24.c
index e272bf3..5977e28 100644
--- a/src/qs8-vadd/gen/minmax-sse41-mul32-ld32-x24.c
+++ b/src/qs8-vadd/gen/minmax-sse41-mul32-ld32-x24.c
@@ -25,7 +25,6 @@
const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4_mul32.bias);
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
const __m128i vb_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.b_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -61,12 +60,12 @@
vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_mullo_epi32(vbGHIJ, vb_multiplier));
vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_mullo_epi32(vbKLMN, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -100,8 +99,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-sse41-mul32-ld32-x32.c b/src/qs8-vadd/gen/minmax-sse41-mul32-ld32-x32.c
index 867efc1..7aa3855 100644
--- a/src/qs8-vadd/gen/minmax-sse41-mul32-ld32-x32.c
+++ b/src/qs8-vadd/gen/minmax-sse41-mul32-ld32-x32.c
@@ -25,7 +25,6 @@
const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4_mul32.bias);
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
const __m128i vb_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.b_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -69,14 +68,14 @@
vaccOPQR = _mm_add_epi32(vaccOPQR, _mm_mullo_epi32(vbOPQR, vb_multiplier));
vaccSTUV = _mm_add_epi32(vaccSTUV, _mm_mullo_epi32(vbSTUV, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
- vaccOPQR = _mm_sra_epi32(_mm_add_epi32(vaccOPQR, vrounding), vshift);
- vaccSTUV = _mm_sra_epi32(_mm_add_epi32(vaccSTUV, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
+ vaccOPQR = _mm_sra_epi32(vaccOPQR, vshift);
+ vaccSTUV = _mm_sra_epi32(vaccSTUV, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -111,8 +110,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-sse41-mul32-ld32-x8.c b/src/qs8-vadd/gen/minmax-sse41-mul32-ld32-x8.c
index 97f9661..cd6c4f7 100644
--- a/src/qs8-vadd/gen/minmax-sse41-mul32-ld32-x8.c
+++ b/src/qs8-vadd/gen/minmax-sse41-mul32-ld32-x8.c
@@ -25,7 +25,6 @@
const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4_mul32.bias);
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
const __m128i vb_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.b_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -45,8 +44,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -72,8 +71,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-wasmsimd-x16.c b/src/qs8-vadd/gen/minmax-wasmsimd-x16.c
index a021ca1..05660d7 100644
--- a/src/qs8-vadd/gen/minmax-wasmsimd-x16.c
+++ b/src/qs8-vadd/gen/minmax-wasmsimd-x16.c
@@ -24,7 +24,6 @@
const v128_t vbias = wasm_v128_load(params->wasmsimd.bias);
const v128_t va_multiplier = wasm_v128_load(params->wasmsimd.a_multiplier);
const v128_t vb_multiplier = wasm_v128_load(params->wasmsimd.b_multiplier);
- const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding);
const int32_t vshift = params->wasmsimd.shift;
const v128_t voutput_zero_point = wasm_v128_load(params->wasmsimd.output_zero_point);
const v128_t voutput_min = wasm_v128_load(params->wasmsimd.output_min);
@@ -48,10 +47,10 @@
vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(vb89ABCDEF), vb_multiplier));
vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(vb89ABCDEF), vb_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
- vacc89AB = wasm_i32x4_shr(wasm_i32x4_add(vacc89AB, vrounding), vshift);
- vaccCDEF = wasm_i32x4_shr(wasm_i32x4_add(vaccCDEF, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
+ vacc89AB = wasm_i32x4_shr(vacc89AB, vshift);
+ vaccCDEF = wasm_i32x4_shr(vaccCDEF, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
v128_t vout89ABCDEF = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -78,8 +77,8 @@
vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(vb01234567), vb_multiplier));
vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(vb01234567), vb_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-wasmsimd-x24.c b/src/qs8-vadd/gen/minmax-wasmsimd-x24.c
index 66ae0ce..1c86b41 100644
--- a/src/qs8-vadd/gen/minmax-wasmsimd-x24.c
+++ b/src/qs8-vadd/gen/minmax-wasmsimd-x24.c
@@ -24,7 +24,6 @@
const v128_t vbias = wasm_v128_load(params->wasmsimd.bias);
const v128_t va_multiplier = wasm_v128_load(params->wasmsimd.a_multiplier);
const v128_t vb_multiplier = wasm_v128_load(params->wasmsimd.b_multiplier);
- const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding);
const int32_t vshift = params->wasmsimd.shift;
const v128_t voutput_zero_point = wasm_v128_load(params->wasmsimd.output_zero_point);
const v128_t voutput_min = wasm_v128_load(params->wasmsimd.output_min);
@@ -54,12 +53,12 @@
vaccGHIJ = wasm_i32x4_add(vaccGHIJ, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(vbGHIJKLMN), vb_multiplier));
vaccKLMN = wasm_i32x4_add(vaccKLMN, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(vbGHIJKLMN), vb_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
- vacc89AB = wasm_i32x4_shr(wasm_i32x4_add(vacc89AB, vrounding), vshift);
- vaccCDEF = wasm_i32x4_shr(wasm_i32x4_add(vaccCDEF, vrounding), vshift);
- vaccGHIJ = wasm_i32x4_shr(wasm_i32x4_add(vaccGHIJ, vrounding), vshift);
- vaccKLMN = wasm_i32x4_shr(wasm_i32x4_add(vaccKLMN, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
+ vacc89AB = wasm_i32x4_shr(vacc89AB, vshift);
+ vaccCDEF = wasm_i32x4_shr(vaccCDEF, vshift);
+ vaccGHIJ = wasm_i32x4_shr(vaccGHIJ, vshift);
+ vaccKLMN = wasm_i32x4_shr(vaccKLMN, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
v128_t vout89ABCDEF = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -91,8 +90,8 @@
vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(vb01234567), vb_multiplier));
vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(vb01234567), vb_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-wasmsimd-x32.c b/src/qs8-vadd/gen/minmax-wasmsimd-x32.c
index 81cfb2c..51b4906 100644
--- a/src/qs8-vadd/gen/minmax-wasmsimd-x32.c
+++ b/src/qs8-vadd/gen/minmax-wasmsimd-x32.c
@@ -24,7 +24,6 @@
const v128_t vbias = wasm_v128_load(params->wasmsimd.bias);
const v128_t va_multiplier = wasm_v128_load(params->wasmsimd.a_multiplier);
const v128_t vb_multiplier = wasm_v128_load(params->wasmsimd.b_multiplier);
- const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding);
const int32_t vshift = params->wasmsimd.shift;
const v128_t voutput_zero_point = wasm_v128_load(params->wasmsimd.output_zero_point);
const v128_t voutput_min = wasm_v128_load(params->wasmsimd.output_min);
@@ -60,14 +59,14 @@
vaccOPQR = wasm_i32x4_add(vaccOPQR, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(vbOPQRSTUV), vb_multiplier));
vaccSTUV = wasm_i32x4_add(vaccSTUV, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(vbOPQRSTUV), vb_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
- vacc89AB = wasm_i32x4_shr(wasm_i32x4_add(vacc89AB, vrounding), vshift);
- vaccCDEF = wasm_i32x4_shr(wasm_i32x4_add(vaccCDEF, vrounding), vshift);
- vaccGHIJ = wasm_i32x4_shr(wasm_i32x4_add(vaccGHIJ, vrounding), vshift);
- vaccKLMN = wasm_i32x4_shr(wasm_i32x4_add(vaccKLMN, vrounding), vshift);
- vaccOPQR = wasm_i32x4_shr(wasm_i32x4_add(vaccOPQR, vrounding), vshift);
- vaccSTUV = wasm_i32x4_shr(wasm_i32x4_add(vaccSTUV, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
+ vacc89AB = wasm_i32x4_shr(vacc89AB, vshift);
+ vaccCDEF = wasm_i32x4_shr(vaccCDEF, vshift);
+ vaccGHIJ = wasm_i32x4_shr(vaccGHIJ, vshift);
+ vaccKLMN = wasm_i32x4_shr(vaccKLMN, vshift);
+ vaccOPQR = wasm_i32x4_shr(vaccOPQR, vshift);
+ vaccSTUV = wasm_i32x4_shr(vaccSTUV, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
v128_t vout89ABCDEF = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -100,8 +99,8 @@
vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(vb01234567), vb_multiplier));
vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(vb01234567), vb_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-wasmsimd-x8.c b/src/qs8-vadd/gen/minmax-wasmsimd-x8.c
index 72783d9..2fd29e2 100644
--- a/src/qs8-vadd/gen/minmax-wasmsimd-x8.c
+++ b/src/qs8-vadd/gen/minmax-wasmsimd-x8.c
@@ -24,7 +24,6 @@
const v128_t vbias = wasm_v128_load(params->wasmsimd.bias);
const v128_t va_multiplier = wasm_v128_load(params->wasmsimd.a_multiplier);
const v128_t vb_multiplier = wasm_v128_load(params->wasmsimd.b_multiplier);
- const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding);
const int32_t vshift = params->wasmsimd.shift;
const v128_t voutput_zero_point = wasm_v128_load(params->wasmsimd.output_zero_point);
const v128_t voutput_min = wasm_v128_load(params->wasmsimd.output_min);
@@ -42,8 +41,8 @@
vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(vb01234567), vb_multiplier));
vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(vb01234567), vb_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
@@ -67,8 +66,8 @@
vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(vb01234567), vb_multiplier));
vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(vb01234567), vb_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-xop-mul32-ld32-x16.c b/src/qs8-vadd/gen/minmax-xop-mul32-ld32-x16.c
index 9004722..3fd41d6 100644
--- a/src/qs8-vadd/gen/minmax-xop-mul32-ld32-x16.c
+++ b/src/qs8-vadd/gen/minmax-xop-mul32-ld32-x16.c
@@ -30,7 +30,6 @@
const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4_mul32.bias);
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
const __m128i vb_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.b_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -58,10 +57,10 @@
vacc89AB = _mm_macc_epi32(vb89AB, vb_multiplier, vacc89AB);
vaccCDEF = _mm_macc_epi32(vbCDEF, vb_multiplier, vaccCDEF);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -90,8 +89,8 @@
vacc0123 = _mm_macc_epi32(vb0123, vb_multiplier, vacc0123);
vacc4567 = _mm_macc_epi32(vb4567, vb_multiplier, vacc4567);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-xop-mul32-ld32-x24.c b/src/qs8-vadd/gen/minmax-xop-mul32-ld32-x24.c
index c12b84f..9f45091 100644
--- a/src/qs8-vadd/gen/minmax-xop-mul32-ld32-x24.c
+++ b/src/qs8-vadd/gen/minmax-xop-mul32-ld32-x24.c
@@ -30,7 +30,6 @@
const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4_mul32.bias);
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
const __m128i vb_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.b_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -66,12 +65,12 @@
vaccGHIJ = _mm_macc_epi32(vbGHIJ, vb_multiplier, vaccGHIJ);
vaccKLMN = _mm_macc_epi32(vbKLMN, vb_multiplier, vaccKLMN);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -105,8 +104,8 @@
vacc0123 = _mm_macc_epi32(vb0123, vb_multiplier, vacc0123);
vacc4567 = _mm_macc_epi32(vb4567, vb_multiplier, vacc4567);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-xop-mul32-ld32-x32.c b/src/qs8-vadd/gen/minmax-xop-mul32-ld32-x32.c
index 4e07a59..89b8cc5 100644
--- a/src/qs8-vadd/gen/minmax-xop-mul32-ld32-x32.c
+++ b/src/qs8-vadd/gen/minmax-xop-mul32-ld32-x32.c
@@ -30,7 +30,6 @@
const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4_mul32.bias);
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
const __m128i vb_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.b_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -74,14 +73,14 @@
vaccOPQR = _mm_macc_epi32(vbOPQR, vb_multiplier, vaccOPQR);
vaccSTUV = _mm_macc_epi32(vbSTUV, vb_multiplier, vaccSTUV);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
- vaccOPQR = _mm_sra_epi32(_mm_add_epi32(vaccOPQR, vrounding), vshift);
- vaccSTUV = _mm_sra_epi32(_mm_add_epi32(vaccSTUV, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
+ vaccOPQR = _mm_sra_epi32(vaccOPQR, vshift);
+ vaccSTUV = _mm_sra_epi32(vaccSTUV, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -116,8 +115,8 @@
vacc0123 = _mm_macc_epi32(vb0123, vb_multiplier, vacc0123);
vacc4567 = _mm_macc_epi32(vb4567, vb_multiplier, vacc4567);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/gen/minmax-xop-mul32-ld32-x8.c b/src/qs8-vadd/gen/minmax-xop-mul32-ld32-x8.c
index 10dffce..15babf7 100644
--- a/src/qs8-vadd/gen/minmax-xop-mul32-ld32-x8.c
+++ b/src/qs8-vadd/gen/minmax-xop-mul32-ld32-x8.c
@@ -30,7 +30,6 @@
const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4_mul32.bias);
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
const __m128i vb_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.b_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -50,8 +49,8 @@
vacc0123 = _mm_macc_epi32(vb0123, vb_multiplier, vacc0123);
vacc4567 = _mm_macc_epi32(vb4567, vb_multiplier, vacc4567);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -77,8 +76,8 @@
vacc0123 = _mm_macc_epi32(vb0123, vb_multiplier, vacc0123);
vacc4567 = _mm_macc_epi32(vb4567, vb_multiplier, vacc4567);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vadd/scalar.c.in b/src/qs8-vadd/scalar.c.in
index 5d092b8..59326ec 100644
--- a/src/qs8-vadd/scalar.c.in
+++ b/src/qs8-vadd/scalar.c.in
@@ -22,7 +22,6 @@
const int32_t vbias = params->scalar.bias;
const int32_t va_multiplier = params->scalar.a_multiplier;
const int32_t vb_multiplier = params->scalar.b_multiplier;
- const int32_t vrounding = params->scalar.rounding;
const uint32_t vshift = params->scalar.shift;
const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
@@ -34,7 +33,7 @@
const int32_t vb = *input_b++;
const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
- int32_t vout = asr_s32(vacc + vrounding, vshift);
+ int32_t vout = asr_s32(vacc, vshift);
vout = math_max_s32(vout, voutput_min_less_zero_point);
vout = math_min_s32(vout, voutput_max_less_zero_point);
*output++ = (${XINT8_T}) (vout + voutput_zero_point);
@@ -56,7 +55,7 @@
vacc${N} += vb${N} * vb_multiplier;
$for N in range(BATCH_TILE):
- int32_t vout${N} = asr_s32(vacc${N} + vrounding, vshift);
+ int32_t vout${N} = asr_s32(vacc${N}, vshift);
$for N in range(BATCH_TILE):
vout${N} = math_max_s32(vout${N}, voutput_min_less_zero_point);
@@ -77,7 +76,7 @@
const int32_t vb = *input_b;
const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
- int32_t vout = asr_s32(vacc + vrounding, vshift);
+ int32_t vout = asr_s32(vacc, vshift);
vout = math_max_s32(vout, voutput_min_less_zero_point);
vout = math_min_s32(vout, voutput_max_less_zero_point);
*output++ = (${XINT8_T}) (vout + voutput_zero_point);
@@ -87,7 +86,7 @@
const int32_t vb = *input_b++;
const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
- int32_t vout = asr_s32(vacc + vrounding, vshift);
+ int32_t vout = asr_s32(vacc, vshift);
vout = math_max_s32(vout, voutput_min_less_zero_point);
vout = math_min_s32(vout, voutput_max_less_zero_point);
*output++ = (${XINT8_T}) (vout + voutput_zero_point);
diff --git a/src/qs8-vadd/sse-mul16-ld64.c.in b/src/qs8-vadd/sse-mul16-ld64.c.in
index bf898ff..fce1ec2 100644
--- a/src/qs8-vadd/sse-mul16-ld64.c.in
+++ b/src/qs8-vadd/sse-mul16-ld64.c.in
@@ -36,7 +36,6 @@
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.a_multiplier_hi);
const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.b_multiplier_lo);
const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.b_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->${PARAMS_STRUCT}.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min);
@@ -93,7 +92,7 @@
vacc${ABC[N+4:N+8]} = _mm_add_epi32(vacc${ABC[N+4:N+8]}, _mm_unpackhi_epi16(vbprod${ABC[N:N+8]}lo, vbprod${ABC[N:N+8]}hi));
$for N in range(0, BATCH_TILE, 4):
- vacc${ABC[N:N+4]} = _mm_sra_epi32(_mm_add_epi32(vacc${ABC[N:N+4]}, vrounding), vshift);
+ vacc${ABC[N:N+4]} = _mm_sra_epi32(vacc${ABC[N:N+4]}, vshift);
$for N in range(0, BATCH_TILE, 8):
__m128i vout${ABC[N:N+8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[N:N+4]}, vacc${ABC[N+4:N+8]}), voutput_zero_point);
@@ -174,8 +173,8 @@
vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_unpacklo_epi16(vbprod${ABC[0:8]}lo, vbprod${ABC[0:8]}hi));
vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, _mm_unpackhi_epi16(vbprod${ABC[0:8]}lo, vbprod${ABC[0:8]}hi));
- vacc${ABC[0:4]} = _mm_sra_epi32(_mm_add_epi32(vacc${ABC[0:4]}, vrounding), vshift);
- vacc${ABC[4:8]} = _mm_sra_epi32(_mm_add_epi32(vacc${ABC[4:8]}, vrounding), vshift);
+ vacc${ABC[0:4]} = _mm_sra_epi32(vacc${ABC[0:4]}, vshift);
+ vacc${ABC[4:8]} = _mm_sra_epi32(vacc${ABC[4:8]}, vshift);
__m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[0:4]}, vacc${ABC[4:8]}), voutput_zero_point);
$if DATATYPE == "QS8" and SSE < 4:
diff --git a/src/qs8-vadd/sse-mul32-ld32.c.in b/src/qs8-vadd/sse-mul32-ld32.c.in
index b8019cb..f801ea5 100644
--- a/src/qs8-vadd/sse-mul32-ld32.c.in
+++ b/src/qs8-vadd/sse-mul32-ld32.c.in
@@ -42,7 +42,6 @@
const __m128i vbias = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.bias);
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.a_multiplier);
const __m128i vb_multiplier = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.b_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.rounding);
const __m128i vshift = _mm_loadu_si32(params->${PARAMS_STRUCT}.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min);
@@ -71,7 +70,7 @@
vacc${ABC[N:N+4]} = _mm_add_epi32(vacc${ABC[N:N+4]}, _mm_mullo_epi32(vb${ABC[N:N+4]}, vb_multiplier));
$for N in range(0, BATCH_TILE, 4):
- vacc${ABC[N:N+4]} = _mm_sra_epi32(_mm_add_epi32(vacc${ABC[N:N+4]}, vrounding), vshift);
+ vacc${ABC[N:N+4]} = _mm_sra_epi32(vacc${ABC[N:N+4]}, vshift);
$for N in range(0, BATCH_TILE, 8):
const __m128i vout${ABC[N:N+8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[N:N+4]}, vacc${ABC[N+4:N+8]}), voutput_zero_point);
@@ -128,8 +127,8 @@
vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_mullo_epi32(vb${ABC[0:4]}, vb_multiplier));
vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, _mm_mullo_epi32(vb${ABC[4:8]}, vb_multiplier));
- vacc${ABC[0:4]} = _mm_sra_epi32(_mm_add_epi32(vacc${ABC[0:4]}, vrounding), vshift);
- vacc${ABC[4:8]} = _mm_sra_epi32(_mm_add_epi32(vacc${ABC[4:8]}, vrounding), vshift);
+ vacc${ABC[0:4]} = _mm_sra_epi32(vacc${ABC[0:4]}, vshift);
+ vacc${ABC[4:8]} = _mm_sra_epi32(vacc${ABC[4:8]}, vshift);
const __m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[0:4]}, vacc${ABC[4:8]}), voutput_zero_point);
diff --git a/src/qs8-vadd/wasmsimd.c.in b/src/qs8-vadd/wasmsimd.c.in
index 9c3b495..27ec5a1 100644
--- a/src/qs8-vadd/wasmsimd.c.in
+++ b/src/qs8-vadd/wasmsimd.c.in
@@ -31,7 +31,6 @@
const v128_t vbias = wasm_v128_load(params->wasmsimd.bias);
const v128_t va_multiplier = wasm_v128_load(params->wasmsimd.a_multiplier);
const v128_t vb_multiplier = wasm_v128_load(params->wasmsimd.b_multiplier);
- const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding);
const int32_t vshift = params->wasmsimd.shift;
const v128_t voutput_zero_point = wasm_v128_load(params->wasmsimd.output_zero_point);
const v128_t voutput_min = wasm_v128_load(params->wasmsimd.output_min);
@@ -55,7 +54,7 @@
vacc${ABC[N+4:N+8]} = wasm_i32x4_add(vacc${ABC[N+4:N+8]}, wasm_i32x4_mul(${WASM_X32X4_EXTEND_HIGH_X16X8}(vb${ABC[N:N+8]}), vb_multiplier));
$for N in range(0, BATCH_TILE, 4):
- vacc${ABC[N:N+4]} = wasm_i32x4_shr(wasm_i32x4_add(vacc${ABC[N:N+4]}, vrounding), vshift);
+ vacc${ABC[N:N+4]} = wasm_i32x4_shr(vacc${ABC[N:N+4]}, vshift);
$for N in range(0, BATCH_TILE, 8):
v128_t vout${ABC[N:N+8]} = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc${ABC[N:N+4]}, vacc${ABC[N+4:N+8]}), voutput_zero_point);
@@ -103,8 +102,8 @@
vacc${ABC[0:4]} = wasm_i32x4_add(vacc${ABC[0:4]}, wasm_i32x4_mul(${WASM_X32X4_EXTEND_LOW_X16X8}(vb${ABC[0:8]}), vb_multiplier));
vacc${ABC[4:8]} = wasm_i32x4_add(vacc${ABC[4:8]}, wasm_i32x4_mul(${WASM_X32X4_EXTEND_HIGH_X16X8}(vb${ABC[0:8]}), vb_multiplier));
- vacc${ABC[0:4]} = wasm_i32x4_shr(wasm_i32x4_add(vacc${ABC[0:4]}, vrounding), vshift);
- vacc${ABC[4:8]} = wasm_i32x4_shr(wasm_i32x4_add(vacc${ABC[4:8]}, vrounding), vshift);
+ vacc${ABC[0:4]} = wasm_i32x4_shr(vacc${ABC[0:4]}, vshift);
+ vacc${ABC[4:8]} = wasm_i32x4_shr(vacc${ABC[4:8]}, vshift);
v128_t vout${ABC[0:8]} = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc${ABC[0:4]}, vacc${ABC[4:8]}), voutput_zero_point);
diff --git a/src/qs8-vaddc/avx2-mul32-ld64.c.in b/src/qs8-vaddc/avx2-mul32-ld64.c.in
index d880d59..2ee7012 100644
--- a/src/qs8-vaddc/avx2-mul32-ld64.c.in
+++ b/src/qs8-vaddc/avx2-mul32-ld64.c.in
@@ -28,7 +28,6 @@
const union xnn_${DATATYPE.lower()}_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
- const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx2.shift);
$if BATCH_TILE > 8:
const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
@@ -50,7 +49,7 @@
__m256i vacc${ABC[N:N+8]} = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va${ABC[N:N+8]}, va_multiplier));
$for N in range(0, BATCH_TILE, 8):
- vacc${ABC[N:N+8]} = _mm256_sra_epi32(_mm256_add_epi32(vacc${ABC[N:N+8]}, vrounding), vshift);
+ vacc${ABC[N:N+8]} = _mm256_sra_epi32(vacc${ABC[N:N+8]}, vshift);
$for N in range(0, BATCH_TILE, 16):
$if N + 8 < BATCH_TILE:
@@ -97,7 +96,7 @@
__m256i vacc${ABC[0:8]} = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va${ABC[0:8]}, va_multiplier));
- vacc${ABC[0:8]} = _mm256_sra_epi32(_mm256_add_epi32(vacc${ABC[0:8]}, vrounding), vshift);
+ vacc${ABC[0:8]} = _mm256_sra_epi32(vacc${ABC[0:8]}, vshift);
$if BATCH_TILE > 8:
__m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc${ABC[0:8]}), _mm256_extracti128_si256(vacc${ABC[0:8]}, 1)), _mm256_castsi256_si128(voutput_zero_point));
diff --git a/src/qs8-vaddc/avx512skx-mul32-ld128.c.in b/src/qs8-vaddc/avx512skx-mul32-ld128.c.in
index 6095515..0726468 100644
--- a/src/qs8-vaddc/avx512skx-mul32-ld128.c.in
+++ b/src/qs8-vaddc/avx512skx-mul32-ld128.c.in
@@ -31,7 +31,6 @@
const union xnn_${DATATYPE.lower()}_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m512i va_multiplier = _mm512_load_si512(params->avx512.a_multiplier);
- const __m512i vrounding = _mm512_load_si512(params->avx512.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx512.shift);
$if BATCH_TILE > 16:
const __m512i voutput_zero_point = _mm512_load_si512(params->avx512.output_zero_point);
@@ -55,7 +54,7 @@
__m512i vacc${ABC[N:N+16]} = _mm512_add_epi32(vbias, _mm512_mullo_epi32(va${ABC[N:N+16]}, va_multiplier));
$for N in range(0, BATCH_TILE, 16):
- vacc${ABC[N:N+16]} = _mm512_sra_epi32(_mm512_add_epi32(vacc${ABC[N:N+16]}, vrounding), vshift);
+ vacc${ABC[N:N+16]} = _mm512_sra_epi32(vacc${ABC[N:N+16]}, vshift);
$for N in range(0, BATCH_TILE, 32):
$if N + 16 < BATCH_TILE:
@@ -109,7 +108,7 @@
__m512i vacc${ABC[0:16]} = _mm512_add_epi32(vbias, _mm512_mullo_epi32(va${ABC[0:16]}, va_multiplier));
- vacc${ABC[0:16]} = _mm512_sra_epi32(_mm512_add_epi32(vacc${ABC[0:16]}, vrounding), vshift);
+ vacc${ABC[0:16]} = _mm512_sra_epi32(vacc${ABC[0:16]}, vshift);
$if BATCH_TILE > 16:
__m256i vout${ABC[0:4]}${ABC[8:12]}${ABC[4:8]}${ABC[12:16]} = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc${ABC[0:16]}), _mm512_extracti32x8_epi32(vacc${ABC[0:16]}, 1)), _mm512_castsi512_si256(voutput_zero_point));
diff --git a/src/qs8-vaddc/gen/minmax-avx-mul16-ld64-x16.c b/src/qs8-vaddc/gen/minmax-avx-mul16-ld64-x16.c
index 30c9307..c27da55 100644
--- a/src/qs8-vaddc/gen/minmax-avx-mul16-ld64-x16.c
+++ b/src/qs8-vaddc/gen/minmax-avx-mul16-ld64-x16.c
@@ -26,7 +26,6 @@
_mm_load_si128((const __m128i*) params->sse4_mul16.bias));
const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_lo);
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul16.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse4_mul16.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul16.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul16.output_min);
@@ -54,10 +53,10 @@
__m128i vacc89AB = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod89ABCDEFlo, vaprod89ABCDEFhi));
__m128i vaccCDEF = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod89ABCDEFlo, vaprod89ABCDEFhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -88,8 +87,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-avx-mul16-ld64-x24.c b/src/qs8-vaddc/gen/minmax-avx-mul16-ld64-x24.c
index 5da4de0..9301793 100644
--- a/src/qs8-vaddc/gen/minmax-avx-mul16-ld64-x24.c
+++ b/src/qs8-vaddc/gen/minmax-avx-mul16-ld64-x24.c
@@ -26,7 +26,6 @@
_mm_load_si128((const __m128i*) params->sse4_mul16.bias));
const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_lo);
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul16.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse4_mul16.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul16.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul16.output_min);
@@ -61,12 +60,12 @@
__m128i vaccGHIJ = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprodGHIJKLMNlo, vaprodGHIJKLMNhi));
__m128i vaccKLMN = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprodGHIJKLMNlo, vaprodGHIJKLMNhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -102,8 +101,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-avx-mul16-ld64-x32.c b/src/qs8-vaddc/gen/minmax-avx-mul16-ld64-x32.c
index 2af9bab..2b57a0e 100644
--- a/src/qs8-vaddc/gen/minmax-avx-mul16-ld64-x32.c
+++ b/src/qs8-vaddc/gen/minmax-avx-mul16-ld64-x32.c
@@ -26,7 +26,6 @@
_mm_load_si128((const __m128i*) params->sse4_mul16.bias));
const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_lo);
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul16.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse4_mul16.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul16.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul16.output_min);
@@ -68,14 +67,14 @@
__m128i vaccOPQR = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprodOPQRSTUVlo, vaprodOPQRSTUVhi));
__m128i vaccSTUV = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprodOPQRSTUVlo, vaprodOPQRSTUVhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
- vaccOPQR = _mm_sra_epi32(_mm_add_epi32(vaccOPQR, vrounding), vshift);
- vaccSTUV = _mm_sra_epi32(_mm_add_epi32(vaccSTUV, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
+ vaccOPQR = _mm_sra_epi32(vaccOPQR, vshift);
+ vaccSTUV = _mm_sra_epi32(vaccSTUV, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -112,8 +111,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-avx-mul16-ld64-x8.c b/src/qs8-vaddc/gen/minmax-avx-mul16-ld64-x8.c
index dcee41c..ae42ecb 100644
--- a/src/qs8-vaddc/gen/minmax-avx-mul16-ld64-x8.c
+++ b/src/qs8-vaddc/gen/minmax-avx-mul16-ld64-x8.c
@@ -26,7 +26,6 @@
_mm_load_si128((const __m128i*) params->sse4_mul16.bias));
const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_lo);
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul16.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse4_mul16.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul16.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul16.output_min);
@@ -47,8 +46,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -77,8 +76,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x16.c b/src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x16.c
index b6a9ef9..db288cc 100644
--- a/src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x16.c
+++ b/src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x16.c
@@ -23,7 +23,6 @@
const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -45,10 +44,10 @@
__m128i vacc89AB = _mm_add_epi32(vbias, _mm_mullo_epi32(va89AB, va_multiplier));
__m128i vaccCDEF = _mm_add_epi32(vbias, _mm_mullo_epi32(vaCDEF, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -71,8 +70,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x24.c b/src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x24.c
index e20cf5a..ddbcb75 100644
--- a/src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x24.c
+++ b/src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x24.c
@@ -23,7 +23,6 @@
const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -49,12 +48,12 @@
__m128i vaccGHIJ = _mm_add_epi32(vbias, _mm_mullo_epi32(vaGHIJ, va_multiplier));
__m128i vaccKLMN = _mm_add_epi32(vbias, _mm_mullo_epi32(vaKLMN, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -82,8 +81,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x32.c b/src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x32.c
index 221a53a..8dfbea8 100644
--- a/src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x32.c
+++ b/src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x32.c
@@ -23,7 +23,6 @@
const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -53,14 +52,14 @@
__m128i vaccOPQR = _mm_add_epi32(vbias, _mm_mullo_epi32(vaOPQR, va_multiplier));
__m128i vaccSTUV = _mm_add_epi32(vbias, _mm_mullo_epi32(vaSTUV, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
- vaccOPQR = _mm_sra_epi32(_mm_add_epi32(vaccOPQR, vrounding), vshift);
- vaccSTUV = _mm_sra_epi32(_mm_add_epi32(vaccSTUV, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
+ vaccOPQR = _mm_sra_epi32(vaccOPQR, vshift);
+ vaccSTUV = _mm_sra_epi32(vaccSTUV, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -89,8 +88,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x8.c b/src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x8.c
index c1c652c..9b37d0a 100644
--- a/src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x8.c
+++ b/src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x8.c
@@ -23,7 +23,6 @@
const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -41,8 +40,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -63,8 +62,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x16.c b/src/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x16.c
index 6b7407b..2b84d52 100644
--- a/src/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x16.c
+++ b/src/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x16.c
@@ -23,7 +23,6 @@
const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
- const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx2.shift);
const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
@@ -40,8 +39,8 @@
__m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
__m256i vacc89ABCDEF = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va89ABCDEF, va_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
- vacc89ABCDEF = _mm256_sra_epi32(_mm256_add_epi32(vacc89ABCDEF, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
+ vacc89ABCDEF = _mm256_sra_epi32(vacc89ABCDEF, vshift);
__m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
@@ -61,7 +60,7 @@
__m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), _mm256_castsi256_si128(voutput_zero_point));
__m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
diff --git a/src/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x24.c b/src/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x24.c
index a71fcc1..5184996 100644
--- a/src/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x24.c
+++ b/src/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x24.c
@@ -23,7 +23,6 @@
const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
- const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx2.shift);
const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
@@ -42,9 +41,9 @@
__m256i vacc89ABCDEF = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va89ABCDEF, va_multiplier));
__m256i vaccGHIJKLMN = _mm256_add_epi32(vbias, _mm256_mullo_epi32(vaGHIJKLMN, va_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
- vacc89ABCDEF = _mm256_sra_epi32(_mm256_add_epi32(vacc89ABCDEF, vrounding), vshift);
- vaccGHIJKLMN = _mm256_sra_epi32(_mm256_add_epi32(vaccGHIJKLMN, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
+ vacc89ABCDEF = _mm256_sra_epi32(vacc89ABCDEF, vshift);
+ vaccGHIJKLMN = _mm256_sra_epi32(vaccGHIJKLMN, vshift);
__m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
__m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vaccGHIJKLMN), _mm256_extracti128_si256(vaccGHIJKLMN, 1)), _mm256_castsi256_si128(voutput_zero_point));
@@ -69,7 +68,7 @@
__m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), _mm256_castsi256_si128(voutput_zero_point));
__m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
diff --git a/src/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x32.c b/src/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x32.c
index c51c26b..ccdb822 100644
--- a/src/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x32.c
+++ b/src/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x32.c
@@ -23,7 +23,6 @@
const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
- const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx2.shift);
const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
@@ -44,10 +43,10 @@
__m256i vaccGHIJKLMN = _mm256_add_epi32(vbias, _mm256_mullo_epi32(vaGHIJKLMN, va_multiplier));
__m256i vaccOPQRSTUV = _mm256_add_epi32(vbias, _mm256_mullo_epi32(vaOPQRSTUV, va_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
- vacc89ABCDEF = _mm256_sra_epi32(_mm256_add_epi32(vacc89ABCDEF, vrounding), vshift);
- vaccGHIJKLMN = _mm256_sra_epi32(_mm256_add_epi32(vaccGHIJKLMN, vrounding), vshift);
- vaccOPQRSTUV = _mm256_sra_epi32(_mm256_add_epi32(vaccOPQRSTUV, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
+ vacc89ABCDEF = _mm256_sra_epi32(vacc89ABCDEF, vshift);
+ vaccGHIJKLMN = _mm256_sra_epi32(vaccGHIJKLMN, vshift);
+ vaccOPQRSTUV = _mm256_sra_epi32(vaccOPQRSTUV, vshift);
__m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
__m256i voutGHIJOPQRKLMNSTUV = _mm256_adds_epi16(_mm256_packs_epi32(vaccGHIJKLMN, vaccOPQRSTUV), voutput_zero_point);
@@ -72,7 +71,7 @@
__m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), _mm256_castsi256_si128(voutput_zero_point));
__m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
diff --git a/src/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x8.c b/src/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x8.c
index 4c013e4..e61ab23 100644
--- a/src/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x8.c
+++ b/src/qs8-vaddc/gen/minmax-avx2-mul32-ld64-x8.c
@@ -23,7 +23,6 @@
const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
- const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
@@ -38,7 +37,7 @@
__m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
@@ -57,7 +56,7 @@
__m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
__m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
diff --git a/src/qs8-vaddc/gen/minmax-avx512skx-mul32-ld128-x16.c b/src/qs8-vaddc/gen/minmax-avx512skx-mul32-ld128-x16.c
index 5d50f70..09130f7 100644
--- a/src/qs8-vaddc/gen/minmax-avx512skx-mul32-ld128-x16.c
+++ b/src/qs8-vaddc/gen/minmax-avx512skx-mul32-ld128-x16.c
@@ -23,7 +23,6 @@
const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m512i va_multiplier = _mm512_load_si512(params->avx512.a_multiplier);
- const __m512i vrounding = _mm512_load_si512(params->avx512.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx512.shift);
const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx512.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx512.output_min);
@@ -38,7 +37,7 @@
__m512i vacc0123456789ABCDEF = _mm512_add_epi32(vbias, _mm512_mullo_epi32(va0123456789ABCDEF, va_multiplier));
- vacc0123456789ABCDEF = _mm512_sra_epi32(_mm512_add_epi32(vacc0123456789ABCDEF, vrounding), vshift);
+ vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
__m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), voutput_zero_point);
@@ -58,7 +57,7 @@
__m512i vacc0123456789ABCDEF = _mm512_add_epi32(vbias, _mm512_mullo_epi32(va0123456789ABCDEF, va_multiplier));
- vacc0123456789ABCDEF = _mm512_sra_epi32(_mm512_add_epi32(vacc0123456789ABCDEF, vrounding), vshift);
+ vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
__m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), voutput_zero_point);
__m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
diff --git a/src/qs8-vaddc/gen/minmax-avx512skx-mul32-ld128-x32.c b/src/qs8-vaddc/gen/minmax-avx512skx-mul32-ld128-x32.c
index a3f4c75..bf02f38 100644
--- a/src/qs8-vaddc/gen/minmax-avx512skx-mul32-ld128-x32.c
+++ b/src/qs8-vaddc/gen/minmax-avx512skx-mul32-ld128-x32.c
@@ -23,7 +23,6 @@
const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m512i va_multiplier = _mm512_load_si512(params->avx512.a_multiplier);
- const __m512i vrounding = _mm512_load_si512(params->avx512.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx512.shift);
const __m512i voutput_zero_point = _mm512_load_si512(params->avx512.output_zero_point);
const __m256i voutput_min = _mm256_load_si256((const __m256i*) params->avx512.output_min);
@@ -40,8 +39,8 @@
__m512i vacc0123456789ABCDEF = _mm512_add_epi32(vbias, _mm512_mullo_epi32(va0123456789ABCDEF, va_multiplier));
__m512i vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vbias, _mm512_mullo_epi32(vaGHIJKLMNOPQRSTUV, va_multiplier));
- vacc0123456789ABCDEF = _mm512_sra_epi32(_mm512_add_epi32(vacc0123456789ABCDEF, vrounding), vshift);
- vaccGHIJKLMNOPQRSTUV = _mm512_sra_epi32(_mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, vrounding), vshift);
+ vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
+ vaccGHIJKLMNOPQRSTUV = _mm512_sra_epi32(vaccGHIJKLMNOPQRSTUV, vshift);
__m512i vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV = _mm512_adds_epi16(_mm512_packs_epi32(vacc0123456789ABCDEF, vaccGHIJKLMNOPQRSTUV), voutput_zero_point);
@@ -61,7 +60,7 @@
__m512i vacc0123456789ABCDEF = _mm512_add_epi32(vbias, _mm512_mullo_epi32(va0123456789ABCDEF, va_multiplier));
- vacc0123456789ABCDEF = _mm512_sra_epi32(_mm512_add_epi32(vacc0123456789ABCDEF, vrounding), vshift);
+ vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
__m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), _mm512_castsi512_si256(voutput_zero_point));
__m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packs_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
diff --git a/src/qs8-vaddc/gen/minmax-scalar-x1.c b/src/qs8-vaddc/gen/minmax-scalar-x1.c
index 2807c3d..cacd01e 100644
--- a/src/qs8-vaddc/gen/minmax-scalar-x1.c
+++ b/src/qs8-vaddc/gen/minmax-scalar-x1.c
@@ -22,7 +22,6 @@
{
const int32_t vbias = params->scalar.bias + (int32_t) *input_b * params->scalar.b_multiplier;
const int32_t va_multiplier = params->scalar.a_multiplier;
- const int32_t vrounding = params->scalar.rounding;
const uint32_t vshift = params->scalar.shift;
const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
@@ -32,7 +31,7 @@
const int32_t va = *input_a++;
const int32_t vacc = vbias + va * va_multiplier;
- int32_t vout = asr_s32(vacc + vrounding, vshift);
+ int32_t vout = asr_s32(vacc, vshift);
vout = math_max_s32(vout, voutput_min_less_zero_point);
vout = math_min_s32(vout, voutput_max_less_zero_point);
*output++ = (int8_t) (vout + voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-scalar-x2.c b/src/qs8-vaddc/gen/minmax-scalar-x2.c
index 3d9d054..dc04b44 100644
--- a/src/qs8-vaddc/gen/minmax-scalar-x2.c
+++ b/src/qs8-vaddc/gen/minmax-scalar-x2.c
@@ -22,7 +22,6 @@
{
const int32_t vbias = params->scalar.bias + (int32_t) *input_b * params->scalar.b_multiplier;
const int32_t va_multiplier = params->scalar.a_multiplier;
- const int32_t vrounding = params->scalar.rounding;
const uint32_t vshift = params->scalar.shift;
const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
@@ -37,8 +36,8 @@
const int32_t vacc1 = vbias + va1 * va_multiplier;
input_b += 2;
- int32_t vout0 = asr_s32(vacc0 + vrounding, vshift);
- int32_t vout1 = asr_s32(vacc1 + vrounding, vshift);
+ int32_t vout0 = asr_s32(vacc0, vshift);
+ int32_t vout1 = asr_s32(vacc1, vshift);
vout0 = math_max_s32(vout0, voutput_min_less_zero_point);
vout1 = math_max_s32(vout1, voutput_min_less_zero_point);
@@ -57,7 +56,7 @@
const int32_t va = *input_a;
const int32_t vacc = vbias + va * va_multiplier;
- int32_t vout = asr_s32(vacc + vrounding, vshift);
+ int32_t vout = asr_s32(vacc, vshift);
vout = math_max_s32(vout, voutput_min_less_zero_point);
vout = math_min_s32(vout, voutput_max_less_zero_point);
*output++ = (int8_t) (vout + voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-scalar-x4.c b/src/qs8-vaddc/gen/minmax-scalar-x4.c
index d050dc0..be7a847 100644
--- a/src/qs8-vaddc/gen/minmax-scalar-x4.c
+++ b/src/qs8-vaddc/gen/minmax-scalar-x4.c
@@ -22,7 +22,6 @@
{
const int32_t vbias = params->scalar.bias + (int32_t) *input_b * params->scalar.b_multiplier;
const int32_t va_multiplier = params->scalar.a_multiplier;
- const int32_t vrounding = params->scalar.rounding;
const uint32_t vshift = params->scalar.shift;
const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
@@ -41,10 +40,10 @@
const int32_t vacc3 = vbias + va3 * va_multiplier;
input_b += 4;
- int32_t vout0 = asr_s32(vacc0 + vrounding, vshift);
- int32_t vout1 = asr_s32(vacc1 + vrounding, vshift);
- int32_t vout2 = asr_s32(vacc2 + vrounding, vshift);
- int32_t vout3 = asr_s32(vacc3 + vrounding, vshift);
+ int32_t vout0 = asr_s32(vacc0, vshift);
+ int32_t vout1 = asr_s32(vacc1, vshift);
+ int32_t vout2 = asr_s32(vacc2, vshift);
+ int32_t vout3 = asr_s32(vacc3, vshift);
vout0 = math_max_s32(vout0, voutput_min_less_zero_point);
vout1 = math_max_s32(vout1, voutput_min_less_zero_point);
@@ -72,7 +71,7 @@
const int32_t va = *input_a++;
const int32_t vacc = vbias + va * va_multiplier;
- int32_t vout = asr_s32(vacc + vrounding, vshift);
+ int32_t vout = asr_s32(vacc, vshift);
vout = math_max_s32(vout, voutput_min_less_zero_point);
vout = math_min_s32(vout, voutput_max_less_zero_point);
*output++ = (int8_t) (vout + voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-sse2-mul16-ld64-x16.c b/src/qs8-vaddc/gen/minmax-sse2-mul16-ld64-x16.c
index da38776..89f2659 100644
--- a/src/qs8-vaddc/gen/minmax-sse2-mul16-ld64-x16.c
+++ b/src/qs8-vaddc/gen/minmax-sse2-mul16-ld64-x16.c
@@ -26,7 +26,6 @@
_mm_load_si128((const __m128i*) params->sse2.bias));
const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_lo);
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
@@ -56,10 +55,10 @@
__m128i vacc89AB = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod89ABCDEFlo, vaprod89ABCDEFhi));
__m128i vaccCDEF = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod89ABCDEFlo, vaprod89ABCDEFhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -93,8 +92,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
diff --git a/src/qs8-vaddc/gen/minmax-sse2-mul16-ld64-x24.c b/src/qs8-vaddc/gen/minmax-sse2-mul16-ld64-x24.c
index 0c0fb5e..8a4f558 100644
--- a/src/qs8-vaddc/gen/minmax-sse2-mul16-ld64-x24.c
+++ b/src/qs8-vaddc/gen/minmax-sse2-mul16-ld64-x24.c
@@ -26,7 +26,6 @@
_mm_load_si128((const __m128i*) params->sse2.bias));
const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_lo);
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
@@ -64,12 +63,12 @@
__m128i vaccGHIJ = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprodGHIJKLMNlo, vaprodGHIJKLMNhi));
__m128i vaccKLMN = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprodGHIJKLMNlo, vaprodGHIJKLMNhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -108,8 +107,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
diff --git a/src/qs8-vaddc/gen/minmax-sse2-mul16-ld64-x32.c b/src/qs8-vaddc/gen/minmax-sse2-mul16-ld64-x32.c
index 5c01aa8..37f733f 100644
--- a/src/qs8-vaddc/gen/minmax-sse2-mul16-ld64-x32.c
+++ b/src/qs8-vaddc/gen/minmax-sse2-mul16-ld64-x32.c
@@ -26,7 +26,6 @@
_mm_load_si128((const __m128i*) params->sse2.bias));
const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_lo);
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
@@ -72,14 +71,14 @@
__m128i vaccOPQR = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprodOPQRSTUVlo, vaprodOPQRSTUVhi));
__m128i vaccSTUV = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprodOPQRSTUVlo, vaprodOPQRSTUVhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
- vaccOPQR = _mm_sra_epi32(_mm_add_epi32(vaccOPQR, vrounding), vshift);
- vaccSTUV = _mm_sra_epi32(_mm_add_epi32(vaccSTUV, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
+ vaccOPQR = _mm_sra_epi32(vaccOPQR, vshift);
+ vaccSTUV = _mm_sra_epi32(vaccSTUV, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -121,8 +120,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
diff --git a/src/qs8-vaddc/gen/minmax-sse2-mul16-ld64-x8.c b/src/qs8-vaddc/gen/minmax-sse2-mul16-ld64-x8.c
index 70fde70..f9e84ab 100644
--- a/src/qs8-vaddc/gen/minmax-sse2-mul16-ld64-x8.c
+++ b/src/qs8-vaddc/gen/minmax-sse2-mul16-ld64-x8.c
@@ -26,7 +26,6 @@
_mm_load_si128((const __m128i*) params->sse2.bias));
const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_lo);
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
@@ -48,8 +47,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -79,8 +78,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
diff --git a/src/qs8-vaddc/gen/minmax-sse41-mul16-ld64-x16.c b/src/qs8-vaddc/gen/minmax-sse41-mul16-ld64-x16.c
index 75e4483..ec36387 100644
--- a/src/qs8-vaddc/gen/minmax-sse41-mul16-ld64-x16.c
+++ b/src/qs8-vaddc/gen/minmax-sse41-mul16-ld64-x16.c
@@ -26,7 +26,6 @@
_mm_load_si128((const __m128i*) params->sse4_mul16.bias));
const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_lo);
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul16.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse4_mul16.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul16.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul16.output_min);
@@ -54,10 +53,10 @@
__m128i vacc89AB = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod89ABCDEFlo, vaprod89ABCDEFhi));
__m128i vaccCDEF = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod89ABCDEFlo, vaprod89ABCDEFhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -88,8 +87,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-sse41-mul16-ld64-x24.c b/src/qs8-vaddc/gen/minmax-sse41-mul16-ld64-x24.c
index 035a1d0..ad5c8e1 100644
--- a/src/qs8-vaddc/gen/minmax-sse41-mul16-ld64-x24.c
+++ b/src/qs8-vaddc/gen/minmax-sse41-mul16-ld64-x24.c
@@ -26,7 +26,6 @@
_mm_load_si128((const __m128i*) params->sse4_mul16.bias));
const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_lo);
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul16.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse4_mul16.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul16.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul16.output_min);
@@ -61,12 +60,12 @@
__m128i vaccGHIJ = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprodGHIJKLMNlo, vaprodGHIJKLMNhi));
__m128i vaccKLMN = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprodGHIJKLMNlo, vaprodGHIJKLMNhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -102,8 +101,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-sse41-mul16-ld64-x32.c b/src/qs8-vaddc/gen/minmax-sse41-mul16-ld64-x32.c
index 6ee11e3..248fc6c 100644
--- a/src/qs8-vaddc/gen/minmax-sse41-mul16-ld64-x32.c
+++ b/src/qs8-vaddc/gen/minmax-sse41-mul16-ld64-x32.c
@@ -26,7 +26,6 @@
_mm_load_si128((const __m128i*) params->sse4_mul16.bias));
const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_lo);
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul16.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse4_mul16.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul16.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul16.output_min);
@@ -68,14 +67,14 @@
__m128i vaccOPQR = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprodOPQRSTUVlo, vaprodOPQRSTUVhi));
__m128i vaccSTUV = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprodOPQRSTUVlo, vaprodOPQRSTUVhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
- vaccOPQR = _mm_sra_epi32(_mm_add_epi32(vaccOPQR, vrounding), vshift);
- vaccSTUV = _mm_sra_epi32(_mm_add_epi32(vaccSTUV, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
+ vaccOPQR = _mm_sra_epi32(vaccOPQR, vshift);
+ vaccSTUV = _mm_sra_epi32(vaccSTUV, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -112,8 +111,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-sse41-mul16-ld64-x8.c b/src/qs8-vaddc/gen/minmax-sse41-mul16-ld64-x8.c
index d89d87e..c13c63a 100644
--- a/src/qs8-vaddc/gen/minmax-sse41-mul16-ld64-x8.c
+++ b/src/qs8-vaddc/gen/minmax-sse41-mul16-ld64-x8.c
@@ -26,7 +26,6 @@
_mm_load_si128((const __m128i*) params->sse4_mul16.bias));
const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_lo);
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse4_mul16.a_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul16.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse4_mul16.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul16.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul16.output_min);
@@ -47,8 +46,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -77,8 +76,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-sse41-mul32-ld32-x16.c b/src/qs8-vaddc/gen/minmax-sse41-mul32-ld32-x16.c
index 2d4aaeb..53836e9 100644
--- a/src/qs8-vaddc/gen/minmax-sse41-mul32-ld32-x16.c
+++ b/src/qs8-vaddc/gen/minmax-sse41-mul32-ld32-x16.c
@@ -23,7 +23,6 @@
const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -45,10 +44,10 @@
__m128i vacc89AB = _mm_add_epi32(vbias, _mm_mullo_epi32(va89AB, va_multiplier));
__m128i vaccCDEF = _mm_add_epi32(vbias, _mm_mullo_epi32(vaCDEF, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -71,8 +70,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-sse41-mul32-ld32-x24.c b/src/qs8-vaddc/gen/minmax-sse41-mul32-ld32-x24.c
index b748130..b43b386 100644
--- a/src/qs8-vaddc/gen/minmax-sse41-mul32-ld32-x24.c
+++ b/src/qs8-vaddc/gen/minmax-sse41-mul32-ld32-x24.c
@@ -23,7 +23,6 @@
const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -49,12 +48,12 @@
__m128i vaccGHIJ = _mm_add_epi32(vbias, _mm_mullo_epi32(vaGHIJ, va_multiplier));
__m128i vaccKLMN = _mm_add_epi32(vbias, _mm_mullo_epi32(vaKLMN, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -82,8 +81,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-sse41-mul32-ld32-x32.c b/src/qs8-vaddc/gen/minmax-sse41-mul32-ld32-x32.c
index 7cfc51e..d4e05d1 100644
--- a/src/qs8-vaddc/gen/minmax-sse41-mul32-ld32-x32.c
+++ b/src/qs8-vaddc/gen/minmax-sse41-mul32-ld32-x32.c
@@ -23,7 +23,6 @@
const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -53,14 +52,14 @@
__m128i vaccOPQR = _mm_add_epi32(vbias, _mm_mullo_epi32(vaOPQR, va_multiplier));
__m128i vaccSTUV = _mm_add_epi32(vbias, _mm_mullo_epi32(vaSTUV, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
- vaccOPQR = _mm_sra_epi32(_mm_add_epi32(vaccOPQR, vrounding), vshift);
- vaccSTUV = _mm_sra_epi32(_mm_add_epi32(vaccSTUV, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
+ vaccOPQR = _mm_sra_epi32(vaccOPQR, vshift);
+ vaccSTUV = _mm_sra_epi32(vaccSTUV, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -89,8 +88,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-sse41-mul32-ld32-x8.c b/src/qs8-vaddc/gen/minmax-sse41-mul32-ld32-x8.c
index b3e636a..88230d0 100644
--- a/src/qs8-vaddc/gen/minmax-sse41-mul32-ld32-x8.c
+++ b/src/qs8-vaddc/gen/minmax-sse41-mul32-ld32-x8.c
@@ -23,7 +23,6 @@
const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -41,8 +40,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -63,8 +62,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-wasmsimd-x16.c b/src/qs8-vaddc/gen/minmax-wasmsimd-x16.c
index dd07539..b722efc 100644
--- a/src/qs8-vaddc/gen/minmax-wasmsimd-x16.c
+++ b/src/qs8-vaddc/gen/minmax-wasmsimd-x16.c
@@ -22,7 +22,6 @@
const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const v128_t va_multiplier = wasm_v128_load(params->wasmsimd.a_multiplier);
- const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding);
const int32_t vshift = params->wasmsimd.shift;
const v128_t voutput_zero_point = wasm_v128_load(params->wasmsimd.output_zero_point);
const v128_t voutput_min = wasm_v128_load(params->wasmsimd.output_min);
@@ -41,10 +40,10 @@
v128_t vacc89AB = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(va89ABCDEF), va_multiplier));
v128_t vaccCDEF = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(va89ABCDEF), va_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
- vacc89AB = wasm_i32x4_shr(wasm_i32x4_add(vacc89AB, vrounding), vshift);
- vaccCDEF = wasm_i32x4_shr(wasm_i32x4_add(vaccCDEF, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
+ vacc89AB = wasm_i32x4_shr(vacc89AB, vshift);
+ vaccCDEF = wasm_i32x4_shr(vaccCDEF, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
v128_t vout89ABCDEF = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -66,8 +65,8 @@
v128_t vacc0123 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(va01234567), va_multiplier));
v128_t vacc4567 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(va01234567), va_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-wasmsimd-x24.c b/src/qs8-vaddc/gen/minmax-wasmsimd-x24.c
index 24ec402..51633b5 100644
--- a/src/qs8-vaddc/gen/minmax-wasmsimd-x24.c
+++ b/src/qs8-vaddc/gen/minmax-wasmsimd-x24.c
@@ -22,7 +22,6 @@
const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const v128_t va_multiplier = wasm_v128_load(params->wasmsimd.a_multiplier);
- const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding);
const int32_t vshift = params->wasmsimd.shift;
const v128_t voutput_zero_point = wasm_v128_load(params->wasmsimd.output_zero_point);
const v128_t voutput_min = wasm_v128_load(params->wasmsimd.output_min);
@@ -44,12 +43,12 @@
v128_t vaccGHIJ = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(vaGHIJKLMN), va_multiplier));
v128_t vaccKLMN = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(vaGHIJKLMN), va_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
- vacc89AB = wasm_i32x4_shr(wasm_i32x4_add(vacc89AB, vrounding), vshift);
- vaccCDEF = wasm_i32x4_shr(wasm_i32x4_add(vaccCDEF, vrounding), vshift);
- vaccGHIJ = wasm_i32x4_shr(wasm_i32x4_add(vaccGHIJ, vrounding), vshift);
- vaccKLMN = wasm_i32x4_shr(wasm_i32x4_add(vaccKLMN, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
+ vacc89AB = wasm_i32x4_shr(vacc89AB, vshift);
+ vaccCDEF = wasm_i32x4_shr(vaccCDEF, vshift);
+ vaccGHIJ = wasm_i32x4_shr(vaccGHIJ, vshift);
+ vaccKLMN = wasm_i32x4_shr(vaccKLMN, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
v128_t vout89ABCDEF = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -76,8 +75,8 @@
v128_t vacc0123 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(va01234567), va_multiplier));
v128_t vacc4567 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(va01234567), va_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-wasmsimd-x32.c b/src/qs8-vaddc/gen/minmax-wasmsimd-x32.c
index 7c6c2c1..5a9b4b6 100644
--- a/src/qs8-vaddc/gen/minmax-wasmsimd-x32.c
+++ b/src/qs8-vaddc/gen/minmax-wasmsimd-x32.c
@@ -22,7 +22,6 @@
const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const v128_t va_multiplier = wasm_v128_load(params->wasmsimd.a_multiplier);
- const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding);
const int32_t vshift = params->wasmsimd.shift;
const v128_t voutput_zero_point = wasm_v128_load(params->wasmsimd.output_zero_point);
const v128_t voutput_min = wasm_v128_load(params->wasmsimd.output_min);
@@ -47,14 +46,14 @@
v128_t vaccOPQR = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(vaOPQRSTUV), va_multiplier));
v128_t vaccSTUV = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(vaOPQRSTUV), va_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
- vacc89AB = wasm_i32x4_shr(wasm_i32x4_add(vacc89AB, vrounding), vshift);
- vaccCDEF = wasm_i32x4_shr(wasm_i32x4_add(vaccCDEF, vrounding), vshift);
- vaccGHIJ = wasm_i32x4_shr(wasm_i32x4_add(vaccGHIJ, vrounding), vshift);
- vaccKLMN = wasm_i32x4_shr(wasm_i32x4_add(vaccKLMN, vrounding), vshift);
- vaccOPQR = wasm_i32x4_shr(wasm_i32x4_add(vaccOPQR, vrounding), vshift);
- vaccSTUV = wasm_i32x4_shr(wasm_i32x4_add(vaccSTUV, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
+ vacc89AB = wasm_i32x4_shr(vacc89AB, vshift);
+ vaccCDEF = wasm_i32x4_shr(vaccCDEF, vshift);
+ vaccGHIJ = wasm_i32x4_shr(vaccGHIJ, vshift);
+ vaccKLMN = wasm_i32x4_shr(vaccKLMN, vshift);
+ vaccOPQR = wasm_i32x4_shr(vaccOPQR, vshift);
+ vaccSTUV = wasm_i32x4_shr(vaccSTUV, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
v128_t vout89ABCDEF = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -82,8 +81,8 @@
v128_t vacc0123 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(va01234567), va_multiplier));
v128_t vacc4567 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(va01234567), va_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-wasmsimd-x8.c b/src/qs8-vaddc/gen/minmax-wasmsimd-x8.c
index eb541dc..6bd8a53 100644
--- a/src/qs8-vaddc/gen/minmax-wasmsimd-x8.c
+++ b/src/qs8-vaddc/gen/minmax-wasmsimd-x8.c
@@ -22,7 +22,6 @@
const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const v128_t va_multiplier = wasm_v128_load(params->wasmsimd.a_multiplier);
- const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding);
const int32_t vshift = params->wasmsimd.shift;
const v128_t voutput_zero_point = wasm_v128_load(params->wasmsimd.output_zero_point);
const v128_t voutput_min = wasm_v128_load(params->wasmsimd.output_min);
@@ -38,8 +37,8 @@
v128_t vacc0123 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(va01234567), va_multiplier));
v128_t vacc4567 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(va01234567), va_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
@@ -59,8 +58,8 @@
v128_t vacc0123 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(va01234567), va_multiplier));
v128_t vacc4567 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(va01234567), va_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-xop-mul32-ld32-x16.c b/src/qs8-vaddc/gen/minmax-xop-mul32-ld32-x16.c
index f92fd6e..a894fc7 100644
--- a/src/qs8-vaddc/gen/minmax-xop-mul32-ld32-x16.c
+++ b/src/qs8-vaddc/gen/minmax-xop-mul32-ld32-x16.c
@@ -28,7 +28,6 @@
const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -50,10 +49,10 @@
__m128i vacc89AB = _mm_macc_epi32(va89AB, va_multiplier, vbias);
__m128i vaccCDEF = _mm_macc_epi32(vaCDEF, va_multiplier, vbias);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -76,8 +75,8 @@
__m128i vacc0123 = _mm_macc_epi32(va0123, va_multiplier, vbias);
__m128i vacc4567 = _mm_macc_epi32(va4567, va_multiplier, vbias);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-xop-mul32-ld32-x24.c b/src/qs8-vaddc/gen/minmax-xop-mul32-ld32-x24.c
index af73d75..5bdf4c4 100644
--- a/src/qs8-vaddc/gen/minmax-xop-mul32-ld32-x24.c
+++ b/src/qs8-vaddc/gen/minmax-xop-mul32-ld32-x24.c
@@ -28,7 +28,6 @@
const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -54,12 +53,12 @@
__m128i vaccGHIJ = _mm_macc_epi32(vaGHIJ, va_multiplier, vbias);
__m128i vaccKLMN = _mm_macc_epi32(vaKLMN, va_multiplier, vbias);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -87,8 +86,8 @@
__m128i vacc0123 = _mm_macc_epi32(va0123, va_multiplier, vbias);
__m128i vacc4567 = _mm_macc_epi32(va4567, va_multiplier, vbias);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-xop-mul32-ld32-x32.c b/src/qs8-vaddc/gen/minmax-xop-mul32-ld32-x32.c
index d59dcb3..5924c7c 100644
--- a/src/qs8-vaddc/gen/minmax-xop-mul32-ld32-x32.c
+++ b/src/qs8-vaddc/gen/minmax-xop-mul32-ld32-x32.c
@@ -28,7 +28,6 @@
const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -58,14 +57,14 @@
__m128i vaccOPQR = _mm_macc_epi32(vaOPQR, va_multiplier, vbias);
__m128i vaccSTUV = _mm_macc_epi32(vaSTUV, va_multiplier, vbias);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
- vaccGHIJ = _mm_sra_epi32(_mm_add_epi32(vaccGHIJ, vrounding), vshift);
- vaccKLMN = _mm_sra_epi32(_mm_add_epi32(vaccKLMN, vrounding), vshift);
- vaccOPQR = _mm_sra_epi32(_mm_add_epi32(vaccOPQR, vrounding), vshift);
- vaccSTUV = _mm_sra_epi32(_mm_add_epi32(vaccSTUV, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
+ vaccGHIJ = _mm_sra_epi32(vaccGHIJ, vshift);
+ vaccKLMN = _mm_sra_epi32(vaccKLMN, vshift);
+ vaccOPQR = _mm_sra_epi32(vaccOPQR, vshift);
+ vaccSTUV = _mm_sra_epi32(vaccSTUV, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -94,8 +93,8 @@
__m128i vacc0123 = _mm_macc_epi32(va0123, va_multiplier, vbias);
__m128i vacc4567 = _mm_macc_epi32(va4567, va_multiplier, vbias);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/gen/minmax-xop-mul32-ld32-x8.c b/src/qs8-vaddc/gen/minmax-xop-mul32-ld32-x8.c
index a4e3df3..34ca82d 100644
--- a/src/qs8-vaddc/gen/minmax-xop-mul32-ld32-x8.c
+++ b/src/qs8-vaddc/gen/minmax-xop-mul32-ld32-x8.c
@@ -28,7 +28,6 @@
const union xnn_qs8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4_mul32.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4_mul32.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
@@ -46,8 +45,8 @@
__m128i vacc0123 = _mm_macc_epi32(va0123, va_multiplier, vbias);
__m128i vacc4567 = _mm_macc_epi32(va4567, va_multiplier, vbias);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -68,8 +67,8 @@
__m128i vacc0123 = _mm_macc_epi32(va0123, va_multiplier, vbias);
__m128i vacc4567 = _mm_macc_epi32(va4567, va_multiplier, vbias);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qs8-vaddc/scalar.c.in b/src/qs8-vaddc/scalar.c.in
index 0dfcaca..0f85ac8 100644
--- a/src/qs8-vaddc/scalar.c.in
+++ b/src/qs8-vaddc/scalar.c.in
@@ -21,7 +21,6 @@
{
const int32_t vbias = params->scalar.bias + (int32_t) *input_b * params->scalar.b_multiplier;
const int32_t va_multiplier = params->scalar.a_multiplier;
- const int32_t vrounding = params->scalar.rounding;
const uint32_t vshift = params->scalar.shift;
const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
@@ -32,7 +31,7 @@
const int32_t va = *input_a++;
const int32_t vacc = vbias + va * va_multiplier;
- int32_t vout = asr_s32(vacc + vrounding, vshift);
+ int32_t vout = asr_s32(vacc, vshift);
vout = math_max_s32(vout, voutput_min_less_zero_point);
vout = math_min_s32(vout, voutput_max_less_zero_point);
*output++ = (${XINT8_T}) (vout + voutput_zero_point);
@@ -50,7 +49,7 @@
input_b += ${BATCH_TILE};
$for N in range(BATCH_TILE):
- int32_t vout${N} = asr_s32(vacc${N} + vrounding, vshift);
+ int32_t vout${N} = asr_s32(vacc${N}, vshift);
$for N in range(BATCH_TILE):
vout${N} = math_max_s32(vout${N}, voutput_min_less_zero_point);
@@ -70,7 +69,7 @@
const int32_t va = *input_a;
const int32_t vacc = vbias + va * va_multiplier;
- int32_t vout = asr_s32(vacc + vrounding, vshift);
+ int32_t vout = asr_s32(vacc, vshift);
vout = math_max_s32(vout, voutput_min_less_zero_point);
vout = math_min_s32(vout, voutput_max_less_zero_point);
*output++ = (${XINT8_T}) (vout + voutput_zero_point);
@@ -79,7 +78,7 @@
const int32_t va = *input_a++;
const int32_t vacc = vbias + va * va_multiplier;
- int32_t vout = asr_s32(vacc + vrounding, vshift);
+ int32_t vout = asr_s32(vacc, vshift);
vout = math_max_s32(vout, voutput_min_less_zero_point);
vout = math_min_s32(vout, voutput_max_less_zero_point);
*output++ = (${XINT8_T}) (vout + voutput_zero_point);
diff --git a/src/qs8-vaddc/sse-mul16-ld64.c.in b/src/qs8-vaddc/sse-mul16-ld64.c.in
index 6dc36dd..621f3ce 100644
--- a/src/qs8-vaddc/sse-mul16-ld64.c.in
+++ b/src/qs8-vaddc/sse-mul16-ld64.c.in
@@ -36,7 +36,6 @@
_mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.bias));
const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.a_multiplier_lo);
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.a_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->${PARAMS_STRUCT}.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min);
@@ -78,7 +77,7 @@
__m128i vacc${ABC[N+4:N+8]} = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod${ABC[N:N+8]}lo, vaprod${ABC[N:N+8]}hi));
$for N in range(0, BATCH_TILE, 4):
- vacc${ABC[N:N+4]} = _mm_sra_epi32(_mm_add_epi32(vacc${ABC[N:N+4]}, vrounding), vshift);
+ vacc${ABC[N:N+4]} = _mm_sra_epi32(vacc${ABC[N:N+4]}, vshift);
$for N in range(0, BATCH_TILE, 8):
__m128i vout${ABC[N:N+8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[N:N+4]}, vacc${ABC[N+4:N+8]}), voutput_zero_point);
@@ -146,8 +145,8 @@
__m128i vacc${ABC[0:4]} = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod${ABC[0:8]}lo, vaprod${ABC[0:8]}hi));
__m128i vacc${ABC[4:8]} = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod${ABC[0:8]}lo, vaprod${ABC[0:8]}hi));
- vacc${ABC[0:4]} = _mm_sra_epi32(_mm_add_epi32(vacc${ABC[0:4]}, vrounding), vshift);
- vacc${ABC[4:8]} = _mm_sra_epi32(_mm_add_epi32(vacc${ABC[4:8]}, vrounding), vshift);
+ vacc${ABC[0:4]} = _mm_sra_epi32(vacc${ABC[0:4]}, vshift);
+ vacc${ABC[4:8]} = _mm_sra_epi32(vacc${ABC[4:8]}, vshift);
__m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[0:4]}, vacc${ABC[4:8]}), voutput_zero_point);
$if DATATYPE == "QS8" and SSE < 4:
diff --git a/src/qs8-vaddc/sse-mul32-ld32.c.in b/src/qs8-vaddc/sse-mul32-ld32.c.in
index ff1a22e..700680c 100644
--- a/src/qs8-vaddc/sse-mul32-ld32.c.in
+++ b/src/qs8-vaddc/sse-mul32-ld32.c.in
@@ -40,7 +40,6 @@
const union xnn_${DATATYPE.lower()}_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.a_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.rounding);
const __m128i vshift = _mm_loadu_si32(params->${PARAMS_STRUCT}.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->${PARAMS_STRUCT}.output_min);
@@ -64,7 +63,7 @@
__m128i vacc${ABC[N:N+4]} = _mm_add_epi32(vbias, _mm_mullo_epi32(va${ABC[N:N+4]}, va_multiplier));
$for N in range(0, BATCH_TILE, 4):
- vacc${ABC[N:N+4]} = _mm_sra_epi32(_mm_add_epi32(vacc${ABC[N:N+4]}, vrounding), vshift);
+ vacc${ABC[N:N+4]} = _mm_sra_epi32(vacc${ABC[N:N+4]}, vshift);
$for N in range(0, BATCH_TILE, 8):
const __m128i vout${ABC[N:N+8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[N:N+4]}, vacc${ABC[N+4:N+8]}), voutput_zero_point);
@@ -112,8 +111,8 @@
__m128i vacc${ABC[0:4]} = _mm_add_epi32(vbias, _mm_mullo_epi32(va${ABC[0:4]}, va_multiplier));
__m128i vacc${ABC[4:8]} = _mm_add_epi32(vbias, _mm_mullo_epi32(va${ABC[4:8]}, va_multiplier));
- vacc${ABC[0:4]} = _mm_sra_epi32(_mm_add_epi32(vacc${ABC[0:4]}, vrounding), vshift);
- vacc${ABC[4:8]} = _mm_sra_epi32(_mm_add_epi32(vacc${ABC[4:8]}, vrounding), vshift);
+ vacc${ABC[0:4]} = _mm_sra_epi32(vacc${ABC[0:4]}, vshift);
+ vacc${ABC[4:8]} = _mm_sra_epi32(vacc${ABC[4:8]}, vshift);
const __m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[0:4]}, vacc${ABC[4:8]}), voutput_zero_point);
diff --git a/src/qs8-vaddc/wasmsimd.c.in b/src/qs8-vaddc/wasmsimd.c.in
index 2018517..697f27f 100644
--- a/src/qs8-vaddc/wasmsimd.c.in
+++ b/src/qs8-vaddc/wasmsimd.c.in
@@ -29,7 +29,6 @@
const union xnn_${DATATYPE.lower()}_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const v128_t va_multiplier = wasm_v128_load(params->wasmsimd.a_multiplier);
- const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding);
const int32_t vshift = params->wasmsimd.shift;
const v128_t voutput_zero_point = wasm_v128_load(params->wasmsimd.output_zero_point);
const v128_t voutput_min = wasm_v128_load(params->wasmsimd.output_min);
@@ -49,7 +48,7 @@
v128_t vacc${ABC[N+4:N+8]} = wasm_i32x4_add(vbias, wasm_i32x4_mul(${WASM_X32X4_EXTEND_HIGH_X16X8}(va${ABC[N:N+8]}), va_multiplier));
$for N in range(0, BATCH_TILE, 4):
- vacc${ABC[N:N+4]} = wasm_i32x4_shr(wasm_i32x4_add(vacc${ABC[N:N+4]}, vrounding), vshift);
+ vacc${ABC[N:N+4]} = wasm_i32x4_shr(vacc${ABC[N:N+4]}, vshift);
$for N in range(0, BATCH_TILE, 8):
v128_t vout${ABC[N:N+8]} = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc${ABC[N:N+4]}, vacc${ABC[N+4:N+8]}), voutput_zero_point);
@@ -92,8 +91,8 @@
v128_t vacc${ABC[0:4]} = wasm_i32x4_add(vbias, wasm_i32x4_mul(${WASM_X32X4_EXTEND_LOW_X16X8}(va${ABC[0:8]}), va_multiplier));
v128_t vacc${ABC[4:8]} = wasm_i32x4_add(vbias, wasm_i32x4_mul(${WASM_X32X4_EXTEND_HIGH_X16X8}(va${ABC[0:8]}), va_multiplier));
- vacc${ABC[0:4]} = wasm_i32x4_shr(wasm_i32x4_add(vacc${ABC[0:4]}, vrounding), vshift);
- vacc${ABC[4:8]} = wasm_i32x4_shr(wasm_i32x4_add(vacc${ABC[4:8]}, vrounding), vshift);
+ vacc${ABC[0:4]} = wasm_i32x4_shr(vacc${ABC[0:4]}, vshift);
+ vacc${ABC[4:8]} = wasm_i32x4_shr(vacc${ABC[4:8]}, vshift);
v128_t vout${ABC[0:8]} = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc${ABC[0:4]}, vacc${ABC[4:8]}), voutput_zero_point);
diff --git a/src/qu8-vadd/gen/minmax-avx-mul16-ld64-x16.c b/src/qu8-vadd/gen/minmax-avx-mul16-ld64-x16.c
index 084fbde..9d2da04 100644
--- a/src/qu8-vadd/gen/minmax-avx-mul16-ld64-x16.c
+++ b/src/qu8-vadd/gen/minmax-avx-mul16-ld64-x16.c
@@ -26,7 +26,6 @@
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_lo);
const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
@@ -66,10 +65,10 @@
vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vbprod89ABCDEFlo, vbprod89ABCDEFhi));
vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vbprod89ABCDEFlo, vbprod89ABCDEFhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -107,8 +106,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vadd/gen/minmax-avx-mul16-ld64-x8.c b/src/qu8-vadd/gen/minmax-avx-mul16-ld64-x8.c
index a76b598..3d55d0f 100644
--- a/src/qu8-vadd/gen/minmax-avx-mul16-ld64-x8.c
+++ b/src/qu8-vadd/gen/minmax-avx-mul16-ld64-x8.c
@@ -26,7 +26,6 @@
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_lo);
const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
@@ -54,8 +53,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -90,8 +89,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vadd/gen/minmax-avx-mul32-ld32-x16.c b/src/qu8-vadd/gen/minmax-avx-mul32-ld32-x16.c
index 398907e..0075baa 100644
--- a/src/qu8-vadd/gen/minmax-avx-mul32-ld32-x16.c
+++ b/src/qu8-vadd/gen/minmax-avx-mul32-ld32-x16.c
@@ -25,7 +25,6 @@
const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4.bias);
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4.a_multiplier);
const __m128i vb_multiplier = _mm_load_si128((const __m128i*) params->sse4.b_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
@@ -53,10 +52,10 @@
vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vb89AB, vb_multiplier));
vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vbCDEF, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -85,8 +84,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vadd/gen/minmax-avx-mul32-ld32-x8.c b/src/qu8-vadd/gen/minmax-avx-mul32-ld32-x8.c
index 1aac429..36bcc2a 100644
--- a/src/qu8-vadd/gen/minmax-avx-mul32-ld32-x8.c
+++ b/src/qu8-vadd/gen/minmax-avx-mul32-ld32-x8.c
@@ -25,7 +25,6 @@
const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4.bias);
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4.a_multiplier);
const __m128i vb_multiplier = _mm_load_si128((const __m128i*) params->sse4.b_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
@@ -45,8 +44,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -72,8 +71,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vadd/gen/minmax-avx2-mul32-ld64-x16.c b/src/qu8-vadd/gen/minmax-avx2-mul32-ld64-x16.c
index e8e8512..d2e9996 100644
--- a/src/qu8-vadd/gen/minmax-avx2-mul32-ld64-x16.c
+++ b/src/qu8-vadd/gen/minmax-avx2-mul32-ld64-x16.c
@@ -25,7 +25,6 @@
const __m256i vbias = _mm256_load_si256((const __m256i*) params->avx2.bias);
const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
const __m256i vb_multiplier = _mm256_load_si256((const __m256i*) params->avx2.b_multiplier);
- const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx2.shift);
const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
@@ -45,8 +44,8 @@
vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vb01234567, vb_multiplier));
vacc89ABCDEF = _mm256_add_epi32(vacc89ABCDEF, _mm256_mullo_epi32(vb89ABCDEF, vb_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
- vacc89ABCDEF = _mm256_sra_epi32(_mm256_add_epi32(vacc89ABCDEF, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
+ vacc89ABCDEF = _mm256_sra_epi32(vacc89ABCDEF, vshift);
__m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
@@ -70,7 +69,7 @@
vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vb01234567, vb_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), _mm256_castsi256_si128(voutput_zero_point));
__m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
diff --git a/src/qu8-vadd/gen/minmax-avx2-mul32-ld64-x8.c b/src/qu8-vadd/gen/minmax-avx2-mul32-ld64-x8.c
index 5af1253..2910624 100644
--- a/src/qu8-vadd/gen/minmax-avx2-mul32-ld64-x8.c
+++ b/src/qu8-vadd/gen/minmax-avx2-mul32-ld64-x8.c
@@ -25,7 +25,6 @@
const __m256i vbias = _mm256_load_si256((const __m256i*) params->avx2.bias);
const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
const __m256i vb_multiplier = _mm256_load_si256((const __m256i*) params->avx2.b_multiplier);
- const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
@@ -41,7 +40,7 @@
vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vb01234567, vb_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
@@ -63,7 +62,7 @@
vacc01234567 = _mm256_add_epi32(vacc01234567, _mm256_mullo_epi32(vb01234567, vb_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
__m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
diff --git a/src/qu8-vadd/gen/minmax-avx512skx-mul32-ld128-x16.c b/src/qu8-vadd/gen/minmax-avx512skx-mul32-ld128-x16.c
index 77b83b5..46110ce 100644
--- a/src/qu8-vadd/gen/minmax-avx512skx-mul32-ld128-x16.c
+++ b/src/qu8-vadd/gen/minmax-avx512skx-mul32-ld128-x16.c
@@ -25,7 +25,6 @@
const __m512i vbias = _mm512_load_si512(params->avx512.bias);
const __m512i va_multiplier = _mm512_load_si512(params->avx512.a_multiplier);
const __m512i vb_multiplier = _mm512_load_si512(params->avx512.b_multiplier);
- const __m512i vrounding = _mm512_load_si512(params->avx512.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx512.shift);
const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx512.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx512.output_min);
@@ -41,7 +40,7 @@
vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vb0123456789ABCDEF, vb_multiplier));
- vacc0123456789ABCDEF = _mm512_sra_epi32(_mm512_add_epi32(vacc0123456789ABCDEF, vrounding), vshift);
+ vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
__m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), voutput_zero_point);
@@ -64,7 +63,7 @@
vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vb0123456789ABCDEF, vb_multiplier));
- vacc0123456789ABCDEF = _mm512_sra_epi32(_mm512_add_epi32(vacc0123456789ABCDEF, vrounding), vshift);
+ vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
__m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), voutput_zero_point);
__m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packus_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
diff --git a/src/qu8-vadd/gen/minmax-avx512skx-mul32-ld128-x32.c b/src/qu8-vadd/gen/minmax-avx512skx-mul32-ld128-x32.c
index 62485ed..cb86eec 100644
--- a/src/qu8-vadd/gen/minmax-avx512skx-mul32-ld128-x32.c
+++ b/src/qu8-vadd/gen/minmax-avx512skx-mul32-ld128-x32.c
@@ -25,7 +25,6 @@
const __m512i vbias = _mm512_load_si512(params->avx512.bias);
const __m512i va_multiplier = _mm512_load_si512(params->avx512.a_multiplier);
const __m512i vb_multiplier = _mm512_load_si512(params->avx512.b_multiplier);
- const __m512i vrounding = _mm512_load_si512(params->avx512.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx512.shift);
const __m512i voutput_zero_point = _mm512_load_si512(params->avx512.output_zero_point);
const __m256i voutput_min = _mm256_load_si256((const __m256i*) params->avx512.output_min);
@@ -45,8 +44,8 @@
vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vb0123456789ABCDEF, vb_multiplier));
vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, _mm512_mullo_epi32(vbGHIJKLMNOPQRSTUV, vb_multiplier));
- vacc0123456789ABCDEF = _mm512_sra_epi32(_mm512_add_epi32(vacc0123456789ABCDEF, vrounding), vshift);
- vaccGHIJKLMNOPQRSTUV = _mm512_sra_epi32(_mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, vrounding), vshift);
+ vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
+ vaccGHIJKLMNOPQRSTUV = _mm512_sra_epi32(vaccGHIJKLMNOPQRSTUV, vshift);
__m512i vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV = _mm512_adds_epi16(_mm512_packs_epi32(vacc0123456789ABCDEF, vaccGHIJKLMNOPQRSTUV), voutput_zero_point);
@@ -70,7 +69,7 @@
vacc0123456789ABCDEF = _mm512_add_epi32(vacc0123456789ABCDEF, _mm512_mullo_epi32(vb0123456789ABCDEF, vb_multiplier));
- vacc0123456789ABCDEF = _mm512_sra_epi32(_mm512_add_epi32(vacc0123456789ABCDEF, vrounding), vshift);
+ vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
__m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), _mm512_castsi512_si256(voutput_zero_point));
__m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packus_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
diff --git a/src/qu8-vadd/gen/minmax-scalar-x1.c b/src/qu8-vadd/gen/minmax-scalar-x1.c
index 0bdc27d..430435c 100644
--- a/src/qu8-vadd/gen/minmax-scalar-x1.c
+++ b/src/qu8-vadd/gen/minmax-scalar-x1.c
@@ -23,7 +23,6 @@
const int32_t vbias = params->scalar.bias;
const int32_t va_multiplier = params->scalar.a_multiplier;
const int32_t vb_multiplier = params->scalar.b_multiplier;
- const int32_t vrounding = params->scalar.rounding;
const uint32_t vshift = params->scalar.shift;
const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
@@ -34,7 +33,7 @@
const int32_t vb = *input_b++;
const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
- int32_t vout = asr_s32(vacc + vrounding, vshift);
+ int32_t vout = asr_s32(vacc, vshift);
vout = math_max_s32(vout, voutput_min_less_zero_point);
vout = math_min_s32(vout, voutput_max_less_zero_point);
*output++ = (uint8_t) (vout + voutput_zero_point);
diff --git a/src/qu8-vadd/gen/minmax-scalar-x2.c b/src/qu8-vadd/gen/minmax-scalar-x2.c
index d27e573..78404b1 100644
--- a/src/qu8-vadd/gen/minmax-scalar-x2.c
+++ b/src/qu8-vadd/gen/minmax-scalar-x2.c
@@ -23,7 +23,6 @@
const int32_t vbias = params->scalar.bias;
const int32_t va_multiplier = params->scalar.a_multiplier;
const int32_t vb_multiplier = params->scalar.b_multiplier;
- const int32_t vrounding = params->scalar.rounding;
const uint32_t vshift = params->scalar.shift;
const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
@@ -43,8 +42,8 @@
vacc0 += vb0 * vb_multiplier;
vacc1 += vb1 * vb_multiplier;
- int32_t vout0 = asr_s32(vacc0 + vrounding, vshift);
- int32_t vout1 = asr_s32(vacc1 + vrounding, vshift);
+ int32_t vout0 = asr_s32(vacc0, vshift);
+ int32_t vout1 = asr_s32(vacc1, vshift);
vout0 = math_max_s32(vout0, voutput_min_less_zero_point);
vout1 = math_max_s32(vout1, voutput_min_less_zero_point);
@@ -64,7 +63,7 @@
const int32_t vb = *input_b;
const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
- int32_t vout = asr_s32(vacc + vrounding, vshift);
+ int32_t vout = asr_s32(vacc, vshift);
vout = math_max_s32(vout, voutput_min_less_zero_point);
vout = math_min_s32(vout, voutput_max_less_zero_point);
*output++ = (uint8_t) (vout + voutput_zero_point);
diff --git a/src/qu8-vadd/gen/minmax-scalar-x4.c b/src/qu8-vadd/gen/minmax-scalar-x4.c
index 5a0232c..4f153be 100644
--- a/src/qu8-vadd/gen/minmax-scalar-x4.c
+++ b/src/qu8-vadd/gen/minmax-scalar-x4.c
@@ -23,7 +23,6 @@
const int32_t vbias = params->scalar.bias;
const int32_t va_multiplier = params->scalar.a_multiplier;
const int32_t vb_multiplier = params->scalar.b_multiplier;
- const int32_t vrounding = params->scalar.rounding;
const uint32_t vshift = params->scalar.shift;
const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
@@ -51,10 +50,10 @@
vacc2 += vb2 * vb_multiplier;
vacc3 += vb3 * vb_multiplier;
- int32_t vout0 = asr_s32(vacc0 + vrounding, vshift);
- int32_t vout1 = asr_s32(vacc1 + vrounding, vshift);
- int32_t vout2 = asr_s32(vacc2 + vrounding, vshift);
- int32_t vout3 = asr_s32(vacc3 + vrounding, vshift);
+ int32_t vout0 = asr_s32(vacc0, vshift);
+ int32_t vout1 = asr_s32(vacc1, vshift);
+ int32_t vout2 = asr_s32(vacc2, vshift);
+ int32_t vout3 = asr_s32(vacc3, vshift);
vout0 = math_max_s32(vout0, voutput_min_less_zero_point);
vout1 = math_max_s32(vout1, voutput_min_less_zero_point);
@@ -83,7 +82,7 @@
const int32_t vb = *input_b++;
const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
- int32_t vout = asr_s32(vacc + vrounding, vshift);
+ int32_t vout = asr_s32(vacc, vshift);
vout = math_max_s32(vout, voutput_min_less_zero_point);
vout = math_min_s32(vout, voutput_max_less_zero_point);
*output++ = (uint8_t) (vout + voutput_zero_point);
diff --git a/src/qu8-vadd/gen/minmax-sse2-mul16-ld64-x16.c b/src/qu8-vadd/gen/minmax-sse2-mul16-ld64-x16.c
index 6496f37..fa92f77 100644
--- a/src/qu8-vadd/gen/minmax-sse2-mul16-ld64-x16.c
+++ b/src/qu8-vadd/gen/minmax-sse2-mul16-ld64-x16.c
@@ -26,7 +26,6 @@
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_lo);
const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
@@ -71,10 +70,10 @@
vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vbprod89ABCDEFlo, vbprod89ABCDEFhi));
vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vbprod89ABCDEFlo, vbprod89ABCDEFhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -115,8 +114,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vadd/gen/minmax-sse2-mul16-ld64-x8.c b/src/qu8-vadd/gen/minmax-sse2-mul16-ld64-x8.c
index b80edde..be00962 100644
--- a/src/qu8-vadd/gen/minmax-sse2-mul16-ld64-x8.c
+++ b/src/qu8-vadd/gen/minmax-sse2-mul16-ld64-x8.c
@@ -26,7 +26,6 @@
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_lo);
const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
@@ -57,8 +56,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -96,8 +95,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vadd/gen/minmax-sse41-mul16-ld64-x16.c b/src/qu8-vadd/gen/minmax-sse41-mul16-ld64-x16.c
index 441d3c0..60e858a 100644
--- a/src/qu8-vadd/gen/minmax-sse41-mul16-ld64-x16.c
+++ b/src/qu8-vadd/gen/minmax-sse41-mul16-ld64-x16.c
@@ -26,7 +26,6 @@
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_lo);
const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
@@ -66,10 +65,10 @@
vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vbprod89ABCDEFlo, vbprod89ABCDEFhi));
vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vbprod89ABCDEFlo, vbprod89ABCDEFhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -107,8 +106,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vadd/gen/minmax-sse41-mul16-ld64-x8.c b/src/qu8-vadd/gen/minmax-sse41-mul16-ld64-x8.c
index 19b9503..d1cdd1c 100644
--- a/src/qu8-vadd/gen/minmax-sse41-mul16-ld64-x8.c
+++ b/src/qu8-vadd/gen/minmax-sse41-mul16-ld64-x8.c
@@ -26,7 +26,6 @@
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_lo);
const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
@@ -54,8 +53,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -90,8 +89,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vadd/gen/minmax-sse41-mul32-ld32-x16.c b/src/qu8-vadd/gen/minmax-sse41-mul32-ld32-x16.c
index 98a5d85..4d2420a 100644
--- a/src/qu8-vadd/gen/minmax-sse41-mul32-ld32-x16.c
+++ b/src/qu8-vadd/gen/minmax-sse41-mul32-ld32-x16.c
@@ -25,7 +25,6 @@
const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4.bias);
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4.a_multiplier);
const __m128i vb_multiplier = _mm_load_si128((const __m128i*) params->sse4.b_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
@@ -53,10 +52,10 @@
vacc89AB = _mm_add_epi32(vacc89AB, _mm_mullo_epi32(vb89AB, vb_multiplier));
vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_mullo_epi32(vbCDEF, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -85,8 +84,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vadd/gen/minmax-sse41-mul32-ld32-x8.c b/src/qu8-vadd/gen/minmax-sse41-mul32-ld32-x8.c
index 93901a4..0adf807 100644
--- a/src/qu8-vadd/gen/minmax-sse41-mul32-ld32-x8.c
+++ b/src/qu8-vadd/gen/minmax-sse41-mul32-ld32-x8.c
@@ -25,7 +25,6 @@
const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4.bias);
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4.a_multiplier);
const __m128i vb_multiplier = _mm_load_si128((const __m128i*) params->sse4.b_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
@@ -45,8 +44,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -72,8 +71,8 @@
vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vadd/gen/minmax-wasmsimd-x16.c b/src/qu8-vadd/gen/minmax-wasmsimd-x16.c
index e37866e..956b9d3 100644
--- a/src/qu8-vadd/gen/minmax-wasmsimd-x16.c
+++ b/src/qu8-vadd/gen/minmax-wasmsimd-x16.c
@@ -24,7 +24,6 @@
const v128_t vbias = wasm_v128_load(params->wasmsimd.bias);
const v128_t va_multiplier = wasm_v128_load(params->wasmsimd.a_multiplier);
const v128_t vb_multiplier = wasm_v128_load(params->wasmsimd.b_multiplier);
- const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding);
const int32_t vshift = params->wasmsimd.shift;
const v128_t voutput_zero_point = wasm_v128_load(params->wasmsimd.output_zero_point);
const v128_t voutput_min = wasm_v128_load(params->wasmsimd.output_min);
@@ -48,10 +47,10 @@
vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vb89ABCDEF), vb_multiplier));
vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vb89ABCDEF), vb_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
- vacc89AB = wasm_i32x4_shr(wasm_i32x4_add(vacc89AB, vrounding), vshift);
- vaccCDEF = wasm_i32x4_shr(wasm_i32x4_add(vaccCDEF, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
+ vacc89AB = wasm_i32x4_shr(vacc89AB, vshift);
+ vaccCDEF = wasm_i32x4_shr(vaccCDEF, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
v128_t vout89ABCDEF = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -78,8 +77,8 @@
vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vb01234567), vb_multiplier));
vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vb01234567), vb_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vadd/gen/minmax-wasmsimd-x8.c b/src/qu8-vadd/gen/minmax-wasmsimd-x8.c
index 77dc684..bdcfbfe 100644
--- a/src/qu8-vadd/gen/minmax-wasmsimd-x8.c
+++ b/src/qu8-vadd/gen/minmax-wasmsimd-x8.c
@@ -24,7 +24,6 @@
const v128_t vbias = wasm_v128_load(params->wasmsimd.bias);
const v128_t va_multiplier = wasm_v128_load(params->wasmsimd.a_multiplier);
const v128_t vb_multiplier = wasm_v128_load(params->wasmsimd.b_multiplier);
- const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding);
const int32_t vshift = params->wasmsimd.shift;
const v128_t voutput_zero_point = wasm_v128_load(params->wasmsimd.output_zero_point);
const v128_t voutput_min = wasm_v128_load(params->wasmsimd.output_min);
@@ -42,8 +41,8 @@
vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vb01234567), vb_multiplier));
vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vb01234567), vb_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
@@ -67,8 +66,8 @@
vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vb01234567), vb_multiplier));
vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vb01234567), vb_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vadd/gen/minmax-xop-mul32-ld32-x16.c b/src/qu8-vadd/gen/minmax-xop-mul32-ld32-x16.c
index 1a128da..ffd2cf8 100644
--- a/src/qu8-vadd/gen/minmax-xop-mul32-ld32-x16.c
+++ b/src/qu8-vadd/gen/minmax-xop-mul32-ld32-x16.c
@@ -30,7 +30,6 @@
const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4.bias);
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4.a_multiplier);
const __m128i vb_multiplier = _mm_load_si128((const __m128i*) params->sse4.b_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
@@ -58,10 +57,10 @@
vacc89AB = _mm_macc_epi32(vb89AB, vb_multiplier, vacc89AB);
vaccCDEF = _mm_macc_epi32(vbCDEF, vb_multiplier, vaccCDEF);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -90,8 +89,8 @@
vacc0123 = _mm_macc_epi32(vb0123, vb_multiplier, vacc0123);
vacc4567 = _mm_macc_epi32(vb4567, vb_multiplier, vacc4567);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vadd/gen/minmax-xop-mul32-ld32-x8.c b/src/qu8-vadd/gen/minmax-xop-mul32-ld32-x8.c
index 13a6999..259175d 100644
--- a/src/qu8-vadd/gen/minmax-xop-mul32-ld32-x8.c
+++ b/src/qu8-vadd/gen/minmax-xop-mul32-ld32-x8.c
@@ -30,7 +30,6 @@
const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4.bias);
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4.a_multiplier);
const __m128i vb_multiplier = _mm_load_si128((const __m128i*) params->sse4.b_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
@@ -50,8 +49,8 @@
vacc0123 = _mm_macc_epi32(vb0123, vb_multiplier, vacc0123);
vacc4567 = _mm_macc_epi32(vb4567, vb_multiplier, vacc4567);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -77,8 +76,8 @@
vacc0123 = _mm_macc_epi32(vb0123, vb_multiplier, vacc0123);
vacc4567 = _mm_macc_epi32(vb4567, vb_multiplier, vacc4567);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vaddc/gen/minmax-avx-mul16-ld64-x16.c b/src/qu8-vaddc/gen/minmax-avx-mul16-ld64-x16.c
index 9bfae04..df51820 100644
--- a/src/qu8-vaddc/gen/minmax-avx-mul16-ld64-x16.c
+++ b/src/qu8-vaddc/gen/minmax-avx-mul16-ld64-x16.c
@@ -26,7 +26,6 @@
_mm_load_si128((const __m128i*) params->sse2.bias));
const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_lo);
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
@@ -52,10 +51,10 @@
__m128i vacc89AB = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod89ABCDEFlo, vaprod89ABCDEFhi));
__m128i vaccCDEF = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod89ABCDEFlo, vaprod89ABCDEFhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -85,8 +84,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vaddc/gen/minmax-avx-mul16-ld64-x8.c b/src/qu8-vaddc/gen/minmax-avx-mul16-ld64-x8.c
index 53894d3..43bdb76 100644
--- a/src/qu8-vaddc/gen/minmax-avx-mul16-ld64-x8.c
+++ b/src/qu8-vaddc/gen/minmax-avx-mul16-ld64-x8.c
@@ -26,7 +26,6 @@
_mm_load_si128((const __m128i*) params->sse2.bias));
const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_lo);
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
@@ -46,8 +45,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -75,8 +74,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vaddc/gen/minmax-avx-mul32-ld32-x16.c b/src/qu8-vaddc/gen/minmax-avx-mul32-ld32-x16.c
index 3167bcc..21eb4cd 100644
--- a/src/qu8-vaddc/gen/minmax-avx-mul32-ld32-x16.c
+++ b/src/qu8-vaddc/gen/minmax-avx-mul32-ld32-x16.c
@@ -23,7 +23,6 @@
const union xnn_qu8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4.a_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
@@ -45,10 +44,10 @@
__m128i vacc89AB = _mm_add_epi32(vbias, _mm_mullo_epi32(va89AB, va_multiplier));
__m128i vaccCDEF = _mm_add_epi32(vbias, _mm_mullo_epi32(vaCDEF, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -71,8 +70,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vaddc/gen/minmax-avx-mul32-ld32-x8.c b/src/qu8-vaddc/gen/minmax-avx-mul32-ld32-x8.c
index c67836c..db4c569 100644
--- a/src/qu8-vaddc/gen/minmax-avx-mul32-ld32-x8.c
+++ b/src/qu8-vaddc/gen/minmax-avx-mul32-ld32-x8.c
@@ -23,7 +23,6 @@
const union xnn_qu8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4.a_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
@@ -41,8 +40,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -63,8 +62,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vaddc/gen/minmax-avx2-mul32-ld64-x16.c b/src/qu8-vaddc/gen/minmax-avx2-mul32-ld64-x16.c
index b0ce301..a9d0ef7 100644
--- a/src/qu8-vaddc/gen/minmax-avx2-mul32-ld64-x16.c
+++ b/src/qu8-vaddc/gen/minmax-avx2-mul32-ld64-x16.c
@@ -23,7 +23,6 @@
const union xnn_qu8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
- const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx2.shift);
const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
@@ -40,8 +39,8 @@
__m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
__m256i vacc89ABCDEF = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va89ABCDEF, va_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
- vacc89ABCDEF = _mm256_sra_epi32(_mm256_add_epi32(vacc89ABCDEF, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
+ vacc89ABCDEF = _mm256_sra_epi32(vacc89ABCDEF, vshift);
__m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(vacc01234567, vacc89ABCDEF), voutput_zero_point);
@@ -61,7 +60,7 @@
__m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), _mm256_castsi256_si128(voutput_zero_point));
__m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
diff --git a/src/qu8-vaddc/gen/minmax-avx2-mul32-ld64-x8.c b/src/qu8-vaddc/gen/minmax-avx2-mul32-ld64-x8.c
index 2af5f7b..0e64fee 100644
--- a/src/qu8-vaddc/gen/minmax-avx2-mul32-ld64-x8.c
+++ b/src/qu8-vaddc/gen/minmax-avx2-mul32-ld64-x8.c
@@ -23,7 +23,6 @@
const union xnn_qu8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m256i va_multiplier = _mm256_load_si256((const __m256i*) params->avx2.a_multiplier);
- const __m256i vrounding = _mm256_load_si256((const __m256i*) params->avx2.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx2.output_min);
@@ -38,7 +37,7 @@
__m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
@@ -57,7 +56,7 @@
__m256i vacc01234567 = _mm256_add_epi32(vbias, _mm256_mullo_epi32(va01234567, va_multiplier));
- vacc01234567 = _mm256_sra_epi32(_mm256_add_epi32(vacc01234567, vrounding), vshift);
+ vacc01234567 = _mm256_sra_epi32(vacc01234567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extracti128_si256(vacc01234567, 1)), voutput_zero_point);
__m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
diff --git a/src/qu8-vaddc/gen/minmax-avx512skx-mul32-ld128-x16.c b/src/qu8-vaddc/gen/minmax-avx512skx-mul32-ld128-x16.c
index b444a19..d63f7b6 100644
--- a/src/qu8-vaddc/gen/minmax-avx512skx-mul32-ld128-x16.c
+++ b/src/qu8-vaddc/gen/minmax-avx512skx-mul32-ld128-x16.c
@@ -23,7 +23,6 @@
const union xnn_qu8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m512i va_multiplier = _mm512_load_si512(params->avx512.a_multiplier);
- const __m512i vrounding = _mm512_load_si512(params->avx512.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx512.shift);
const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->avx512.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx512.output_min);
@@ -38,7 +37,7 @@
__m512i vacc0123456789ABCDEF = _mm512_add_epi32(vbias, _mm512_mullo_epi32(va0123456789ABCDEF, va_multiplier));
- vacc0123456789ABCDEF = _mm512_sra_epi32(_mm512_add_epi32(vacc0123456789ABCDEF, vrounding), vshift);
+ vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
__m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), voutput_zero_point);
@@ -58,7 +57,7 @@
__m512i vacc0123456789ABCDEF = _mm512_add_epi32(vbias, _mm512_mullo_epi32(va0123456789ABCDEF, va_multiplier));
- vacc0123456789ABCDEF = _mm512_sra_epi32(_mm512_add_epi32(vacc0123456789ABCDEF, vrounding), vshift);
+ vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
__m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), voutput_zero_point);
__m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packus_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
diff --git a/src/qu8-vaddc/gen/minmax-avx512skx-mul32-ld128-x32.c b/src/qu8-vaddc/gen/minmax-avx512skx-mul32-ld128-x32.c
index 1572762..ceebdcf 100644
--- a/src/qu8-vaddc/gen/minmax-avx512skx-mul32-ld128-x32.c
+++ b/src/qu8-vaddc/gen/minmax-avx512skx-mul32-ld128-x32.c
@@ -23,7 +23,6 @@
const union xnn_qu8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m512i va_multiplier = _mm512_load_si512(params->avx512.a_multiplier);
- const __m512i vrounding = _mm512_load_si512(params->avx512.rounding);
const __m128i vshift = _mm_loadu_si32(params->avx512.shift);
const __m512i voutput_zero_point = _mm512_load_si512(params->avx512.output_zero_point);
const __m256i voutput_min = _mm256_load_si256((const __m256i*) params->avx512.output_min);
@@ -40,8 +39,8 @@
__m512i vacc0123456789ABCDEF = _mm512_add_epi32(vbias, _mm512_mullo_epi32(va0123456789ABCDEF, va_multiplier));
__m512i vaccGHIJKLMNOPQRSTUV = _mm512_add_epi32(vbias, _mm512_mullo_epi32(vaGHIJKLMNOPQRSTUV, va_multiplier));
- vacc0123456789ABCDEF = _mm512_sra_epi32(_mm512_add_epi32(vacc0123456789ABCDEF, vrounding), vshift);
- vaccGHIJKLMNOPQRSTUV = _mm512_sra_epi32(_mm512_add_epi32(vaccGHIJKLMNOPQRSTUV, vrounding), vshift);
+ vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
+ vaccGHIJKLMNOPQRSTUV = _mm512_sra_epi32(vaccGHIJKLMNOPQRSTUV, vshift);
__m512i vout0123GHIJ4567KLMN89ABOPQRCDEFSTUV = _mm512_adds_epi16(_mm512_packs_epi32(vacc0123456789ABCDEF, vaccGHIJKLMNOPQRSTUV), voutput_zero_point);
@@ -61,7 +60,7 @@
__m512i vacc0123456789ABCDEF = _mm512_add_epi32(vbias, _mm512_mullo_epi32(va0123456789ABCDEF, va_multiplier));
- vacc0123456789ABCDEF = _mm512_sra_epi32(_mm512_add_epi32(vacc0123456789ABCDEF, vrounding), vshift);
+ vacc0123456789ABCDEF = _mm512_sra_epi32(vacc0123456789ABCDEF, vshift);
__m256i vout012389AB4567CDEF = _mm256_adds_epi16(_mm256_packs_epi32(_mm512_castsi512_si256(vacc0123456789ABCDEF), _mm512_extracti32x8_epi32(vacc0123456789ABCDEF, 1)), _mm512_castsi512_si256(voutput_zero_point));
__m128i vout0123456789ABCDEF = _mm_shuffle_epi32(_mm_packus_epi16(_mm256_castsi256_si128(vout012389AB4567CDEF), _mm256_extracti128_si256(vout012389AB4567CDEF, 1)), _MM_SHUFFLE(3, 1, 2, 0));
diff --git a/src/qu8-vaddc/gen/minmax-scalar-x1.c b/src/qu8-vaddc/gen/minmax-scalar-x1.c
index 410ad17..c506999 100644
--- a/src/qu8-vaddc/gen/minmax-scalar-x1.c
+++ b/src/qu8-vaddc/gen/minmax-scalar-x1.c
@@ -22,7 +22,6 @@
{
const int32_t vbias = params->scalar.bias + (int32_t) *input_b * params->scalar.b_multiplier;
const int32_t va_multiplier = params->scalar.a_multiplier;
- const int32_t vrounding = params->scalar.rounding;
const uint32_t vshift = params->scalar.shift;
const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
@@ -32,7 +31,7 @@
const int32_t va = *input_a++;
const int32_t vacc = vbias + va * va_multiplier;
- int32_t vout = asr_s32(vacc + vrounding, vshift);
+ int32_t vout = asr_s32(vacc, vshift);
vout = math_max_s32(vout, voutput_min_less_zero_point);
vout = math_min_s32(vout, voutput_max_less_zero_point);
*output++ = (uint8_t) (vout + voutput_zero_point);
diff --git a/src/qu8-vaddc/gen/minmax-scalar-x2.c b/src/qu8-vaddc/gen/minmax-scalar-x2.c
index 2d1469c..4e48464 100644
--- a/src/qu8-vaddc/gen/minmax-scalar-x2.c
+++ b/src/qu8-vaddc/gen/minmax-scalar-x2.c
@@ -22,7 +22,6 @@
{
const int32_t vbias = params->scalar.bias + (int32_t) *input_b * params->scalar.b_multiplier;
const int32_t va_multiplier = params->scalar.a_multiplier;
- const int32_t vrounding = params->scalar.rounding;
const uint32_t vshift = params->scalar.shift;
const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
@@ -37,8 +36,8 @@
const int32_t vacc1 = vbias + va1 * va_multiplier;
input_b += 2;
- int32_t vout0 = asr_s32(vacc0 + vrounding, vshift);
- int32_t vout1 = asr_s32(vacc1 + vrounding, vshift);
+ int32_t vout0 = asr_s32(vacc0, vshift);
+ int32_t vout1 = asr_s32(vacc1, vshift);
vout0 = math_max_s32(vout0, voutput_min_less_zero_point);
vout1 = math_max_s32(vout1, voutput_min_less_zero_point);
@@ -57,7 +56,7 @@
const int32_t va = *input_a;
const int32_t vacc = vbias + va * va_multiplier;
- int32_t vout = asr_s32(vacc + vrounding, vshift);
+ int32_t vout = asr_s32(vacc, vshift);
vout = math_max_s32(vout, voutput_min_less_zero_point);
vout = math_min_s32(vout, voutput_max_less_zero_point);
*output++ = (uint8_t) (vout + voutput_zero_point);
diff --git a/src/qu8-vaddc/gen/minmax-scalar-x4.c b/src/qu8-vaddc/gen/minmax-scalar-x4.c
index f122289..af54875 100644
--- a/src/qu8-vaddc/gen/minmax-scalar-x4.c
+++ b/src/qu8-vaddc/gen/minmax-scalar-x4.c
@@ -22,7 +22,6 @@
{
const int32_t vbias = params->scalar.bias + (int32_t) *input_b * params->scalar.b_multiplier;
const int32_t va_multiplier = params->scalar.a_multiplier;
- const int32_t vrounding = params->scalar.rounding;
const uint32_t vshift = params->scalar.shift;
const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
@@ -41,10 +40,10 @@
const int32_t vacc3 = vbias + va3 * va_multiplier;
input_b += 4;
- int32_t vout0 = asr_s32(vacc0 + vrounding, vshift);
- int32_t vout1 = asr_s32(vacc1 + vrounding, vshift);
- int32_t vout2 = asr_s32(vacc2 + vrounding, vshift);
- int32_t vout3 = asr_s32(vacc3 + vrounding, vshift);
+ int32_t vout0 = asr_s32(vacc0, vshift);
+ int32_t vout1 = asr_s32(vacc1, vshift);
+ int32_t vout2 = asr_s32(vacc2, vshift);
+ int32_t vout3 = asr_s32(vacc3, vshift);
vout0 = math_max_s32(vout0, voutput_min_less_zero_point);
vout1 = math_max_s32(vout1, voutput_min_less_zero_point);
@@ -72,7 +71,7 @@
const int32_t va = *input_a++;
const int32_t vacc = vbias + va * va_multiplier;
- int32_t vout = asr_s32(vacc + vrounding, vshift);
+ int32_t vout = asr_s32(vacc, vshift);
vout = math_max_s32(vout, voutput_min_less_zero_point);
vout = math_min_s32(vout, voutput_max_less_zero_point);
*output++ = (uint8_t) (vout + voutput_zero_point);
diff --git a/src/qu8-vaddc/gen/minmax-sse2-mul16-ld64-x16.c b/src/qu8-vaddc/gen/minmax-sse2-mul16-ld64-x16.c
index 04a5fec..673c34a 100644
--- a/src/qu8-vaddc/gen/minmax-sse2-mul16-ld64-x16.c
+++ b/src/qu8-vaddc/gen/minmax-sse2-mul16-ld64-x16.c
@@ -26,7 +26,6 @@
_mm_load_si128((const __m128i*) params->sse2.bias));
const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_lo);
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
@@ -55,10 +54,10 @@
__m128i vacc89AB = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod89ABCDEFlo, vaprod89ABCDEFhi));
__m128i vaccCDEF = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod89ABCDEFlo, vaprod89ABCDEFhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -89,8 +88,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vaddc/gen/minmax-sse2-mul16-ld64-x8.c b/src/qu8-vaddc/gen/minmax-sse2-mul16-ld64-x8.c
index 887b04a..e1c5764 100644
--- a/src/qu8-vaddc/gen/minmax-sse2-mul16-ld64-x8.c
+++ b/src/qu8-vaddc/gen/minmax-sse2-mul16-ld64-x8.c
@@ -26,7 +26,6 @@
_mm_load_si128((const __m128i*) params->sse2.bias));
const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_lo);
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
@@ -48,8 +47,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -78,8 +77,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vaddc/gen/minmax-sse41-mul16-ld64-x16.c b/src/qu8-vaddc/gen/minmax-sse41-mul16-ld64-x16.c
index 54ebfd3..9f3d3e2 100644
--- a/src/qu8-vaddc/gen/minmax-sse41-mul16-ld64-x16.c
+++ b/src/qu8-vaddc/gen/minmax-sse41-mul16-ld64-x16.c
@@ -26,7 +26,6 @@
_mm_load_si128((const __m128i*) params->sse2.bias));
const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_lo);
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
@@ -52,10 +51,10 @@
__m128i vacc89AB = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod89ABCDEFlo, vaprod89ABCDEFhi));
__m128i vaccCDEF = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod89ABCDEFlo, vaprod89ABCDEFhi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -85,8 +84,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vaddc/gen/minmax-sse41-mul16-ld64-x8.c b/src/qu8-vaddc/gen/minmax-sse41-mul16-ld64-x8.c
index cd9ae9e..584319e 100644
--- a/src/qu8-vaddc/gen/minmax-sse41-mul16-ld64-x8.c
+++ b/src/qu8-vaddc/gen/minmax-sse41-mul16-ld64-x8.c
@@ -26,7 +26,6 @@
_mm_load_si128((const __m128i*) params->sse2.bias));
const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_lo);
const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
@@ -46,8 +45,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -75,8 +74,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vaddc/gen/minmax-sse41-mul32-ld32-x16.c b/src/qu8-vaddc/gen/minmax-sse41-mul32-ld32-x16.c
index 7d9adfa..31e4f81 100644
--- a/src/qu8-vaddc/gen/minmax-sse41-mul32-ld32-x16.c
+++ b/src/qu8-vaddc/gen/minmax-sse41-mul32-ld32-x16.c
@@ -23,7 +23,6 @@
const union xnn_qu8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4.a_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
@@ -45,10 +44,10 @@
__m128i vacc89AB = _mm_add_epi32(vbias, _mm_mullo_epi32(va89AB, va_multiplier));
__m128i vaccCDEF = _mm_add_epi32(vbias, _mm_mullo_epi32(vaCDEF, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -71,8 +70,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vaddc/gen/minmax-sse41-mul32-ld32-x8.c b/src/qu8-vaddc/gen/minmax-sse41-mul32-ld32-x8.c
index 353ed6a..9fca60f 100644
--- a/src/qu8-vaddc/gen/minmax-sse41-mul32-ld32-x8.c
+++ b/src/qu8-vaddc/gen/minmax-sse41-mul32-ld32-x8.c
@@ -23,7 +23,6 @@
const union xnn_qu8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4.a_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
@@ -41,8 +40,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -63,8 +62,8 @@
__m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
__m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vaddc/gen/minmax-wasmsimd-x16.c b/src/qu8-vaddc/gen/minmax-wasmsimd-x16.c
index a49b3ef..44bea6a 100644
--- a/src/qu8-vaddc/gen/minmax-wasmsimd-x16.c
+++ b/src/qu8-vaddc/gen/minmax-wasmsimd-x16.c
@@ -22,7 +22,6 @@
const union xnn_qu8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const v128_t va_multiplier = wasm_v128_load(params->wasmsimd.a_multiplier);
- const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding);
const int32_t vshift = params->wasmsimd.shift;
const v128_t voutput_zero_point = wasm_v128_load(params->wasmsimd.output_zero_point);
const v128_t voutput_min = wasm_v128_load(params->wasmsimd.output_min);
@@ -41,10 +40,10 @@
v128_t vacc89AB = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(va89ABCDEF), va_multiplier));
v128_t vaccCDEF = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(va89ABCDEF), va_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
- vacc89AB = wasm_i32x4_shr(wasm_i32x4_add(vacc89AB, vrounding), vshift);
- vaccCDEF = wasm_i32x4_shr(wasm_i32x4_add(vaccCDEF, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
+ vacc89AB = wasm_i32x4_shr(vacc89AB, vshift);
+ vaccCDEF = wasm_i32x4_shr(vaccCDEF, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
v128_t vout89ABCDEF = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -66,8 +65,8 @@
v128_t vacc0123 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(va01234567), va_multiplier));
v128_t vacc4567 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(va01234567), va_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vaddc/gen/minmax-wasmsimd-x8.c b/src/qu8-vaddc/gen/minmax-wasmsimd-x8.c
index cd7797f..cbd7c62 100644
--- a/src/qu8-vaddc/gen/minmax-wasmsimd-x8.c
+++ b/src/qu8-vaddc/gen/minmax-wasmsimd-x8.c
@@ -22,7 +22,6 @@
const union xnn_qu8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const v128_t va_multiplier = wasm_v128_load(params->wasmsimd.a_multiplier);
- const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding);
const int32_t vshift = params->wasmsimd.shift;
const v128_t voutput_zero_point = wasm_v128_load(params->wasmsimd.output_zero_point);
const v128_t voutput_min = wasm_v128_load(params->wasmsimd.output_min);
@@ -38,8 +37,8 @@
v128_t vacc0123 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(va01234567), va_multiplier));
v128_t vacc4567 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(va01234567), va_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
@@ -59,8 +58,8 @@
v128_t vacc0123 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(va01234567), va_multiplier));
v128_t vacc4567 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(va01234567), va_multiplier));
- vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
- vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
+ vacc0123 = wasm_i32x4_shr(vacc0123, vshift);
+ vacc4567 = wasm_i32x4_shr(vacc4567, vshift);
v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vaddc/gen/minmax-xop-mul32-ld32-x16.c b/src/qu8-vaddc/gen/minmax-xop-mul32-ld32-x16.c
index e5b23f2..89bca3d 100644
--- a/src/qu8-vaddc/gen/minmax-xop-mul32-ld32-x16.c
+++ b/src/qu8-vaddc/gen/minmax-xop-mul32-ld32-x16.c
@@ -28,7 +28,6 @@
const union xnn_qu8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4.a_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
@@ -50,10 +49,10 @@
__m128i vacc89AB = _mm_macc_epi32(va89AB, va_multiplier, vbias);
__m128i vaccCDEF = _mm_macc_epi32(vaCDEF, va_multiplier, vbias);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
- vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
- vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
+ vacc89AB = _mm_sra_epi32(vacc89AB, vshift);
+ vaccCDEF = _mm_sra_epi32(vaccCDEF, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
const __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
@@ -76,8 +75,8 @@
__m128i vacc0123 = _mm_macc_epi32(va0123, va_multiplier, vbias);
__m128i vacc4567 = _mm_macc_epi32(va4567, va_multiplier, vbias);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/qu8-vaddc/gen/minmax-xop-mul32-ld32-x8.c b/src/qu8-vaddc/gen/minmax-xop-mul32-ld32-x8.c
index a8477b7..c03c0e7 100644
--- a/src/qu8-vaddc/gen/minmax-xop-mul32-ld32-x8.c
+++ b/src/qu8-vaddc/gen/minmax-xop-mul32-ld32-x8.c
@@ -28,7 +28,6 @@
const union xnn_qu8_addsub_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4.a_multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse4.rounding);
const __m128i vshift = _mm_loadu_si32(params->sse4.shift);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
@@ -46,8 +45,8 @@
__m128i vacc0123 = _mm_macc_epi32(va0123, va_multiplier, vbias);
__m128i vacc4567 = _mm_macc_epi32(va4567, va_multiplier, vbias);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
@@ -68,8 +67,8 @@
__m128i vacc0123 = _mm_macc_epi32(va0123, va_multiplier, vbias);
__m128i vacc4567 = _mm_macc_epi32(va4567, va_multiplier, vbias);
- vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
- vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+ vacc0123 = _mm_sra_epi32(vacc0123, vshift);
+ vacc4567 = _mm_sra_epi32(vacc4567, vshift);
const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index 5b78d30..f003251 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -624,7 +624,6 @@
XNN_ALIGN(16) uint16_t a_multiplier_hi[8];
XNN_ALIGN(16) uint16_t b_multiplier_lo[8];
XNN_ALIGN(16) uint16_t b_multiplier_hi[8];
- XNN_ALIGN(16) int32_t rounding[4];
uint32_t shift;
uint32_t b_multiplier;
XNN_ALIGN(16) int16_t output_zero_point[8];
@@ -635,7 +634,6 @@
XNN_ALIGN(16) int32_t bias[4];
XNN_ALIGN(16) int32_t a_multiplier[4];
XNN_ALIGN(16) int32_t b_multiplier[4];
- XNN_ALIGN(16) int32_t rounding[4];
XNN_ALIGN(16) uint32_t shift[4];
XNN_ALIGN(16) int16_t output_zero_point[8];
XNN_ALIGN(16) uint8_t output_min[16];
@@ -645,7 +643,6 @@
XNN_ALIGN(32) int32_t bias[8];
XNN_ALIGN(32) int32_t a_multiplier[8];
XNN_ALIGN(32) int32_t b_multiplier[8];
- XNN_ALIGN(32) int32_t rounding[8];
XNN_ALIGN(32) uint32_t shift[8];
XNN_ALIGN(32) int16_t output_zero_point[16];
XNN_ALIGN(16) uint8_t output_min[16];
@@ -655,7 +652,6 @@
XNN_ALIGN(64) int32_t bias[16];
XNN_ALIGN(64) int32_t a_multiplier[16];
XNN_ALIGN(64) int32_t b_multiplier[16];
- XNN_ALIGN(64) int32_t rounding[16];
XNN_ALIGN(64) uint32_t shift[16];
XNN_ALIGN(64) int16_t output_zero_point[32];
XNN_ALIGN(32) uint8_t output_min[32];
@@ -667,7 +663,6 @@
XNN_ALIGN(16) int32_t bias[4];
XNN_ALIGN(16) int32_t a_multiplier[4];
XNN_ALIGN(16) int32_t b_multiplier[4];
- XNN_ALIGN(16) int32_t rounding[4];
int32_t shift;
XNN_ALIGN(16) int16_t output_zero_point[8];
XNN_ALIGN(16) uint8_t output_min[16];
@@ -681,7 +676,6 @@
int32_t bias;
int32_t a_multiplier;
int32_t b_multiplier;
- int32_t rounding;
uint32_t shift;
int32_t output_min_less_zero_point;
int32_t output_max_less_zero_point;
@@ -706,7 +700,6 @@
XNN_ALIGN(16) uint16_t a_multiplier_hi[8];
XNN_ALIGN(16) uint16_t b_multiplier_lo[8];
XNN_ALIGN(16) uint16_t b_multiplier_hi[8];
- XNN_ALIGN(16) int32_t rounding[4];
uint32_t shift;
uint32_t b_multiplier;
XNN_ALIGN(16) int16_t output_zero_point[8];
@@ -719,7 +712,6 @@
XNN_ALIGN(16) uint16_t a_multiplier_hi[8];
XNN_ALIGN(16) uint16_t b_multiplier_lo[8];
XNN_ALIGN(16) uint16_t b_multiplier_hi[8];
- XNN_ALIGN(16) int32_t rounding[4];
uint32_t shift;
uint32_t b_multiplier;
XNN_ALIGN(16) int16_t output_zero_point[8];
@@ -730,7 +722,6 @@
XNN_ALIGN(16) int32_t bias[4];
XNN_ALIGN(16) int32_t a_multiplier[4];
XNN_ALIGN(16) int32_t b_multiplier[4];
- XNN_ALIGN(16) int32_t rounding[4];
XNN_ALIGN(16) uint32_t shift[4];
XNN_ALIGN(16) int16_t output_zero_point[8];
XNN_ALIGN(16) int8_t output_min[16];
@@ -740,7 +731,6 @@
XNN_ALIGN(32) int32_t bias[8];
XNN_ALIGN(32) int32_t a_multiplier[8];
XNN_ALIGN(32) int32_t b_multiplier[8];
- XNN_ALIGN(32) int32_t rounding[8];
XNN_ALIGN(32) uint32_t shift[8];
XNN_ALIGN(32) int16_t output_zero_point[16];
XNN_ALIGN(16) int8_t output_min[16];
@@ -750,7 +740,6 @@
XNN_ALIGN(64) int32_t bias[16];
XNN_ALIGN(64) int32_t a_multiplier[16];
XNN_ALIGN(64) int32_t b_multiplier[16];
- XNN_ALIGN(64) int32_t rounding[16];
XNN_ALIGN(64) uint32_t shift[16];
XNN_ALIGN(64) int16_t output_zero_point[32];
XNN_ALIGN(32) int8_t output_min[32];
@@ -762,7 +751,6 @@
XNN_ALIGN(16) int32_t bias[4];
XNN_ALIGN(16) int32_t a_multiplier[4];
XNN_ALIGN(16) int32_t b_multiplier[4];
- XNN_ALIGN(16) int32_t rounding[4];
int32_t shift;
XNN_ALIGN(16) int16_t output_zero_point[8];
XNN_ALIGN(16) int8_t output_min[16];
diff --git a/src/xnnpack/requantization.h b/src/xnnpack/requantization.h
index 9f776e4..73a5cf6 100644
--- a/src/xnnpack/requantization.h
+++ b/src/xnnpack/requantization.h
@@ -456,7 +456,7 @@
int32_t acc = params.scalar.bias + (int32_t) (uint32_t) a * params.scalar.a_multiplier + (int32_t) (uint32_t) b * params.scalar.b_multiplier;
// Shift right with rounding away from zero.
- acc = asr_s32(acc + params.scalar.rounding, params.scalar.shift);
+ acc = asr_s32(acc, params.scalar.shift);
// Clamp and add output zero point.
acc = math_max_s32(acc, params.scalar.output_min_less_zero_point);
@@ -472,7 +472,7 @@
int32_t acc = params.scalar.bias + (int32_t) a * params.scalar.a_multiplier + (int32_t) b * params.scalar.b_multiplier;
// Shift right with rounding away from zero.
- acc = asr_s32(acc + params.scalar.rounding, params.scalar.shift);
+ acc = asr_s32(acc, params.scalar.shift);
// Clamp and add output zero point.
acc = math_max_s32(acc, params.scalar.output_min_less_zero_point);