Refactor pre-SSE4 versions of QS8/QC8 GEMM/IGEMM microkernels
- Replace sign-extension from 8 to 16 bits with a more efficient sequence
- Replace casts to uintptr_t with casts to typed pointer types where possible
PiperOrigin-RevId: 382654408
diff --git a/src/qc8-dwconv/gen/up24x9-minmax-fp32-avx-mul32.c b/src/qc8-dwconv/gen/up24x9-minmax-fp32-avx-mul32.c
index d3af8e9..ede239d 100644
--- a/src/qc8-dwconv/gen/up24x9-minmax-fp32-avx-mul32.c
+++ b/src/qc8-dwconv/gen/up24x9-minmax-fp32-avx-mul32.c
@@ -82,11 +82,11 @@
const void* w = weights;
for (; c >= 24; c -= 24) {
__m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
- __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 4 * sizeof(int32_t)));
- __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t)));
- __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 12 * sizeof(int32_t)));
- __m128i vaccGHIJ = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t)));
- __m128i vaccKLMN = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 20 * sizeof(int32_t)));
+ __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
+ __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 8));
+ __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 12));
+ __m128i vaccGHIJ = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 16));
+ __m128i vaccKLMN = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 20));
const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
@@ -288,12 +288,12 @@
__m128 vscaledKLMN = _mm_cvtepi32_ps(vaccKLMN);
const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
- const __m128 vscale4567 = _mm_loadu_ps((const float*) ((uintptr_t) w + 4 * sizeof(float)));
- const __m128 vscale89AB = _mm_loadu_ps((const float*) ((uintptr_t) w + 8 * sizeof(float)));
- const __m128 vscaleCDEF = _mm_loadu_ps((const float*) ((uintptr_t) w + 12 * sizeof(float)));
- const __m128 vscaleGHIJ = _mm_loadu_ps((const float*) ((uintptr_t) w + 16 * sizeof(float)));
- const __m128 vscaleKLMN = _mm_loadu_ps((const float*) ((uintptr_t) w + 20 * sizeof(float)));
- w = (const void*) ((uintptr_t) w + 24 * sizeof(float));
+ const __m128 vscale4567 = _mm_loadu_ps((const float*) w + 4);
+ const __m128 vscale89AB = _mm_loadu_ps((const float*) w + 8);
+ const __m128 vscaleCDEF = _mm_loadu_ps((const float*) w + 12);
+ const __m128 vscaleGHIJ = _mm_loadu_ps((const float*) w + 16);
+ const __m128 vscaleKLMN = _mm_loadu_ps((const float*) w + 20);
+ w = (const void*) ((const float*) w + 24);
vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
vscaled89AB = _mm_mul_ps(vscaled89AB, vscale89AB);
@@ -327,7 +327,7 @@
output += 24;
}
if XNN_UNLIKELY(c != 0) {
- const int8_t* k = (const int8_t*) ((uintptr_t) w + 24 * sizeof(int32_t));
+ const int8_t* k = (const int8_t*) ((const int32_t*) w + 24);
do {
__m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
@@ -393,7 +393,7 @@
vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
vacc0123 = _mm_cvtps_epi32(vscaled0123);
- w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ w = (const void*) ((const int32_t*) w + 4);
const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
__m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);