Refactor pre-SSE4 versions of QS8/QC8 GEMM/IGEMM microkernels

- Replace sign-extension from 8 to 16 bits with a more efficient sequence
- Replace casts to uintptr_t to casts to typed pointer types where possible

PiperOrigin-RevId: 382654408
diff --git a/src/qc8-dwconv/gen/up24x9-minmax-fp32-avx-mul32.c b/src/qc8-dwconv/gen/up24x9-minmax-fp32-avx-mul32.c
index d3af8e9..ede239d 100644
--- a/src/qc8-dwconv/gen/up24x9-minmax-fp32-avx-mul32.c
+++ b/src/qc8-dwconv/gen/up24x9-minmax-fp32-avx-mul32.c
@@ -82,11 +82,11 @@
     const void* w = weights;
     for (; c >= 24; c -= 24) {
       __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
-      __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 4 * sizeof(int32_t)));
-      __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t)));
-      __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 12 * sizeof(int32_t)));
-      __m128i vaccGHIJ = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t)));
-      __m128i vaccKLMN = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 20 * sizeof(int32_t)));
+      __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
+      __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 8));
+      __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 12));
+      __m128i vaccGHIJ = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 16));
+      __m128i vaccKLMN = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 20));
 
 
       const __m128i vi0x0123 = _mm_cvtepi8_epi32(_mm_loadu_si32(i0));
@@ -288,12 +288,12 @@
       __m128 vscaledKLMN = _mm_cvtepi32_ps(vaccKLMN);
 
       const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
-      const __m128 vscale4567 = _mm_loadu_ps((const float*) ((uintptr_t) w + 4 * sizeof(float)));
-      const __m128 vscale89AB = _mm_loadu_ps((const float*) ((uintptr_t) w + 8 * sizeof(float)));
-      const __m128 vscaleCDEF = _mm_loadu_ps((const float*) ((uintptr_t) w + 12 * sizeof(float)));
-      const __m128 vscaleGHIJ = _mm_loadu_ps((const float*) ((uintptr_t) w + 16 * sizeof(float)));
-      const __m128 vscaleKLMN = _mm_loadu_ps((const float*) ((uintptr_t) w + 20 * sizeof(float)));
-      w = (const void*) ((uintptr_t) w + 24 * sizeof(float));
+      const __m128 vscale4567 = _mm_loadu_ps((const float*) w + 4);
+      const __m128 vscale89AB = _mm_loadu_ps((const float*) w + 8);
+      const __m128 vscaleCDEF = _mm_loadu_ps((const float*) w + 12);
+      const __m128 vscaleGHIJ = _mm_loadu_ps((const float*) w + 16);
+      const __m128 vscaleKLMN = _mm_loadu_ps((const float*) w + 20);
+      w = (const void*) ((const float*) w + 24);
       vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
       vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
       vscaled89AB = _mm_mul_ps(vscaled89AB, vscale89AB);
@@ -327,7 +327,7 @@
       output += 24;
     }
     if XNN_UNLIKELY(c != 0) {
-      const int8_t* k = (const int8_t*) ((uintptr_t) w + 24 * sizeof(int32_t));
+      const int8_t* k = (const int8_t*) ((const int32_t*) w + 24);
       do {
         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
 
@@ -393,7 +393,7 @@
         vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
         vacc0123 = _mm_cvtps_epi32(vscaled0123);
 
-        w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+        w = (const void*) ((const int32_t*) w + 4);
 
         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
         __m128i vout0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc0123), voutput_zero_point);