Quantized GEMM/IGEMM microkernels bump kc to be a multiple of channels.

Rewind A pointers by KC.
Remove the last partial channel from the remainder code.  It's now handled by the main loop.

PiperOrigin-RevId: 360231001
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld128.c
index ba3e4f1..e14c5d1 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld128.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld128(
@@ -35,6 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   const int8_t* a0 = a;
   int8_t* c0 = c;
 
@@ -119,10 +121,10 @@
     if (nc >= 4) {
       *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
 
-      a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
 
+      a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
       nc -= 4;
     } else {
       if (nc & 2) {