Quantized GEMM/IGEMM microkernels bump kc to be a multiple of channels.

Rewind A pointers by KC.
Remove last partial channel of remainder code.  Its now handled by main loop.

PiperOrigin-RevId: 360231001
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld64.c
index 15bc8bb..faadba9 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld64.c
@@ -12,6 +12,7 @@
 #include <wasm_simd128.h>
 
 #include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
 
 
 void xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld64(
@@ -39,6 +40,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
+  kc = round_up_po2(kc, 8);
   int8_t* c0 = c;
 
   const v128_t vzero = wasm_f64x2_splat(0.0);