Quantized GEMM/IGEMM microkernels round kc up to a multiple of the channel count (e.g. 8 for c8 kernels).
Rewind A pointers by KC.
Remove the last partial channel from the remainder code. It's now handled by the main loop.
PiperOrigin-RevId: 360231001
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld64.c
index 15bc8bb..faadba9 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-wasmsimd-ld64.c
@@ -12,6 +12,7 @@
#include <wasm_simd128.h>
#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
void xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld64(
@@ -39,6 +40,7 @@
assert(w != NULL);
assert(c != NULL);
+ kc = round_up_po2(kc, 8);
int8_t* c0 = c;
const v128_t vzero = wasm_f64x2_splat(0.0);