Quantized GEMM/IGEMM microkernels: round kc up to a multiple of channels.
Rewind A pointers by KC.
Remove last partial channel of remainder code. It's now handled by the main loop.
PiperOrigin-RevId: 360231001
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld128.c
index ba3e4f1..e14c5d1 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-wasmsimd-ld128.c
@@ -12,6 +12,7 @@
#include <wasm_simd128.h>
#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
void xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld128(
@@ -35,6 +36,7 @@
assert(w != NULL);
assert(c != NULL);
+ kc = round_up_po2(kc, 8);
const int8_t* a0 = a;
int8_t* c0 = c;
@@ -119,10 +121,10 @@
if (nc >= 4) {
*((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
- a0 = (const int8_t*) ((uintptr_t) a0 - k);
-
c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+ a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
nc -= 4;
} else {
if (nc & 2) {