Quantized GEMM/IGEMM microkernels bump kc to be a multiple of channels.
Rewind A pointers by KC.
Remove last partial channel of remainder code. It's now handled by main loop.
PiperOrigin-RevId: 360231001
diff --git a/src/qs8-igemm/gen/4x4c2-minmax-xop-ld128.c b/src/qs8-igemm/gen/4x4c2-minmax-xop-ld128.c
index b1b6880..e6faba2 100644
--- a/src/qs8-igemm/gen/4x4c2-minmax-xop-ld128.c
+++ b/src/qs8-igemm/gen/4x4c2-minmax-xop-ld128.c
@@ -17,6 +17,7 @@
#endif
#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
void xnn_qs8_igemm_minmax_ukernel_4x4c2__xop_ld128(
@@ -44,6 +45,7 @@
assert(w != NULL);
assert(c != NULL);
+ kc = round_up_po2(kc, 2);
int8_t* c0 = c;
int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
if XNN_UNPREDICTABLE(mr < 2) {
@@ -202,21 +204,6 @@
_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
vacc3x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc3x0123);
-
- if (k > 6 * sizeof(int8_t)) {
- const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
- const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
-
- vacc0x0123 = _mm_maddd_epi16(
- _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
- vacc1x0123 = _mm_maddd_epi16(
- _mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc1x0123);
- vacc2x0123 = _mm_maddd_epi16(
- _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc2x0123);
- vacc3x0123 = _mm_maddd_epi16(
- _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);
- }
}
}
}