Minor optimizations in NEON QS8 GEMM/IGEMM microkernels

PiperOrigin-RevId: 381558969
diff --git a/src/qs8-gemm/gen/2x8c2-minmax-gemmlowp-neon-mull-padal-dup.c b/src/qs8-gemm/gen/2x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
index 9741b73..a290314 100644
--- a/src/qs8-gemm/gen/2x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
+++ b/src/qs8-gemm/gen/2x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
@@ -36,7 +36,7 @@
   assert(w != NULL);
   assert(c != NULL);
 
-  kc = round_up_po2(kc, 2);
+  kc = round_up_po2(kc, 2 * sizeof(int8_t));
   const int8_t* a0 = a;
   int8_t* c0 = c;
   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);