QS8 NEON dot-product intrinsics GEMM and IGEMM microkernels: reduce remainder code.

Remove the remainder code for k in the 5 to 7 range and let the main loop handle it.
KC is rounded up to a multiple of 4.  Rewind the A pointers by kc.

PiperOrigin-RevId: 360495618
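
For reference, a minimal sketch (illustrative only, not the generated microkernel) of what the simplified remainder path now looks like for one row and one 4-wide accumulator. The names a0, w, k, and acc0 are hypothetical; it assumes the Arm dot-product extension (__ARM_FEATURE_DOTPROD) and the packing/padding that makes the 8-byte activation load safe when only 4 values remain:

  #include <arm_neon.h>
  #include <stddef.h>
  #include <stdint.h>

  static inline int32x4_t qs8_remainder_sketch(
      const int8_t* a0,    // activation row, already advanced by the main loop
      const int8_t* w,     // packed weights, one 4x4 block
      size_t k,            // bytes of k left: 0 or 4 once kc is padded to a multiple of 4
      int32x4_t acc0)      // running int32 accumulator
  {
    if (k != 0) {
      // Load up to 4 trailing activations; the low 4 lanes are the tail.
      const int8x8_t va0 = vld1_s8(a0);
      // Load a single 4x4 block of weights.
      const int8x16_t vb0123 = vld1q_s8(w);
      // One dot product on lane 0 is enough; the old `if (k > 4)` branch is gone.
      acc0 = vdotq_lane_s32(acc0, vb0123, va0, 0);
    }
    return acc0;
  }

Because kc is padded to a multiple of 4, the remainder is never 5-7, so the second weight block and the lane-1 dot products are no longer needed; the caller rewinds the A pointers by kc afterwards.
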
diff --git a/src/qs8-gemm/MRxNRc4-neondot.c.in b/src/qs8-gemm/MRxNRc4-neondot.c.in
index 49b43d2..e9416c3 100644
--- a/src/qs8-gemm/MRxNRc4-neondot.c.in
+++ b/src/qs8-gemm/MRxNRc4-neondot.c.in
@@ -87,11 +87,11 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 6 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a ${MR}x4 block of activations.
       $for M in range(MR):
-        const int8x8_t va${M}x01234567 = vld1_s8(a${M}); a${M} += k;
+        const int8x8_t va${M}x01234567 = vld1_s8(a${M}); a${M} += 4;
 
       // Load a 4x${NR} block of weights.
       $for N in range(0, NR, 4):
@@ -101,17 +101,6 @@
       $for M in range(MR):
         $for N in range(0, NR, 4):
             vacc${M}x${ABC[N:N+4]} = vdotq_lane_s32(vacc${M}x${ABC[N:N+4]}, vb0123x${ABC[N:N+4]}, va${M}x01234567, 0);
-
-      if (k > 4) {
-        // Load a 4x${NR} block of weights.
-        $for N in range(0, NR, 4):
-          const int8x16_t vb4567x${ABC[N:N+4]} = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
-        // Multiply-accumulate: ${MR}x4 * 4x${NR} --> ${MR}x${NR}.
-        $for M in range(MR):
-          $for N in range(0, NR, 4):
-              vacc${M}x${ABC[N:N+4]} = vdotq_lane_s32(vacc${M}x${ABC[N:N+4]}, vb4567x${ABC[N:N+4]}, va${M}x01234567, 1);
-      }
     }
 
     // Post-accumulation work
diff --git a/src/qs8-gemm/gen/12x8c4-minmax-neondot.c b/src/qs8-gemm/gen/12x8c4-minmax-neondot.c
index c1665fd..71829b0 100644
--- a/src/qs8-gemm/gen/12x8c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/12x8c4-minmax-neondot.c
@@ -210,21 +210,21 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 6 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 12x4 block of activations.
-      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
-      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += k;
-      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += k;
-      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += k;
-      const int8x8_t va4x01234567 = vld1_s8(a4); a4 += k;
-      const int8x8_t va5x01234567 = vld1_s8(a5); a5 += k;
-      const int8x8_t va6x01234567 = vld1_s8(a6); a6 += k;
-      const int8x8_t va7x01234567 = vld1_s8(a7); a7 += k;
-      const int8x8_t va8x01234567 = vld1_s8(a8); a8 += k;
-      const int8x8_t va9x01234567 = vld1_s8(a9); a9 += k;
-      const int8x8_t va10x01234567 = vld1_s8(a10); a10 += k;
-      const int8x8_t va11x01234567 = vld1_s8(a11); a11 += k;
+      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
+      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;
+      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4;
+      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 4;
+      const int8x8_t va4x01234567 = vld1_s8(a4); a4 += 4;
+      const int8x8_t va5x01234567 = vld1_s8(a5); a5 += 4;
+      const int8x8_t va6x01234567 = vld1_s8(a6); a6 += 4;
+      const int8x8_t va7x01234567 = vld1_s8(a7); a7 += 4;
+      const int8x8_t va8x01234567 = vld1_s8(a8); a8 += 4;
+      const int8x8_t va9x01234567 = vld1_s8(a9); a9 += 4;
+      const int8x8_t va10x01234567 = vld1_s8(a10); a10 += 4;
+      const int8x8_t va11x01234567 = vld1_s8(a11); a11 += 4;
 
       // Load a 4x8 block of weights.
       const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
@@ -255,38 +255,6 @@
       vacc10x4567 = vdotq_lane_s32(vacc10x4567, vb0123x4567, va10x01234567, 0);
       vacc11x0123 = vdotq_lane_s32(vacc11x0123, vb0123x0123, va11x01234567, 0);
       vacc11x4567 = vdotq_lane_s32(vacc11x4567, vb0123x4567, va11x01234567, 0);
-
-      if (k > 4) {
-        // Load a 4x8 block of weights.
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
-        // Multiply-accumulate: 12x4 * 4x8 --> 12x8.
-        vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-        vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-        vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-        vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-        vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-        vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-        vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-        vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-        vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
-        vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
-        vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
-        vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
-        vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb4567x0123, va6x01234567, 1);
-        vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1);
-        vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb4567x0123, va7x01234567, 1);
-        vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb4567x4567, va7x01234567, 1);
-        vacc8x0123 = vdotq_lane_s32(vacc8x0123, vb4567x0123, va8x01234567, 1);
-        vacc8x4567 = vdotq_lane_s32(vacc8x4567, vb4567x4567, va8x01234567, 1);
-        vacc9x0123 = vdotq_lane_s32(vacc9x0123, vb4567x0123, va9x01234567, 1);
-        vacc9x4567 = vdotq_lane_s32(vacc9x4567, vb4567x4567, va9x01234567, 1);
-        vacc10x0123 = vdotq_lane_s32(vacc10x0123, vb4567x0123, va10x01234567, 1);
-        vacc10x4567 = vdotq_lane_s32(vacc10x4567, vb4567x4567, va10x01234567, 1);
-        vacc11x0123 = vdotq_lane_s32(vacc11x0123, vb4567x0123, va11x01234567, 1);
-        vacc11x4567 = vdotq_lane_s32(vacc11x4567, vb4567x4567, va11x01234567, 1);
-      }
     }
 
     // Post-accumulation work
diff --git a/src/qs8-gemm/gen/1x16c4-minmax-neondot.c b/src/qs8-gemm/gen/1x16c4-minmax-neondot.c
index 964525e..7645374 100644
--- a/src/qs8-gemm/gen/1x16c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/1x16c4-minmax-neondot.c
@@ -77,10 +77,10 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 6 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 1x4 block of activations.
-      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
+      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
 
       // Load a 4x16 block of weights.
       const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
@@ -93,20 +93,6 @@
       vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb0123x4567, va0x01234567, 0);
       vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb0123x89AB, va0x01234567, 0);
       vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb0123xCDEF, va0x01234567, 0);
-
-      if (k > 4) {
-        // Load a 4x16 block of weights.
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-        const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-        const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
-        // Multiply-accumulate: 1x4 * 4x16 --> 1x16.
-        vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-        vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-        vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
-        vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
-      }
     }
 
     // Post-accumulation work
diff --git a/src/qs8-gemm/gen/1x8c4-minmax-neondot.c b/src/qs8-gemm/gen/1x8c4-minmax-neondot.c
index 8242be7..c6925c1 100644
--- a/src/qs8-gemm/gen/1x8c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/1x8c4-minmax-neondot.c
@@ -67,10 +67,10 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 6 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 1x4 block of activations.
-      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
+      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
 
       // Load a 4x8 block of weights.
       const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
@@ -79,16 +79,6 @@
       // Multiply-accumulate: 1x4 * 4x8 --> 1x8.
       vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
       vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb0123x4567, va0x01234567, 0);
-
-      if (k > 4) {
-        // Load a 4x8 block of weights.
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
-        // Multiply-accumulate: 1x4 * 4x8 --> 1x8.
-        vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-        vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-      }
     }
 
     // Post-accumulation work
diff --git a/src/qs8-gemm/gen/4x16c4-minmax-neondot.c b/src/qs8-gemm/gen/4x16c4-minmax-neondot.c
index 719db22..3c93b58 100644
--- a/src/qs8-gemm/gen/4x16c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/4x16c4-minmax-neondot.c
@@ -134,13 +134,13 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 6 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 4x4 block of activations.
-      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
-      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += k;
-      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += k;
-      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += k;
+      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
+      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;
+      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4;
+      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 4;
 
       // Load a 4x16 block of weights.
       const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
@@ -165,32 +165,6 @@
       vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb0123x4567, va3x01234567, 0);
       vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0);
       vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0);
-
-      if (k > 4) {
-        // Load a 4x16 block of weights.
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-        const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-        const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
-        // Multiply-accumulate: 4x4 * 4x16 --> 4x16.
-        vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-        vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-        vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
-        vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
-        vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-        vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-        vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
-        vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
-        vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-        vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-        vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
-        vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1);
-        vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-        vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-        vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
-        vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1);
-      }
     }
 
     // Post-accumulation work
diff --git a/src/qs8-gemm/gen/4x8c4-minmax-neondot.c b/src/qs8-gemm/gen/4x8c4-minmax-neondot.c
index d32b05e..f733070 100644
--- a/src/qs8-gemm/gen/4x8c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/4x8c4-minmax-neondot.c
@@ -106,13 +106,13 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 6 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 4x4 block of activations.
-      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
-      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += k;
-      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += k;
-      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += k;
+      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
+      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;
+      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4;
+      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 4;
 
       // Load a 4x8 block of weights.
       const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
@@ -127,22 +127,6 @@
       vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0);
       vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb0123x0123, va3x01234567, 0);
       vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb0123x4567, va3x01234567, 0);
-
-      if (k > 4) {
-        // Load a 4x8 block of weights.
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
-        // Multiply-accumulate: 4x4 * 4x8 --> 4x8.
-        vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-        vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-        vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-        vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-        vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-        vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-        vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-        vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-      }
     }
 
     // Post-accumulation work
diff --git a/src/qs8-gemm/gen/6x16c4-minmax-neondot.c b/src/qs8-gemm/gen/6x16c4-minmax-neondot.c
index 3a9277e..0086b49 100644
--- a/src/qs8-gemm/gen/6x16c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/6x16c4-minmax-neondot.c
@@ -172,15 +172,15 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 6 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 6x4 block of activations.
-      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
-      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += k;
-      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += k;
-      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += k;
-      const int8x8_t va4x01234567 = vld1_s8(a4); a4 += k;
-      const int8x8_t va5x01234567 = vld1_s8(a5); a5 += k;
+      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
+      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;
+      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4;
+      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 4;
+      const int8x8_t va4x01234567 = vld1_s8(a4); a4 += 4;
+      const int8x8_t va5x01234567 = vld1_s8(a5); a5 += 4;
 
       // Load a 4x16 block of weights.
       const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
@@ -213,40 +213,6 @@
       vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb0123x4567, va5x01234567, 0);
       vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0);
       vacc5xCDEF = vdotq_lane_s32(vacc5xCDEF, vb0123xCDEF, va5x01234567, 0);
-
-      if (k > 4) {
-        // Load a 4x16 block of weights.
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-        const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-        const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
-        // Multiply-accumulate: 6x4 * 4x16 --> 6x16.
-        vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-        vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-        vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
-        vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
-        vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-        vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-        vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
-        vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
-        vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-        vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-        vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
-        vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1);
-        vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-        vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-        vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
-        vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1);
-        vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
-        vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
-        vacc4x89AB = vdotq_lane_s32(vacc4x89AB, vb4567x89AB, va4x01234567, 1);
-        vacc4xCDEF = vdotq_lane_s32(vacc4xCDEF, vb4567xCDEF, va4x01234567, 1);
-        vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
-        vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
-        vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb4567x89AB, va5x01234567, 1);
-        vacc5xCDEF = vdotq_lane_s32(vacc5xCDEF, vb4567xCDEF, va5x01234567, 1);
-      }
     }
 
     // Post-accumulation work
diff --git a/src/qs8-gemm/gen/6x8c4-minmax-neondot.c b/src/qs8-gemm/gen/6x8c4-minmax-neondot.c
index b1fc09f..889b948 100644
--- a/src/qs8-gemm/gen/6x8c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/6x8c4-minmax-neondot.c
@@ -132,15 +132,15 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 6 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 6x4 block of activations.
-      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
-      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += k;
-      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += k;
-      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += k;
-      const int8x8_t va4x01234567 = vld1_s8(a4); a4 += k;
-      const int8x8_t va5x01234567 = vld1_s8(a5); a5 += k;
+      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
+      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;
+      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4;
+      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 4;
+      const int8x8_t va4x01234567 = vld1_s8(a4); a4 += 4;
+      const int8x8_t va5x01234567 = vld1_s8(a5); a5 += 4;
 
       // Load a 4x8 block of weights.
       const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
@@ -159,26 +159,6 @@
       vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb0123x4567, va4x01234567, 0);
       vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb0123x0123, va5x01234567, 0);
       vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb0123x4567, va5x01234567, 0);
-
-      if (k > 4) {
-        // Load a 4x8 block of weights.
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
-        // Multiply-accumulate: 6x4 * 4x8 --> 6x8.
-        vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-        vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-        vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-        vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-        vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-        vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-        vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-        vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-        vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
-        vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
-        vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
-        vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
-      }
     }
 
     // Post-accumulation work
diff --git a/src/qs8-gemm/gen/8x16c4-minmax-neondot.c b/src/qs8-gemm/gen/8x16c4-minmax-neondot.c
index 93dab4c..3b2301b 100644
--- a/src/qs8-gemm/gen/8x16c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/8x16c4-minmax-neondot.c
@@ -210,17 +210,17 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 6 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 8x4 block of activations.
-      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
-      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += k;
-      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += k;
-      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += k;
-      const int8x8_t va4x01234567 = vld1_s8(a4); a4 += k;
-      const int8x8_t va5x01234567 = vld1_s8(a5); a5 += k;
-      const int8x8_t va6x01234567 = vld1_s8(a6); a6 += k;
-      const int8x8_t va7x01234567 = vld1_s8(a7); a7 += k;
+      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
+      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;
+      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4;
+      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 4;
+      const int8x8_t va4x01234567 = vld1_s8(a4); a4 += 4;
+      const int8x8_t va5x01234567 = vld1_s8(a5); a5 += 4;
+      const int8x8_t va6x01234567 = vld1_s8(a6); a6 += 4;
+      const int8x8_t va7x01234567 = vld1_s8(a7); a7 += 4;
 
       // Load a 4x16 block of weights.
       const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
@@ -261,48 +261,6 @@
       vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb0123x4567, va7x01234567, 0);
       vacc7x89AB = vdotq_lane_s32(vacc7x89AB, vb0123x89AB, va7x01234567, 0);
       vacc7xCDEF = vdotq_lane_s32(vacc7xCDEF, vb0123xCDEF, va7x01234567, 0);
-
-      if (k > 4) {
-        // Load a 4x16 block of weights.
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-        const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-        const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
-        // Multiply-accumulate: 8x4 * 4x16 --> 8x16.
-        vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-        vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-        vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
-        vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
-        vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-        vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-        vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
-        vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
-        vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-        vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-        vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
-        vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1);
-        vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-        vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-        vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
-        vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1);
-        vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
-        vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
-        vacc4x89AB = vdotq_lane_s32(vacc4x89AB, vb4567x89AB, va4x01234567, 1);
-        vacc4xCDEF = vdotq_lane_s32(vacc4xCDEF, vb4567xCDEF, va4x01234567, 1);
-        vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
-        vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
-        vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb4567x89AB, va5x01234567, 1);
-        vacc5xCDEF = vdotq_lane_s32(vacc5xCDEF, vb4567xCDEF, va5x01234567, 1);
-        vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb4567x0123, va6x01234567, 1);
-        vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1);
-        vacc6x89AB = vdotq_lane_s32(vacc6x89AB, vb4567x89AB, va6x01234567, 1);
-        vacc6xCDEF = vdotq_lane_s32(vacc6xCDEF, vb4567xCDEF, va6x01234567, 1);
-        vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb4567x0123, va7x01234567, 1);
-        vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb4567x4567, va7x01234567, 1);
-        vacc7x89AB = vdotq_lane_s32(vacc7x89AB, vb4567x89AB, va7x01234567, 1);
-        vacc7xCDEF = vdotq_lane_s32(vacc7xCDEF, vb4567xCDEF, va7x01234567, 1);
-      }
     }
 
     // Post-accumulation work
diff --git a/src/qs8-gemm/gen/8x8c4-minmax-neondot.c b/src/qs8-gemm/gen/8x8c4-minmax-neondot.c
index 6dbeb3a..204f3be 100644
--- a/src/qs8-gemm/gen/8x8c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/8x8c4-minmax-neondot.c
@@ -158,17 +158,17 @@
 
       k -= 8 * sizeof(int8_t);
     }
-    // Handle up to 6 final positions of `k`
+    // Handle up to 4 final positions of `k`
     if XNN_UNLIKELY(k != 0) {
       // Load a 8x4 block of activations.
-      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
-      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += k;
-      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += k;
-      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += k;
-      const int8x8_t va4x01234567 = vld1_s8(a4); a4 += k;
-      const int8x8_t va5x01234567 = vld1_s8(a5); a5 += k;
-      const int8x8_t va6x01234567 = vld1_s8(a6); a6 += k;
-      const int8x8_t va7x01234567 = vld1_s8(a7); a7 += k;
+      const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
+      const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;
+      const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4;
+      const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 4;
+      const int8x8_t va4x01234567 = vld1_s8(a4); a4 += 4;
+      const int8x8_t va5x01234567 = vld1_s8(a5); a5 += 4;
+      const int8x8_t va6x01234567 = vld1_s8(a6); a6 += 4;
+      const int8x8_t va7x01234567 = vld1_s8(a7); a7 += 4;
 
       // Load a 4x8 block of weights.
       const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
@@ -191,30 +191,6 @@
       vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb0123x4567, va6x01234567, 0);
       vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb0123x0123, va7x01234567, 0);
       vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb0123x4567, va7x01234567, 0);
-
-      if (k > 4) {
-        // Load a 4x8 block of weights.
-        const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-        const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
-        // Multiply-accumulate: 8x4 * 4x8 --> 8x8.
-        vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-        vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-        vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-        vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-        vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-        vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-        vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-        vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-        vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
-        vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
-        vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
-        vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
-        vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb4567x0123, va6x01234567, 1);
-        vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1);
-        vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb4567x0123, va7x01234567, 1);
-        vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb4567x4567, va7x01234567, 1);
-      }
     }
 
     // Post-accumulation work
diff --git a/src/qs8-igemm/MRxNRc4-neondot.c.in b/src/qs8-igemm/MRxNRc4-neondot.c.in
index e49b634..9cf885c 100644
--- a/src/qs8-igemm/MRxNRc4-neondot.c.in
+++ b/src/qs8-igemm/MRxNRc4-neondot.c.in
@@ -93,7 +93,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 6 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a ${MR}x4 block of activations.
         $for M in range(MR):
@@ -107,17 +107,6 @@
         $for M in range(MR):
           $for N in range(0, NR, 4):
               vacc${M}x${ABC[N:N+4]} = vdotq_lane_s32(vacc${M}x${ABC[N:N+4]}, vb0123x${ABC[N:N+4]}, va${M}x01234567, 0);
-
-        if (k > 4) {
-          // Load a 4x${NR} block of weights.
-          $for N in range(0, NR, 4):
-            const int8x16_t vb4567x${ABC[N:N+4]} = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
-          // Multiply-accumulate: ${MR}x4 * 4x${NR} --> ${MR}x${NR}.
-          $for M in range(MR):
-            $for N in range(0, NR, 4):
-                vacc${M}x${ABC[N:N+4]} = vdotq_lane_s32(vacc${M}x${ABC[N:N+4]}, vb4567x${ABC[N:N+4]}, va${M}x01234567, 1);
-        }
       }
       p -= ${MR} * sizeof(void*);
     } while (p != 0);
diff --git a/src/qs8-igemm/gen/12x8c4-minmax-neondot.c b/src/qs8-igemm/gen/12x8c4-minmax-neondot.c
index 7a74253..0f0953b 100644
--- a/src/qs8-igemm/gen/12x8c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/12x8c4-minmax-neondot.c
@@ -241,7 +241,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 6 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 12x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -286,38 +286,6 @@
         vacc10x4567 = vdotq_lane_s32(vacc10x4567, vb0123x4567, va10x01234567, 0);
         vacc11x0123 = vdotq_lane_s32(vacc11x0123, vb0123x0123, va11x01234567, 0);
         vacc11x4567 = vdotq_lane_s32(vacc11x4567, vb0123x4567, va11x01234567, 0);
-
-        if (k > 4) {
-          // Load a 4x8 block of weights.
-          const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-          const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
-          // Multiply-accumulate: 12x4 * 4x8 --> 12x8.
-          vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-          vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-          vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-          vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-          vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-          vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-          vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-          vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-          vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
-          vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
-          vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
-          vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
-          vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb4567x0123, va6x01234567, 1);
-          vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1);
-          vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb4567x0123, va7x01234567, 1);
-          vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb4567x4567, va7x01234567, 1);
-          vacc8x0123 = vdotq_lane_s32(vacc8x0123, vb4567x0123, va8x01234567, 1);
-          vacc8x4567 = vdotq_lane_s32(vacc8x4567, vb4567x4567, va8x01234567, 1);
-          vacc9x0123 = vdotq_lane_s32(vacc9x0123, vb4567x0123, va9x01234567, 1);
-          vacc9x4567 = vdotq_lane_s32(vacc9x4567, vb4567x4567, va9x01234567, 1);
-          vacc10x0123 = vdotq_lane_s32(vacc10x0123, vb4567x0123, va10x01234567, 1);
-          vacc10x4567 = vdotq_lane_s32(vacc10x4567, vb4567x4567, va10x01234567, 1);
-          vacc11x0123 = vdotq_lane_s32(vacc11x0123, vb4567x0123, va11x01234567, 1);
-          vacc11x4567 = vdotq_lane_s32(vacc11x4567, vb4567x4567, va11x01234567, 1);
-        }
       }
       p -= 12 * sizeof(void*);
     } while (p != 0);
diff --git a/src/qs8-igemm/gen/1x16c4-minmax-neondot.c b/src/qs8-igemm/gen/1x16c4-minmax-neondot.c
index a72ae5b..afb0cea 100644
--- a/src/qs8-igemm/gen/1x16c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/1x16c4-minmax-neondot.c
@@ -86,7 +86,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 6 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 1x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -102,20 +102,6 @@
         vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb0123x4567, va0x01234567, 0);
         vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb0123x89AB, va0x01234567, 0);
         vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb0123xCDEF, va0x01234567, 0);
-
-        if (k > 4) {
-          // Load a 4x16 block of weights.
-          const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-          const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-          const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-          const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
-          // Multiply-accumulate: 1x4 * 4x16 --> 1x16.
-          vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-          vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-          vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
-          vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
-        }
       }
       p -= 1 * sizeof(void*);
     } while (p != 0);
diff --git a/src/qs8-igemm/gen/1x8c4-minmax-neondot.c b/src/qs8-igemm/gen/1x8c4-minmax-neondot.c
index ed42605..dd311f3 100644
--- a/src/qs8-igemm/gen/1x8c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/1x8c4-minmax-neondot.c
@@ -76,7 +76,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 6 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 1x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -88,16 +88,6 @@
         // Multiply-accumulate: 1x4 * 4x8 --> 1x8.
         vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
         vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb0123x4567, va0x01234567, 0);
-
-        if (k > 4) {
-          // Load a 4x8 block of weights.
-          const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-          const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
-          // Multiply-accumulate: 1x4 * 4x8 --> 1x8.
-          vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-          vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-        }
       }
       p -= 1 * sizeof(void*);
     } while (p != 0);
diff --git a/src/qs8-igemm/gen/4x16c4-minmax-neondot.c b/src/qs8-igemm/gen/4x16c4-minmax-neondot.c
index 95dde6d..c4d0481 100644
--- a/src/qs8-igemm/gen/4x16c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/4x16c4-minmax-neondot.c
@@ -149,7 +149,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 6 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 4x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -180,32 +180,6 @@
         vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb0123x4567, va3x01234567, 0);
         vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0);
         vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0);
-
-        if (k > 4) {
-          // Load a 4x16 block of weights.
-          const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-          const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-          const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-          const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
-          // Multiply-accumulate: 4x4 * 4x16 --> 4x16.
-          vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-          vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-          vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
-          vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
-          vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-          vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-          vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
-          vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
-          vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-          vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-          vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
-          vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1);
-          vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-          vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-          vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
-          vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1);
-        }
       }
       p -= 4 * sizeof(void*);
     } while (p != 0);
diff --git a/src/qs8-igemm/gen/4x8c4-minmax-neondot.c b/src/qs8-igemm/gen/4x8c4-minmax-neondot.c
index 5758e21..989ba1b 100644
--- a/src/qs8-igemm/gen/4x8c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/4x8c4-minmax-neondot.c
@@ -121,7 +121,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 6 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 4x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -142,22 +142,6 @@
         vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0);
         vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb0123x0123, va3x01234567, 0);
         vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb0123x4567, va3x01234567, 0);
-
-        if (k > 4) {
-          // Load a 4x8 block of weights.
-          const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-          const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
-          // Multiply-accumulate: 4x4 * 4x8 --> 4x8.
-          vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-          vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-          vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-          vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-          vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-          vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-          vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-          vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-        }
       }
       p -= 4 * sizeof(void*);
     } while (p != 0);
diff --git a/src/qs8-igemm/gen/6x16c4-minmax-neondot.c b/src/qs8-igemm/gen/6x16c4-minmax-neondot.c
index ab885b5..9b524a1 100644
--- a/src/qs8-igemm/gen/6x16c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/6x16c4-minmax-neondot.c
@@ -191,7 +191,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 6 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 6x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -232,40 +232,6 @@
         vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb0123x4567, va5x01234567, 0);
         vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0);
         vacc5xCDEF = vdotq_lane_s32(vacc5xCDEF, vb0123xCDEF, va5x01234567, 0);
-
-        if (k > 4) {
-          // Load a 4x16 block of weights.
-          const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-          const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-          const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-          const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
-          // Multiply-accumulate: 6x4 * 4x16 --> 6x16.
-          vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-          vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-          vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
-          vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
-          vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-          vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-          vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
-          vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
-          vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-          vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-          vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
-          vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1);
-          vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-          vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-          vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
-          vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1);
-          vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
-          vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
-          vacc4x89AB = vdotq_lane_s32(vacc4x89AB, vb4567x89AB, va4x01234567, 1);
-          vacc4xCDEF = vdotq_lane_s32(vacc4xCDEF, vb4567xCDEF, va4x01234567, 1);
-          vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
-          vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
-          vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb4567x89AB, va5x01234567, 1);
-          vacc5xCDEF = vdotq_lane_s32(vacc5xCDEF, vb4567xCDEF, va5x01234567, 1);
-        }
       }
       p -= 6 * sizeof(void*);
     } while (p != 0);
diff --git a/src/qs8-igemm/gen/6x8c4-minmax-neondot.c b/src/qs8-igemm/gen/6x8c4-minmax-neondot.c
index b4c3bf2..41db255 100644
--- a/src/qs8-igemm/gen/6x8c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/6x8c4-minmax-neondot.c
@@ -151,7 +151,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 6 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 6x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -178,26 +178,6 @@
         vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb0123x4567, va4x01234567, 0);
         vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb0123x0123, va5x01234567, 0);
         vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb0123x4567, va5x01234567, 0);
-
-        if (k > 4) {
-          // Load a 4x8 block of weights.
-          const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-          const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
-          // Multiply-accumulate: 6x4 * 4x8 --> 6x8.
-          vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-          vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-          vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-          vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-          vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-          vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-          vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-          vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-          vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
-          vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
-          vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
-          vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
-        }
       }
       p -= 6 * sizeof(void*);
     } while (p != 0);
diff --git a/src/qs8-igemm/gen/8x16c4-minmax-neondot.c b/src/qs8-igemm/gen/8x16c4-minmax-neondot.c
index 001ce33..cbab6ea 100644
--- a/src/qs8-igemm/gen/8x16c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/8x16c4-minmax-neondot.c
@@ -233,7 +233,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 6 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 8x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -284,48 +284,6 @@
         vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb0123x4567, va7x01234567, 0);
         vacc7x89AB = vdotq_lane_s32(vacc7x89AB, vb0123x89AB, va7x01234567, 0);
         vacc7xCDEF = vdotq_lane_s32(vacc7xCDEF, vb0123xCDEF, va7x01234567, 0);
-
-        if (k > 4) {
-          // Load a 4x16 block of weights.
-          const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-          const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-          const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-          const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
-          // Multiply-accumulate: 8x4 * 4x16 --> 8x16.
-          vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-          vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-          vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
-          vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
-          vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-          vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-          vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
-          vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
-          vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-          vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-          vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
-          vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1);
-          vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-          vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-          vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
-          vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1);
-          vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
-          vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
-          vacc4x89AB = vdotq_lane_s32(vacc4x89AB, vb4567x89AB, va4x01234567, 1);
-          vacc4xCDEF = vdotq_lane_s32(vacc4xCDEF, vb4567xCDEF, va4x01234567, 1);
-          vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
-          vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
-          vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb4567x89AB, va5x01234567, 1);
-          vacc5xCDEF = vdotq_lane_s32(vacc5xCDEF, vb4567xCDEF, va5x01234567, 1);
-          vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb4567x0123, va6x01234567, 1);
-          vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1);
-          vacc6x89AB = vdotq_lane_s32(vacc6x89AB, vb4567x89AB, va6x01234567, 1);
-          vacc6xCDEF = vdotq_lane_s32(vacc6xCDEF, vb4567xCDEF, va6x01234567, 1);
-          vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb4567x0123, va7x01234567, 1);
-          vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb4567x4567, va7x01234567, 1);
-          vacc7x89AB = vdotq_lane_s32(vacc7x89AB, vb4567x89AB, va7x01234567, 1);
-          vacc7xCDEF = vdotq_lane_s32(vacc7xCDEF, vb4567xCDEF, va7x01234567, 1);
-        }
       }
       p -= 8 * sizeof(void*);
     } while (p != 0);
diff --git a/src/qs8-igemm/gen/8x8c4-minmax-neondot.c b/src/qs8-igemm/gen/8x8c4-minmax-neondot.c
index 9ce9937..0d680cc 100644
--- a/src/qs8-igemm/gen/8x8c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/8x8c4-minmax-neondot.c
@@ -181,7 +181,7 @@
 
         k -= 8 * sizeof(int8_t);
       }
-      // Handle up to 6 final positions of `k`
+      // Handle up to 4 final positions of `k`
       if XNN_UNLIKELY(k != 0) {
         // Load a 8x4 block of activations.
         const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -214,30 +214,6 @@
         vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb0123x4567, va6x01234567, 0);
         vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb0123x0123, va7x01234567, 0);
         vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb0123x4567, va7x01234567, 0);
-
-        if (k > 4) {
-          // Load a 4x8 block of weights.
-          const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-          const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
-          // Multiply-accumulate: 8x4 * 4x8 --> 8x8.
-          vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
-          vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
-          vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
-          vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
-          vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
-          vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
-          vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
-          vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
-          vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
-          vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
-          vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
-          vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
-          vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb4567x0123, va6x01234567, 1);
-          vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1);
-          vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb4567x0123, va7x01234567, 1);
-          vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb4567x4567, va7x01234567, 1);
-        }
       }
       p -= 8 * sizeof(void*);
     } while (p != 0);