QS8 Neon dot-product intrinsics GEMM and IGEMM microkernels: reduce remainder code.
Remove the remainder code for k in the 5 to 7 range; let the main loop handle it.
KC is rounded up to a multiple of 4. Rewind the A pointers by kc.
PiperOrigin-RevId: 360495618
diff --git a/src/qs8-gemm/MRxNRc4-neondot.c.in b/src/qs8-gemm/MRxNRc4-neondot.c.in
index 49b43d2..e9416c3 100644
--- a/src/qs8-gemm/MRxNRc4-neondot.c.in
+++ b/src/qs8-gemm/MRxNRc4-neondot.c.in
@@ -87,11 +87,11 @@
k -= 8 * sizeof(int8_t);
}
- // Handle up to 6 final positions of `k`
+ // Handle up to 4 final positions of `k`
if XNN_UNLIKELY(k != 0) {
// Load a ${MR}x4 block of activations.
$for M in range(MR):
- const int8x8_t va${M}x01234567 = vld1_s8(a${M}); a${M} += k;
+ const int8x8_t va${M}x01234567 = vld1_s8(a${M}); a${M} += 4;
// Load a 4x${NR} block of weights.
$for N in range(0, NR, 4):
@@ -101,17 +101,6 @@
$for M in range(MR):
$for N in range(0, NR, 4):
vacc${M}x${ABC[N:N+4]} = vdotq_lane_s32(vacc${M}x${ABC[N:N+4]}, vb0123x${ABC[N:N+4]}, va${M}x01234567, 0);
-
- if (k > 4) {
- // Load a 4x${NR} block of weights.
- $for N in range(0, NR, 4):
- const int8x16_t vb4567x${ABC[N:N+4]} = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
- // Multiply-accumulate: ${MR}x4 * 4x${NR} --> ${MR}x${NR}.
- $for M in range(MR):
- $for N in range(0, NR, 4):
- vacc${M}x${ABC[N:N+4]} = vdotq_lane_s32(vacc${M}x${ABC[N:N+4]}, vb4567x${ABC[N:N+4]}, va${M}x01234567, 1);
- }
}
// Post-accumulation work
diff --git a/src/qs8-gemm/gen/12x8c4-minmax-neondot.c b/src/qs8-gemm/gen/12x8c4-minmax-neondot.c
index c1665fd..71829b0 100644
--- a/src/qs8-gemm/gen/12x8c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/12x8c4-minmax-neondot.c
@@ -210,21 +210,21 @@
k -= 8 * sizeof(int8_t);
}
- // Handle up to 6 final positions of `k`
+ // Handle up to 4 final positions of `k`
if XNN_UNLIKELY(k != 0) {
// Load a 12x4 block of activations.
- const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
- const int8x8_t va1x01234567 = vld1_s8(a1); a1 += k;
- const int8x8_t va2x01234567 = vld1_s8(a2); a2 += k;
- const int8x8_t va3x01234567 = vld1_s8(a3); a3 += k;
- const int8x8_t va4x01234567 = vld1_s8(a4); a4 += k;
- const int8x8_t va5x01234567 = vld1_s8(a5); a5 += k;
- const int8x8_t va6x01234567 = vld1_s8(a6); a6 += k;
- const int8x8_t va7x01234567 = vld1_s8(a7); a7 += k;
- const int8x8_t va8x01234567 = vld1_s8(a8); a8 += k;
- const int8x8_t va9x01234567 = vld1_s8(a9); a9 += k;
- const int8x8_t va10x01234567 = vld1_s8(a10); a10 += k;
- const int8x8_t va11x01234567 = vld1_s8(a11); a11 += k;
+ const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
+ const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;
+ const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4;
+ const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 4;
+ const int8x8_t va4x01234567 = vld1_s8(a4); a4 += 4;
+ const int8x8_t va5x01234567 = vld1_s8(a5); a5 += 4;
+ const int8x8_t va6x01234567 = vld1_s8(a6); a6 += 4;
+ const int8x8_t va7x01234567 = vld1_s8(a7); a7 += 4;
+ const int8x8_t va8x01234567 = vld1_s8(a8); a8 += 4;
+ const int8x8_t va9x01234567 = vld1_s8(a9); a9 += 4;
+ const int8x8_t va10x01234567 = vld1_s8(a10); a10 += 4;
+ const int8x8_t va11x01234567 = vld1_s8(a11); a11 += 4;
// Load a 4x8 block of weights.
const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
@@ -255,38 +255,6 @@
vacc10x4567 = vdotq_lane_s32(vacc10x4567, vb0123x4567, va10x01234567, 0);
vacc11x0123 = vdotq_lane_s32(vacc11x0123, vb0123x0123, va11x01234567, 0);
vacc11x4567 = vdotq_lane_s32(vacc11x4567, vb0123x4567, va11x01234567, 0);
-
- if (k > 4) {
- // Load a 4x8 block of weights.
- const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
- // Multiply-accumulate: 12x4 * 4x8 --> 12x8.
- vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
- vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
- vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
- vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
- vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
- vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
- vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
- vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
- vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
- vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
- vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
- vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
- vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb4567x0123, va6x01234567, 1);
- vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1);
- vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb4567x0123, va7x01234567, 1);
- vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb4567x4567, va7x01234567, 1);
- vacc8x0123 = vdotq_lane_s32(vacc8x0123, vb4567x0123, va8x01234567, 1);
- vacc8x4567 = vdotq_lane_s32(vacc8x4567, vb4567x4567, va8x01234567, 1);
- vacc9x0123 = vdotq_lane_s32(vacc9x0123, vb4567x0123, va9x01234567, 1);
- vacc9x4567 = vdotq_lane_s32(vacc9x4567, vb4567x4567, va9x01234567, 1);
- vacc10x0123 = vdotq_lane_s32(vacc10x0123, vb4567x0123, va10x01234567, 1);
- vacc10x4567 = vdotq_lane_s32(vacc10x4567, vb4567x4567, va10x01234567, 1);
- vacc11x0123 = vdotq_lane_s32(vacc11x0123, vb4567x0123, va11x01234567, 1);
- vacc11x4567 = vdotq_lane_s32(vacc11x4567, vb4567x4567, va11x01234567, 1);
- }
}
// Post-accumulation work
diff --git a/src/qs8-gemm/gen/1x16c4-minmax-neondot.c b/src/qs8-gemm/gen/1x16c4-minmax-neondot.c
index 964525e..7645374 100644
--- a/src/qs8-gemm/gen/1x16c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/1x16c4-minmax-neondot.c
@@ -77,10 +77,10 @@
k -= 8 * sizeof(int8_t);
}
- // Handle up to 6 final positions of `k`
+ // Handle up to 4 final positions of `k`
if XNN_UNLIKELY(k != 0) {
// Load a 1x4 block of activations.
- const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
+ const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
// Load a 4x16 block of weights.
const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
@@ -93,20 +93,6 @@
vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb0123x4567, va0x01234567, 0);
vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb0123x89AB, va0x01234567, 0);
vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb0123xCDEF, va0x01234567, 0);
-
- if (k > 4) {
- // Load a 4x16 block of weights.
- const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
- // Multiply-accumulate: 1x4 * 4x16 --> 1x16.
- vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
- vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
- vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
- vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
- }
}
// Post-accumulation work
diff --git a/src/qs8-gemm/gen/1x8c4-minmax-neondot.c b/src/qs8-gemm/gen/1x8c4-minmax-neondot.c
index 8242be7..c6925c1 100644
--- a/src/qs8-gemm/gen/1x8c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/1x8c4-minmax-neondot.c
@@ -67,10 +67,10 @@
k -= 8 * sizeof(int8_t);
}
- // Handle up to 6 final positions of `k`
+ // Handle up to 4 final positions of `k`
if XNN_UNLIKELY(k != 0) {
// Load a 1x4 block of activations.
- const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
+ const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
// Load a 4x8 block of weights.
const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
@@ -79,16 +79,6 @@
// Multiply-accumulate: 1x4 * 4x8 --> 1x8.
vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb0123x4567, va0x01234567, 0);
-
- if (k > 4) {
- // Load a 4x8 block of weights.
- const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
- // Multiply-accumulate: 1x4 * 4x8 --> 1x8.
- vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
- vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
- }
}
// Post-accumulation work
diff --git a/src/qs8-gemm/gen/4x16c4-minmax-neondot.c b/src/qs8-gemm/gen/4x16c4-minmax-neondot.c
index 719db22..3c93b58 100644
--- a/src/qs8-gemm/gen/4x16c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/4x16c4-minmax-neondot.c
@@ -134,13 +134,13 @@
k -= 8 * sizeof(int8_t);
}
- // Handle up to 6 final positions of `k`
+ // Handle up to 4 final positions of `k`
if XNN_UNLIKELY(k != 0) {
// Load a 4x4 block of activations.
- const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
- const int8x8_t va1x01234567 = vld1_s8(a1); a1 += k;
- const int8x8_t va2x01234567 = vld1_s8(a2); a2 += k;
- const int8x8_t va3x01234567 = vld1_s8(a3); a3 += k;
+ const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
+ const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;
+ const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4;
+ const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 4;
// Load a 4x16 block of weights.
const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
@@ -165,32 +165,6 @@
vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb0123x4567, va3x01234567, 0);
vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0);
vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0);
-
- if (k > 4) {
- // Load a 4x16 block of weights.
- const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
- // Multiply-accumulate: 4x4 * 4x16 --> 4x16.
- vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
- vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
- vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
- vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
- vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
- vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
- vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
- vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
- vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
- vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
- vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
- vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1);
- vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
- vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
- vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
- vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1);
- }
}
// Post-accumulation work
diff --git a/src/qs8-gemm/gen/4x8c4-minmax-neondot.c b/src/qs8-gemm/gen/4x8c4-minmax-neondot.c
index d32b05e..f733070 100644
--- a/src/qs8-gemm/gen/4x8c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/4x8c4-minmax-neondot.c
@@ -106,13 +106,13 @@
k -= 8 * sizeof(int8_t);
}
- // Handle up to 6 final positions of `k`
+ // Handle up to 4 final positions of `k`
if XNN_UNLIKELY(k != 0) {
// Load a 4x4 block of activations.
- const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
- const int8x8_t va1x01234567 = vld1_s8(a1); a1 += k;
- const int8x8_t va2x01234567 = vld1_s8(a2); a2 += k;
- const int8x8_t va3x01234567 = vld1_s8(a3); a3 += k;
+ const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
+ const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;
+ const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4;
+ const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 4;
// Load a 4x8 block of weights.
const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
@@ -127,22 +127,6 @@
vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0);
vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb0123x0123, va3x01234567, 0);
vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb0123x4567, va3x01234567, 0);
-
- if (k > 4) {
- // Load a 4x8 block of weights.
- const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
- // Multiply-accumulate: 4x4 * 4x8 --> 4x8.
- vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
- vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
- vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
- vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
- vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
- vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
- vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
- vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
- }
}
// Post-accumulation work
diff --git a/src/qs8-gemm/gen/6x16c4-minmax-neondot.c b/src/qs8-gemm/gen/6x16c4-minmax-neondot.c
index 3a9277e..0086b49 100644
--- a/src/qs8-gemm/gen/6x16c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/6x16c4-minmax-neondot.c
@@ -172,15 +172,15 @@
k -= 8 * sizeof(int8_t);
}
- // Handle up to 6 final positions of `k`
+ // Handle up to 4 final positions of `k`
if XNN_UNLIKELY(k != 0) {
// Load a 6x4 block of activations.
- const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
- const int8x8_t va1x01234567 = vld1_s8(a1); a1 += k;
- const int8x8_t va2x01234567 = vld1_s8(a2); a2 += k;
- const int8x8_t va3x01234567 = vld1_s8(a3); a3 += k;
- const int8x8_t va4x01234567 = vld1_s8(a4); a4 += k;
- const int8x8_t va5x01234567 = vld1_s8(a5); a5 += k;
+ const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
+ const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;
+ const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4;
+ const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 4;
+ const int8x8_t va4x01234567 = vld1_s8(a4); a4 += 4;
+ const int8x8_t va5x01234567 = vld1_s8(a5); a5 += 4;
// Load a 4x16 block of weights.
const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
@@ -213,40 +213,6 @@
vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb0123x4567, va5x01234567, 0);
vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0);
vacc5xCDEF = vdotq_lane_s32(vacc5xCDEF, vb0123xCDEF, va5x01234567, 0);
-
- if (k > 4) {
- // Load a 4x16 block of weights.
- const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
- // Multiply-accumulate: 6x4 * 4x16 --> 6x16.
- vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
- vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
- vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
- vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
- vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
- vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
- vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
- vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
- vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
- vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
- vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
- vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1);
- vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
- vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
- vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
- vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1);
- vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
- vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
- vacc4x89AB = vdotq_lane_s32(vacc4x89AB, vb4567x89AB, va4x01234567, 1);
- vacc4xCDEF = vdotq_lane_s32(vacc4xCDEF, vb4567xCDEF, va4x01234567, 1);
- vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
- vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
- vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb4567x89AB, va5x01234567, 1);
- vacc5xCDEF = vdotq_lane_s32(vacc5xCDEF, vb4567xCDEF, va5x01234567, 1);
- }
}
// Post-accumulation work
diff --git a/src/qs8-gemm/gen/6x8c4-minmax-neondot.c b/src/qs8-gemm/gen/6x8c4-minmax-neondot.c
index b1fc09f..889b948 100644
--- a/src/qs8-gemm/gen/6x8c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/6x8c4-minmax-neondot.c
@@ -132,15 +132,15 @@
k -= 8 * sizeof(int8_t);
}
- // Handle up to 6 final positions of `k`
+ // Handle up to 4 final positions of `k`
if XNN_UNLIKELY(k != 0) {
// Load a 6x4 block of activations.
- const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
- const int8x8_t va1x01234567 = vld1_s8(a1); a1 += k;
- const int8x8_t va2x01234567 = vld1_s8(a2); a2 += k;
- const int8x8_t va3x01234567 = vld1_s8(a3); a3 += k;
- const int8x8_t va4x01234567 = vld1_s8(a4); a4 += k;
- const int8x8_t va5x01234567 = vld1_s8(a5); a5 += k;
+ const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
+ const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;
+ const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4;
+ const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 4;
+ const int8x8_t va4x01234567 = vld1_s8(a4); a4 += 4;
+ const int8x8_t va5x01234567 = vld1_s8(a5); a5 += 4;
// Load a 4x8 block of weights.
const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
@@ -159,26 +159,6 @@
vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb0123x4567, va4x01234567, 0);
vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb0123x0123, va5x01234567, 0);
vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb0123x4567, va5x01234567, 0);
-
- if (k > 4) {
- // Load a 4x8 block of weights.
- const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
- // Multiply-accumulate: 6x4 * 4x8 --> 6x8.
- vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
- vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
- vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
- vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
- vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
- vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
- vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
- vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
- vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
- vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
- vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
- vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
- }
}
// Post-accumulation work
diff --git a/src/qs8-gemm/gen/8x16c4-minmax-neondot.c b/src/qs8-gemm/gen/8x16c4-minmax-neondot.c
index 93dab4c..3b2301b 100644
--- a/src/qs8-gemm/gen/8x16c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/8x16c4-minmax-neondot.c
@@ -210,17 +210,17 @@
k -= 8 * sizeof(int8_t);
}
- // Handle up to 6 final positions of `k`
+ // Handle up to 4 final positions of `k`
if XNN_UNLIKELY(k != 0) {
// Load a 8x4 block of activations.
- const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
- const int8x8_t va1x01234567 = vld1_s8(a1); a1 += k;
- const int8x8_t va2x01234567 = vld1_s8(a2); a2 += k;
- const int8x8_t va3x01234567 = vld1_s8(a3); a3 += k;
- const int8x8_t va4x01234567 = vld1_s8(a4); a4 += k;
- const int8x8_t va5x01234567 = vld1_s8(a5); a5 += k;
- const int8x8_t va6x01234567 = vld1_s8(a6); a6 += k;
- const int8x8_t va7x01234567 = vld1_s8(a7); a7 += k;
+ const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
+ const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;
+ const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4;
+ const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 4;
+ const int8x8_t va4x01234567 = vld1_s8(a4); a4 += 4;
+ const int8x8_t va5x01234567 = vld1_s8(a5); a5 += 4;
+ const int8x8_t va6x01234567 = vld1_s8(a6); a6 += 4;
+ const int8x8_t va7x01234567 = vld1_s8(a7); a7 += 4;
// Load a 4x16 block of weights.
const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
@@ -261,48 +261,6 @@
vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb0123x4567, va7x01234567, 0);
vacc7x89AB = vdotq_lane_s32(vacc7x89AB, vb0123x89AB, va7x01234567, 0);
vacc7xCDEF = vdotq_lane_s32(vacc7xCDEF, vb0123xCDEF, va7x01234567, 0);
-
- if (k > 4) {
- // Load a 4x16 block of weights.
- const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
- // Multiply-accumulate: 8x4 * 4x16 --> 8x16.
- vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
- vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
- vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
- vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
- vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
- vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
- vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
- vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
- vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
- vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
- vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
- vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1);
- vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
- vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
- vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
- vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1);
- vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
- vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
- vacc4x89AB = vdotq_lane_s32(vacc4x89AB, vb4567x89AB, va4x01234567, 1);
- vacc4xCDEF = vdotq_lane_s32(vacc4xCDEF, vb4567xCDEF, va4x01234567, 1);
- vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
- vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
- vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb4567x89AB, va5x01234567, 1);
- vacc5xCDEF = vdotq_lane_s32(vacc5xCDEF, vb4567xCDEF, va5x01234567, 1);
- vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb4567x0123, va6x01234567, 1);
- vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1);
- vacc6x89AB = vdotq_lane_s32(vacc6x89AB, vb4567x89AB, va6x01234567, 1);
- vacc6xCDEF = vdotq_lane_s32(vacc6xCDEF, vb4567xCDEF, va6x01234567, 1);
- vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb4567x0123, va7x01234567, 1);
- vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb4567x4567, va7x01234567, 1);
- vacc7x89AB = vdotq_lane_s32(vacc7x89AB, vb4567x89AB, va7x01234567, 1);
- vacc7xCDEF = vdotq_lane_s32(vacc7xCDEF, vb4567xCDEF, va7x01234567, 1);
- }
}
// Post-accumulation work
diff --git a/src/qs8-gemm/gen/8x8c4-minmax-neondot.c b/src/qs8-gemm/gen/8x8c4-minmax-neondot.c
index 6dbeb3a..204f3be 100644
--- a/src/qs8-gemm/gen/8x8c4-minmax-neondot.c
+++ b/src/qs8-gemm/gen/8x8c4-minmax-neondot.c
@@ -158,17 +158,17 @@
k -= 8 * sizeof(int8_t);
}
- // Handle up to 6 final positions of `k`
+ // Handle up to 4 final positions of `k`
if XNN_UNLIKELY(k != 0) {
// Load a 8x4 block of activations.
- const int8x8_t va0x01234567 = vld1_s8(a0); a0 += k;
- const int8x8_t va1x01234567 = vld1_s8(a1); a1 += k;
- const int8x8_t va2x01234567 = vld1_s8(a2); a2 += k;
- const int8x8_t va3x01234567 = vld1_s8(a3); a3 += k;
- const int8x8_t va4x01234567 = vld1_s8(a4); a4 += k;
- const int8x8_t va5x01234567 = vld1_s8(a5); a5 += k;
- const int8x8_t va6x01234567 = vld1_s8(a6); a6 += k;
- const int8x8_t va7x01234567 = vld1_s8(a7); a7 += k;
+ const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 4;
+ const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 4;
+ const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 4;
+ const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 4;
+ const int8x8_t va4x01234567 = vld1_s8(a4); a4 += 4;
+ const int8x8_t va5x01234567 = vld1_s8(a5); a5 += 4;
+ const int8x8_t va6x01234567 = vld1_s8(a6); a6 += 4;
+ const int8x8_t va7x01234567 = vld1_s8(a7); a7 += 4;
// Load a 4x8 block of weights.
const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
@@ -191,30 +191,6 @@
vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb0123x4567, va6x01234567, 0);
vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb0123x0123, va7x01234567, 0);
vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb0123x4567, va7x01234567, 0);
-
- if (k > 4) {
- // Load a 4x8 block of weights.
- const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
- // Multiply-accumulate: 8x4 * 4x8 --> 8x8.
- vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
- vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
- vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
- vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
- vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
- vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
- vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
- vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
- vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
- vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
- vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
- vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
- vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb4567x0123, va6x01234567, 1);
- vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1);
- vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb4567x0123, va7x01234567, 1);
- vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb4567x4567, va7x01234567, 1);
- }
}
// Post-accumulation work
diff --git a/src/qs8-igemm/MRxNRc4-neondot.c.in b/src/qs8-igemm/MRxNRc4-neondot.c.in
index e49b634..9cf885c 100644
--- a/src/qs8-igemm/MRxNRc4-neondot.c.in
+++ b/src/qs8-igemm/MRxNRc4-neondot.c.in
@@ -93,7 +93,7 @@
k -= 8 * sizeof(int8_t);
}
- // Handle up to 6 final positions of `k`
+ // Handle up to 4 final positions of `k`
if XNN_UNLIKELY(k != 0) {
// Load a ${MR}x4 block of activations.
$for M in range(MR):
@@ -107,17 +107,6 @@
$for M in range(MR):
$for N in range(0, NR, 4):
vacc${M}x${ABC[N:N+4]} = vdotq_lane_s32(vacc${M}x${ABC[N:N+4]}, vb0123x${ABC[N:N+4]}, va${M}x01234567, 0);
-
- if (k > 4) {
- // Load a 4x${NR} block of weights.
- $for N in range(0, NR, 4):
- const int8x16_t vb4567x${ABC[N:N+4]} = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
- // Multiply-accumulate: ${MR}x4 * 4x${NR} --> ${MR}x${NR}.
- $for M in range(MR):
- $for N in range(0, NR, 4):
- vacc${M}x${ABC[N:N+4]} = vdotq_lane_s32(vacc${M}x${ABC[N:N+4]}, vb4567x${ABC[N:N+4]}, va${M}x01234567, 1);
- }
}
p -= ${MR} * sizeof(void*);
} while (p != 0);
diff --git a/src/qs8-igemm/gen/12x8c4-minmax-neondot.c b/src/qs8-igemm/gen/12x8c4-minmax-neondot.c
index 7a74253..0f0953b 100644
--- a/src/qs8-igemm/gen/12x8c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/12x8c4-minmax-neondot.c
@@ -241,7 +241,7 @@
k -= 8 * sizeof(int8_t);
}
- // Handle up to 6 final positions of `k`
+ // Handle up to 4 final positions of `k`
if XNN_UNLIKELY(k != 0) {
// Load a 12x4 block of activations.
const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -286,38 +286,6 @@
vacc10x4567 = vdotq_lane_s32(vacc10x4567, vb0123x4567, va10x01234567, 0);
vacc11x0123 = vdotq_lane_s32(vacc11x0123, vb0123x0123, va11x01234567, 0);
vacc11x4567 = vdotq_lane_s32(vacc11x4567, vb0123x4567, va11x01234567, 0);
-
- if (k > 4) {
- // Load a 4x8 block of weights.
- const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
- // Multiply-accumulate: 12x4 * 4x8 --> 12x8.
- vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
- vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
- vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
- vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
- vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
- vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
- vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
- vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
- vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
- vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
- vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
- vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
- vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb4567x0123, va6x01234567, 1);
- vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1);
- vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb4567x0123, va7x01234567, 1);
- vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb4567x4567, va7x01234567, 1);
- vacc8x0123 = vdotq_lane_s32(vacc8x0123, vb4567x0123, va8x01234567, 1);
- vacc8x4567 = vdotq_lane_s32(vacc8x4567, vb4567x4567, va8x01234567, 1);
- vacc9x0123 = vdotq_lane_s32(vacc9x0123, vb4567x0123, va9x01234567, 1);
- vacc9x4567 = vdotq_lane_s32(vacc9x4567, vb4567x4567, va9x01234567, 1);
- vacc10x0123 = vdotq_lane_s32(vacc10x0123, vb4567x0123, va10x01234567, 1);
- vacc10x4567 = vdotq_lane_s32(vacc10x4567, vb4567x4567, va10x01234567, 1);
- vacc11x0123 = vdotq_lane_s32(vacc11x0123, vb4567x0123, va11x01234567, 1);
- vacc11x4567 = vdotq_lane_s32(vacc11x4567, vb4567x4567, va11x01234567, 1);
- }
}
p -= 12 * sizeof(void*);
} while (p != 0);
diff --git a/src/qs8-igemm/gen/1x16c4-minmax-neondot.c b/src/qs8-igemm/gen/1x16c4-minmax-neondot.c
index a72ae5b..afb0cea 100644
--- a/src/qs8-igemm/gen/1x16c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/1x16c4-minmax-neondot.c
@@ -86,7 +86,7 @@
k -= 8 * sizeof(int8_t);
}
- // Handle up to 6 final positions of `k`
+ // Handle up to 4 final positions of `k`
if XNN_UNLIKELY(k != 0) {
// Load a 1x4 block of activations.
const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -102,20 +102,6 @@
vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb0123x4567, va0x01234567, 0);
vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb0123x89AB, va0x01234567, 0);
vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb0123xCDEF, va0x01234567, 0);
-
- if (k > 4) {
- // Load a 4x16 block of weights.
- const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
- // Multiply-accumulate: 1x4 * 4x16 --> 1x16.
- vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
- vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
- vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
- vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
- }
}
p -= 1 * sizeof(void*);
} while (p != 0);
diff --git a/src/qs8-igemm/gen/1x8c4-minmax-neondot.c b/src/qs8-igemm/gen/1x8c4-minmax-neondot.c
index ed42605..dd311f3 100644
--- a/src/qs8-igemm/gen/1x8c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/1x8c4-minmax-neondot.c
@@ -76,7 +76,7 @@
k -= 8 * sizeof(int8_t);
}
- // Handle up to 6 final positions of `k`
+ // Handle up to 4 final positions of `k`
if XNN_UNLIKELY(k != 0) {
// Load a 1x4 block of activations.
const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -88,16 +88,6 @@
// Multiply-accumulate: 1x4 * 4x8 --> 1x8.
vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb0123x4567, va0x01234567, 0);
-
- if (k > 4) {
- // Load a 4x8 block of weights.
- const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
- // Multiply-accumulate: 1x4 * 4x8 --> 1x8.
- vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
- vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
- }
}
p -= 1 * sizeof(void*);
} while (p != 0);
diff --git a/src/qs8-igemm/gen/4x16c4-minmax-neondot.c b/src/qs8-igemm/gen/4x16c4-minmax-neondot.c
index 95dde6d..c4d0481 100644
--- a/src/qs8-igemm/gen/4x16c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/4x16c4-minmax-neondot.c
@@ -149,7 +149,7 @@
k -= 8 * sizeof(int8_t);
}
- // Handle up to 6 final positions of `k`
+ // Handle up to 4 final positions of `k`
if XNN_UNLIKELY(k != 0) {
// Load a 4x4 block of activations.
const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -180,32 +180,6 @@
vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb0123x4567, va3x01234567, 0);
vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0);
vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0);
-
- if (k > 4) {
- // Load a 4x16 block of weights.
- const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
- // Multiply-accumulate: 4x4 * 4x16 --> 4x16.
- vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
- vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
- vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
- vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
- vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
- vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
- vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
- vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
- vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
- vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
- vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
- vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1);
- vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
- vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
- vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
- vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1);
- }
}
p -= 4 * sizeof(void*);
} while (p != 0);
diff --git a/src/qs8-igemm/gen/4x8c4-minmax-neondot.c b/src/qs8-igemm/gen/4x8c4-minmax-neondot.c
index 5758e21..989ba1b 100644
--- a/src/qs8-igemm/gen/4x8c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/4x8c4-minmax-neondot.c
@@ -121,7 +121,7 @@
k -= 8 * sizeof(int8_t);
}
- // Handle up to 6 final positions of `k`
+ // Handle up to 4 final positions of `k`
if XNN_UNLIKELY(k != 0) {
// Load a 4x4 block of activations.
const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -142,22 +142,6 @@
vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0);
vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb0123x0123, va3x01234567, 0);
vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb0123x4567, va3x01234567, 0);
-
- if (k > 4) {
- // Load a 4x8 block of weights.
- const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
- // Multiply-accumulate: 4x4 * 4x8 --> 4x8.
- vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
- vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
- vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
- vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
- vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
- vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
- vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
- vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
- }
}
p -= 4 * sizeof(void*);
} while (p != 0);
diff --git a/src/qs8-igemm/gen/6x16c4-minmax-neondot.c b/src/qs8-igemm/gen/6x16c4-minmax-neondot.c
index ab885b5..9b524a1 100644
--- a/src/qs8-igemm/gen/6x16c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/6x16c4-minmax-neondot.c
@@ -191,7 +191,7 @@
k -= 8 * sizeof(int8_t);
}
- // Handle up to 6 final positions of `k`
+ // Handle up to 4 final positions of `k`
if XNN_UNLIKELY(k != 0) {
// Load a 6x4 block of activations.
const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -232,40 +232,6 @@
vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb0123x4567, va5x01234567, 0);
vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0);
vacc5xCDEF = vdotq_lane_s32(vacc5xCDEF, vb0123xCDEF, va5x01234567, 0);
-
- if (k > 4) {
- // Load a 4x16 block of weights.
- const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
- // Multiply-accumulate: 6x4 * 4x16 --> 6x16.
- vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
- vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
- vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
- vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
- vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
- vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
- vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
- vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
- vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
- vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
- vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
- vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1);
- vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
- vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
- vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
- vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1);
- vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
- vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
- vacc4x89AB = vdotq_lane_s32(vacc4x89AB, vb4567x89AB, va4x01234567, 1);
- vacc4xCDEF = vdotq_lane_s32(vacc4xCDEF, vb4567xCDEF, va4x01234567, 1);
- vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
- vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
- vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb4567x89AB, va5x01234567, 1);
- vacc5xCDEF = vdotq_lane_s32(vacc5xCDEF, vb4567xCDEF, va5x01234567, 1);
- }
}
p -= 6 * sizeof(void*);
} while (p != 0);
diff --git a/src/qs8-igemm/gen/6x8c4-minmax-neondot.c b/src/qs8-igemm/gen/6x8c4-minmax-neondot.c
index b4c3bf2..41db255 100644
--- a/src/qs8-igemm/gen/6x8c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/6x8c4-minmax-neondot.c
@@ -151,7 +151,7 @@
k -= 8 * sizeof(int8_t);
}
- // Handle up to 6 final positions of `k`
+ // Handle up to 4 final positions of `k`
if XNN_UNLIKELY(k != 0) {
// Load a 6x4 block of activations.
const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -178,26 +178,6 @@
vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb0123x4567, va4x01234567, 0);
vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb0123x0123, va5x01234567, 0);
vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb0123x4567, va5x01234567, 0);
-
- if (k > 4) {
- // Load a 4x8 block of weights.
- const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
- // Multiply-accumulate: 6x4 * 4x8 --> 6x8.
- vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
- vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
- vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
- vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
- vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
- vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
- vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
- vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
- vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
- vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
- vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
- vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
- }
}
p -= 6 * sizeof(void*);
} while (p != 0);
diff --git a/src/qs8-igemm/gen/8x16c4-minmax-neondot.c b/src/qs8-igemm/gen/8x16c4-minmax-neondot.c
index 001ce33..cbab6ea 100644
--- a/src/qs8-igemm/gen/8x16c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/8x16c4-minmax-neondot.c
@@ -233,7 +233,7 @@
k -= 8 * sizeof(int8_t);
}
- // Handle up to 6 final positions of `k`
+ // Handle up to 4 final positions of `k`
if XNN_UNLIKELY(k != 0) {
// Load a 8x4 block of activations.
const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -284,48 +284,6 @@
vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb0123x4567, va7x01234567, 0);
vacc7x89AB = vdotq_lane_s32(vacc7x89AB, vb0123x89AB, va7x01234567, 0);
vacc7xCDEF = vdotq_lane_s32(vacc7xCDEF, vb0123xCDEF, va7x01234567, 0);
-
- if (k > 4) {
- // Load a 4x16 block of weights.
- const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
- // Multiply-accumulate: 8x4 * 4x16 --> 8x16.
- vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
- vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
- vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
- vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
- vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
- vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
- vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
- vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
- vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
- vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
- vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
- vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1);
- vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
- vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
- vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
- vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1);
- vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
- vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
- vacc4x89AB = vdotq_lane_s32(vacc4x89AB, vb4567x89AB, va4x01234567, 1);
- vacc4xCDEF = vdotq_lane_s32(vacc4xCDEF, vb4567xCDEF, va4x01234567, 1);
- vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
- vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
- vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb4567x89AB, va5x01234567, 1);
- vacc5xCDEF = vdotq_lane_s32(vacc5xCDEF, vb4567xCDEF, va5x01234567, 1);
- vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb4567x0123, va6x01234567, 1);
- vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1);
- vacc6x89AB = vdotq_lane_s32(vacc6x89AB, vb4567x89AB, va6x01234567, 1);
- vacc6xCDEF = vdotq_lane_s32(vacc6xCDEF, vb4567xCDEF, va6x01234567, 1);
- vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb4567x0123, va7x01234567, 1);
- vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb4567x4567, va7x01234567, 1);
- vacc7x89AB = vdotq_lane_s32(vacc7x89AB, vb4567x89AB, va7x01234567, 1);
- vacc7xCDEF = vdotq_lane_s32(vacc7xCDEF, vb4567xCDEF, va7x01234567, 1);
- }
}
p -= 8 * sizeof(void*);
} while (p != 0);
diff --git a/src/qs8-igemm/gen/8x8c4-minmax-neondot.c b/src/qs8-igemm/gen/8x8c4-minmax-neondot.c
index 9ce9937..0d680cc 100644
--- a/src/qs8-igemm/gen/8x8c4-minmax-neondot.c
+++ b/src/qs8-igemm/gen/8x8c4-minmax-neondot.c
@@ -181,7 +181,7 @@
k -= 8 * sizeof(int8_t);
}
- // Handle up to 6 final positions of `k`
+ // Handle up to 4 final positions of `k`
if XNN_UNLIKELY(k != 0) {
// Load a 8x4 block of activations.
const int8x8_t va0x01234567 = vld1_s8(a0);
@@ -214,30 +214,6 @@
vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb0123x4567, va6x01234567, 0);
vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb0123x0123, va7x01234567, 0);
vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb0123x4567, va7x01234567, 0);
-
- if (k > 4) {
- // Load a 4x8 block of weights.
- const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
- const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
-
- // Multiply-accumulate: 8x4 * 4x8 --> 8x8.
- vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
- vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
- vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
- vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
- vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
- vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
- vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
- vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
- vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
- vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
- vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
- vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
- vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb4567x0123, va6x01234567, 1);
- vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1);
- vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb4567x0123, va7x01234567, 1);
- vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb4567x4567, va7x01234567, 1);
- }
}
p -= 8 * sizeof(void*);
} while (p != 0);