LD1R and LD2R variants of c2 microkernel

- Instead of 1 LD1 and 4 DUP, use 4 LD1R or 2 LD2R

PiperOrigin-RevId: 410613731

commit: 15eec029e49b8bf258d67122d0f5c71993c75b50 [log] [tgz]
author: Frank Barchard <fbarchard@google.com> Wed Nov 17 13:26:20 2021 -0800
committer: XNNPACK Team <xnnpack-github-robot@google.com> Wed Nov 17 13:27:12 2021 -0800
tree: 181076d657ec6699cddee3fffcd7a6219b571742
parent: 42f5c50972cccb2ee5250d821b99121f62b0430e [diff] [blame]
diff --git a/src/qs8-igemm/gen/2x8c2-minmax-fp32-neon-mlal-ld4r.c b/src/qs8-igemm/gen/2x8c2-minmax-fp32-neon-mlal-ld4r.c
index 5b5674f..47dbc29 100644
--- a/src/qs8-igemm/gen/2x8c2-minmax-fp32-neon-mlal-ld4r.c
+++ b/src/qs8-igemm/gen/2x8c2-minmax-fp32-neon-mlal-ld4r.c

@@ -85,6 +85,7 @@
         const int8x8_t va0c0x1 = vreinterpret_s8_s16(va0x1.val[0]);
         const int8x8_t va1c0x0 = vreinterpret_s8_s16(va1x0.val[0]);
         const int8x8_t va1c0x1 = vreinterpret_s8_s16(va1x1.val[0]);
+
         int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0c0x0);
         int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1c0x0);
         const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
@@ -103,6 +104,7 @@
         const int8x8_t va0c1x1 = vreinterpret_s8_s16(va0x1.val[1]);
         const int8x8_t va1c1x0 = vreinterpret_s8_s16(va1x0.val[1]);
         const int8x8_t va1c1x1 = vreinterpret_s8_s16(va1x1.val[1]);
+
         int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0c1x0);
         int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, va1c1x0);
         const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
@@ -121,6 +123,7 @@
         const int8x8_t va0c2x1 = vreinterpret_s8_s16(va0x1.val[2]);
         const int8x8_t va1c2x0 = vreinterpret_s8_s16(va1x0.val[2]);
         const int8x8_t va1c2x1 = vreinterpret_s8_s16(va1x1.val[2]);
+
         int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0c2x0);
         int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1c2x0);
         const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
@@ -139,6 +142,7 @@
         const int8x8_t va0c3x1 = vreinterpret_s8_s16(va0x1.val[3]);
         const int8x8_t va1c3x0 = vreinterpret_s8_s16(va1x0.val[3]);
         const int8x8_t va1c3x1 = vreinterpret_s8_s16(va1x1.val[3]);
+
         int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0c3x0);
         int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1c3x0);
         const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
commit	15eec029e49b8bf258d67122d0f5c71993c75b50	[log] [tgz]
author	Frank Barchard <fbarchard@google.com>	Wed Nov 17 13:26:20 2021 -0800
committer	XNNPACK Team <xnnpack-github-robot@google.com>	Wed Nov 17 13:27:12 2021 -0800
tree	181076d657ec6699cddee3fffcd7a6219b571742
parent	42f5c50972cccb2ee5250d821b99121f62b0430e [diff] [blame]