// Auto-generated file. Do not edit!
// Template: src/qs8-igemm/c2-neon-mull-shuffle.c.in
// Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/gemm.h>
#include <xnnpack/math.h>

Frank Barcharde22685a2021-11-12 11:36:58 -080018void xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal(
Frank Barchardc7a032d2021-11-10 12:37:49 -080019 size_t mr,
20 size_t nc,
21 size_t kc,
22 size_t ks,
23 const int8_t** restrict a,
24 const void* restrict w,
25 int8_t* restrict c,
26 size_t cm_stride,
27 size_t cn_stride,
28 size_t a_offset,
29 const int8_t* zero,
30 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
31{
32 assert(mr != 0);
33 assert(mr <= 4);
34 assert(nc != 0);
35 assert(kc != 0);
36 assert(ks != 0);
37 assert(ks % (4 * sizeof(void*)) == 0);
38 assert(a_offset % sizeof(int8_t) == 0);
39 assert(a != NULL);
40 assert(w != NULL);
41 assert(c != NULL);
42
43 int8_t* c0 = c;
44 int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
45 if XNN_UNPREDICTABLE(mr < 2) {
46 c1 = c0;
47 }
48 int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
49 if XNN_UNPREDICTABLE(mr <= 2) {
50 c2 = c1;
51 }
52 int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
53 if XNN_UNPREDICTABLE(mr != 4) {
54 c3 = c2;
55 }
56
57 do {
58 int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
59 int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
60 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
61 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
62 int32x4_t vacc1x0123 = vacc0x0123;
63 int32x4_t vacc1x4567 = vacc0x4567;
64 int32x4_t vacc1x89AB = vacc0x89AB;
65 int32x4_t vacc1xCDEF = vacc0xCDEF;
66 int32x4_t vacc2x0123 = vacc0x0123;
67 int32x4_t vacc2x4567 = vacc0x4567;
68 int32x4_t vacc2x89AB = vacc0x89AB;
69 int32x4_t vacc2xCDEF = vacc0xCDEF;
70 int32x4_t vacc3x0123 = vacc0x0123;
71 int32x4_t vacc3x4567 = vacc0x4567;
72 int32x4_t vacc3x89AB = vacc0x89AB;
73 int32x4_t vacc3xCDEF = vacc0xCDEF;
74
75 size_t p = ks;
76 do {
77 const int8_t* restrict a0 = a[0];
78 if XNN_UNPREDICTABLE(a0 != zero) {
79 a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
80 }
81 const int8_t* restrict a1 = a[1];
82 if XNN_UNPREDICTABLE(a1 != zero) {
83 a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
84 }
85 const int8_t* restrict a2 = a[2];
86 if XNN_UNPREDICTABLE(a2 != zero) {
87 a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
88 }
89 const int8_t* restrict a3 = a[3];
90 if XNN_UNPREDICTABLE(a3 != zero) {
91 a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
92 }
93 a += 4;
94
95 size_t k = kc;
96
97 while (k >= 16 * sizeof(int8_t)) {
98 int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
99 int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
100 int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
101 int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
102 int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
103 int8x8_t va2x1 = vld1_s8(a2); a2 += 8;
104 int8x8_t va3x0 = vld1_s8(a3); a3 += 8;
105 int8x8_t va3x1 = vld1_s8(a3); a3 += 8;
106
107 const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
108 const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
109 const int8x8_t vb89ABc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
110 const int8x8_t vbCDEFc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
111 const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
112 const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
113 const int8x8_t vb89ABc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
114 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
115 const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
116 const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
117 const int8x8_t vb89ABc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
118 const int8x8_t vbCDEFc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
119 const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
120 const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
121 const int8x8_t vb89ABc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
122 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
123
124 int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
125 int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1x0);
126 int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0x0, va2x0);
127 int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, va3x0);
128 const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
129 vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, va0x1);
130 vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, va1x1);
131 vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, va2x1);
132 vprod3x0123c0 = vmlal_s8(vprod3x0123c0, vb0123c0x1, va3x1);
133 vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
134 vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
135 vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
136 vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
137 int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
138 int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, va1x0);
139 int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0x0, va2x0);
140 int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, va3x0);
141 const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
142 vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, va0x1);
143 vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, va1x1);
144 vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, va2x1);
145 vprod3x4567c0 = vmlal_s8(vprod3x4567c0, vb4567c0x1, va3x1);
146 vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
147 vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
148 vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
149 vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
150 int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0x0, va0x0);
151 int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0x0, va1x0);
152 int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0x0, va2x0);
153 int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0x0, va3x0);
154 const int8x8_t vb89ABc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
155 vprod0x89ABc0 = vmlal_s8(vprod0x89ABc0, vb89ABc0x1, va0x1);
156 vprod1x89ABc0 = vmlal_s8(vprod1x89ABc0, vb89ABc0x1, va1x1);
157 vprod2x89ABc0 = vmlal_s8(vprod2x89ABc0, vb89ABc0x1, va2x1);
158 vprod3x89ABc0 = vmlal_s8(vprod3x89ABc0, vb89ABc0x1, va3x1);
159 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
160 vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
161 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
162 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
163 int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0x0, va0x0);
164 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1x0);
165 int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0x0, va2x0);
166 int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0x0, va3x0);
167 const int8x8_t vbCDEFc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
168 vprod0xCDEFc0 = vmlal_s8(vprod0xCDEFc0, vbCDEFc0x1, va0x1);
169 vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, va1x1);
170 vprod2xCDEFc0 = vmlal_s8(vprod2xCDEFc0, vbCDEFc0x1, va2x1);
171 vprod3xCDEFc0 = vmlal_s8(vprod3xCDEFc0, vbCDEFc0x1, va3x1);
172 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
173 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
174 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
175 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
176 va0x0 = vext_s8(va0x0, va0x0, 2);
177 va0x1 = vext_s8(va0x1, va0x1, 2);
178 va1x0 = vext_s8(va1x0, va1x0, 2);
179 va1x1 = vext_s8(va1x1, va1x1, 2);
180 va2x0 = vext_s8(va2x0, va2x0, 2);
181 va2x1 = vext_s8(va2x1, va2x1, 2);
182 va3x0 = vext_s8(va3x0, va3x0, 2);
183 va3x1 = vext_s8(va3x1, va3x1, 2);
184 int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
185 int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, va1x0);
186 int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1x0, va2x0);
187 int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, va3x0);
188 const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
189 vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, va0x1);
190 vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, va1x1);
191 vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, va2x1);
192 vprod3x0123c1 = vmlal_s8(vprod3x0123c1, vb0123c1x1, va3x1);
193 vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
194 vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
195 vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
196 vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
197 int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
198 int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, va1x0);
199 int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1x0, va2x0);
200 int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, va3x0);
201 const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
202 vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, va0x1);
203 vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, va1x1);
204 vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, va2x1);
205 vprod3x4567c1 = vmlal_s8(vprod3x4567c1, vb4567c1x1, va3x1);
206 vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
207 vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
208 vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
209 vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
210 int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1x0, va0x0);
211 int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1x0, va1x0);
212 int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1x0, va2x0);
213 int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1x0, va3x0);
214 const int8x8_t vb89ABc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
215 vprod0x89ABc1 = vmlal_s8(vprod0x89ABc1, vb89ABc1x1, va0x1);
216 vprod1x89ABc1 = vmlal_s8(vprod1x89ABc1, vb89ABc1x1, va1x1);
217 vprod2x89ABc1 = vmlal_s8(vprod2x89ABc1, vb89ABc1x1, va2x1);
218 vprod3x89ABc1 = vmlal_s8(vprod3x89ABc1, vb89ABc1x1, va3x1);
219 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
220 vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
221 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
222 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
223 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0);
224 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0);
225 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2x0);
226 int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3x0);
227 const int8x8_t vbCDEFc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
228 vprod0xCDEFc1 = vmlal_s8(vprod0xCDEFc1, vbCDEFc1x1, va0x1);
229 vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, va1x1);
230 vprod2xCDEFc1 = vmlal_s8(vprod2xCDEFc1, vbCDEFc1x1, va2x1);
231 vprod3xCDEFc1 = vmlal_s8(vprod3xCDEFc1, vbCDEFc1x1, va3x1);
232 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
233 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
234 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
235 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
236 va0x0 = vext_s8(va0x0, va0x0, 2);
237 va0x1 = vext_s8(va0x1, va0x1, 2);
238 va1x0 = vext_s8(va1x0, va1x0, 2);
239 va1x1 = vext_s8(va1x1, va1x1, 2);
240 va2x0 = vext_s8(va2x0, va2x0, 2);
241 va2x1 = vext_s8(va2x1, va2x1, 2);
242 va3x0 = vext_s8(va3x0, va3x0, 2);
243 va3x1 = vext_s8(va3x1, va3x1, 2);
244 int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
245 int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1x0);
246 int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2x0, va2x0);
247 int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2x0, va3x0);
248 const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
249 vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, va0x1);
250 vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, va1x1);
251 vprod2x0123c2 = vmlal_s8(vprod2x0123c2, vb0123c2x1, va2x1);
252 vprod3x0123c2 = vmlal_s8(vprod3x0123c2, vb0123c2x1, va3x1);
253 vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
254 vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
255 vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
256 vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
257 int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
258 int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1x0);
259 int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2x0, va2x0);
260 int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2x0, va3x0);
261 const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
262 vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, va0x1);
263 vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, va1x1);
264 vprod2x4567c2 = vmlal_s8(vprod2x4567c2, vb4567c2x1, va2x1);
265 vprod3x4567c2 = vmlal_s8(vprod3x4567c2, vb4567c2x1, va3x1);
266 vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
267 vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
268 vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
269 vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
270 int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2x0, va0x0);
271 int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2x0, va1x0);
272 int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2x0, va2x0);
273 int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2x0, va3x0);
274 const int8x8_t vb89ABc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
275 vprod0x89ABc2 = vmlal_s8(vprod0x89ABc2, vb89ABc2x1, va0x1);
276 vprod1x89ABc2 = vmlal_s8(vprod1x89ABc2, vb89ABc2x1, va1x1);
277 vprod2x89ABc2 = vmlal_s8(vprod2x89ABc2, vb89ABc2x1, va2x1);
278 vprod3x89ABc2 = vmlal_s8(vprod3x89ABc2, vb89ABc2x1, va3x1);
279 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
280 vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
281 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
282 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
283 int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2x0, va0x0);
284 int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2x0, va1x0);
285 int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2x0, va2x0);
286 int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2x0, va3x0);
287 const int8x8_t vbCDEFc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
288 vprod0xCDEFc2 = vmlal_s8(vprod0xCDEFc2, vbCDEFc2x1, va0x1);
289 vprod1xCDEFc2 = vmlal_s8(vprod1xCDEFc2, vbCDEFc2x1, va1x1);
290 vprod2xCDEFc2 = vmlal_s8(vprod2xCDEFc2, vbCDEFc2x1, va2x1);
291 vprod3xCDEFc2 = vmlal_s8(vprod3xCDEFc2, vbCDEFc2x1, va3x1);
292 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
293 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
294 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
295 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
296 va0x0 = vext_s8(va0x0, va0x0, 2);
297 va0x1 = vext_s8(va0x1, va0x1, 2);
298 va1x0 = vext_s8(va1x0, va1x0, 2);
299 va1x1 = vext_s8(va1x1, va1x1, 2);
300 va2x0 = vext_s8(va2x0, va2x0, 2);
301 va2x1 = vext_s8(va2x1, va2x1, 2);
302 va3x0 = vext_s8(va3x0, va3x0, 2);
303 va3x1 = vext_s8(va3x1, va3x1, 2);
304 int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0);
305 int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1x0);
306 int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3x0, va2x0);
307 int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3x0, va3x0);
308 const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
309 vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, va0x1);
310 vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, va1x1);
311 vprod2x0123c3 = vmlal_s8(vprod2x0123c3, vb0123c3x1, va2x1);
312 vprod3x0123c3 = vmlal_s8(vprod3x0123c3, vb0123c3x1, va3x1);
313 vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
314 vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
315 vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
316 vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
317 int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0);
318 int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, va1x0);
319 int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3x0, va2x0);
320 int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3x0, va3x0);
321 const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
322 vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, va0x1);
323 vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, va1x1);
324 vprod2x4567c3 = vmlal_s8(vprod2x4567c3, vb4567c3x1, va2x1);
325 vprod3x4567c3 = vmlal_s8(vprod3x4567c3, vb4567c3x1, va3x1);
326 vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
327 vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
328 vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
329 vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
330 int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3x0, va0x0);
331 int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3x0, va1x0);
332 int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3x0, va2x0);
333 int16x8_t vprod3x89ABc3 = vmull_s8(vb89ABc3x0, va3x0);
334 const int8x8_t vb89ABc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
335 vprod0x89ABc3 = vmlal_s8(vprod0x89ABc3, vb89ABc3x1, va0x1);
336 vprod1x89ABc3 = vmlal_s8(vprod1x89ABc3, vb89ABc3x1, va1x1);
337 vprod2x89ABc3 = vmlal_s8(vprod2x89ABc3, vb89ABc3x1, va2x1);
338 vprod3x89ABc3 = vmlal_s8(vprod3x89ABc3, vb89ABc3x1, va3x1);
339 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
340 vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
341 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
342 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
343 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0);
344 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1x0);
345 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2x0);
346 int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3x0, va3x0);
347 const int8x8_t vbCDEFc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
348 vprod0xCDEFc3 = vmlal_s8(vprod0xCDEFc3, vbCDEFc3x1, va0x1);
349 vprod1xCDEFc3 = vmlal_s8(vprod1xCDEFc3, vbCDEFc3x1, va1x1);
350 vprod2xCDEFc3 = vmlal_s8(vprod2xCDEFc3, vbCDEFc3x1, va2x1);
351 vprod3xCDEFc3 = vmlal_s8(vprod3xCDEFc3, vbCDEFc3x1, va3x1);
352 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
353 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
354 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
355 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3);
356
357 k -= 16 * sizeof(int8_t);
358 }
359
360 if (k >= 8 * sizeof(int8_t)) {
361 int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
362 int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
363 int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
364 int8x8_t va3x0 = vld1_s8(a3); a3 += 8;
365
366 const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
367 const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
368 const int8x8_t vb89ABc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
369 const int8x8_t vbCDEFc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
370 const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
371 const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
372 const int8x8_t vb89ABc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
373 const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
374 const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
375 const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
376 const int8x8_t vb89ABc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
377 const int8x8_t vbCDEFc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
378 const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
379 const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
380 const int8x8_t vb89ABc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
381 const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
382
383 int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
384 int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, va1x0);
385 int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0x0, va2x0);
386 int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, va3x0);
387 vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
388 vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
389 vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
390 vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
391 int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
392 int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, va1x0);
393 int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0x0, va2x0);
394 int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, va3x0);
395 vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
396 vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
397 vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
398 vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
399 int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0x0, va0x0);
400 int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0x0, va1x0);
401 int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0x0, va2x0);
402 int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0x0, va3x0);
403 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
404 vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
405 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
406 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
407 int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0x0, va0x0);
408 int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, va1x0);
409 int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0x0, va2x0);
410 int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0x0, va3x0);
411 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
412 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
413 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
414 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
415 va0x0 = vext_s8(va0x0, va0x0, 2);
416 va1x0 = vext_s8(va1x0, va1x0, 2);
417 va2x0 = vext_s8(va2x0, va2x0, 2);
418 va3x0 = vext_s8(va3x0, va3x0, 2);
419 int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
420 int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, va1x0);
421 int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1x0, va2x0);
422 int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, va3x0);
423 vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
424 vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
425 vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
426 vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
427 int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
428 int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, va1x0);
429 int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1x0, va2x0);
430 int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, va3x0);
431 vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
432 vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
433 vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
434 vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
435 int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1x0, va0x0);
436 int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1x0, va1x0);
437 int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1x0, va2x0);
438 int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1x0, va3x0);
439 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
440 vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
441 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
442 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
443 int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0);
444 int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, va1x0);
445 int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, va2x0);
446 int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, va3x0);
447 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
448 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
449 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
450 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
451 va0x0 = vext_s8(va0x0, va0x0, 2);
452 va1x0 = vext_s8(va1x0, va1x0, 2);
453 va2x0 = vext_s8(va2x0, va2x0, 2);
454 va3x0 = vext_s8(va3x0, va3x0, 2);
455 int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
456 int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, va1x0);
457 int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2x0, va2x0);
458 int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2x0, va3x0);
459 vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
460 vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
461 vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
462 vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
463 int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
464 int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, va1x0);
465 int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2x0, va2x0);
466 int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2x0, va3x0);
467 vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
468 vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
469 vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
470 vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
471 int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2x0, va0x0);
472 int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2x0, va1x0);
473 int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2x0, va2x0);
474 int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2x0, va3x0);
475 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
476 vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
477 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
478 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
479 int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2x0, va0x0);
480 int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2x0, va1x0);
481 int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2x0, va2x0);
482 int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2x0, va3x0);
483 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
484 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
485 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
486 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
487 va0x0 = vext_s8(va0x0, va0x0, 2);
488 va1x0 = vext_s8(va1x0, va1x0, 2);
489 va2x0 = vext_s8(va2x0, va2x0, 2);
490 va3x0 = vext_s8(va3x0, va3x0, 2);
491 int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, va0x0);
492 int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, va1x0);
493 int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3x0, va2x0);
494 int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3x0, va3x0);
495 vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
496 vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
497 vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
498 vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
499 int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, va0x0);
500 int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, va1x0);
501 int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3x0, va2x0);
502 int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3x0, va3x0);
503 vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
504 vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
505 vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
506 vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
507 int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3x0, va0x0);
508 int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3x0, va1x0);
509 int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3x0, va2x0);
510 int16x8_t vprod3x89ABc3 = vmull_s8(vb89ABc3x0, va3x0);
511 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
512 vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
513 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
514 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
515 int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, va0x0);
516 int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, va1x0);
517 int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, va2x0);
518 int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3x0, va3x0);
519 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
520 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
521 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
522 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3);
523
524 k -= 8 * sizeof(int8_t);
525 }
526
527 if XNN_UNLIKELY(k != 0) {
528 const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
529 const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
530 const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
531 const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
532
533 const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
534 const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
535 const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
536 const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
537
538 const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
539 vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
540 const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
541 vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
542 const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
543 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
544 const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
545 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
546 const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
547 vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
548 const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
549 vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
550 const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
551 vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
552 const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
553 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
554 const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
555 vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
556 const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
557 vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
558 const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
559 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
560 const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
561 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
562 const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
563 vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
564 const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
565 vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
566 const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
567 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
568 const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
569 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
570
571 if (k > 2 * sizeof(int8_t)) {
572 const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
573 const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
574 const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
575 const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
576
577 const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
578 vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
579 const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
580 vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
581 const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
582 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
583 const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
584 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
585 const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
586 vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
587 const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
588 vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
589 const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
590 vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
591 const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
592 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
593 const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
594 vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
595 const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
596 vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
597 const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
598 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
599 const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
600 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
601 const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
602 vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
603 const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
604 vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
605 const int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
606 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
607 const int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
608 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
609
610 if (k > 4 * sizeof(int8_t)) {
611 const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
612 const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
613 const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
614 const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
615
616 const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
617 vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
618 const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
619 vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
620 const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
621 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
622 const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
623 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
624 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
625 vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
626 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
627 vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
628 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
629 vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
630 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
631 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
632 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
633 vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
634 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
635 vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
636 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
637 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
638 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
639 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
640 const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
641 vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
642 const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
643 vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
644 const int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
645 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
646 const int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
647 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
648
649 if (k > 6 * sizeof(int8_t)) {
650 const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
651 const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
652 const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
653 const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
654
655 const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
656 vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
657 const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
658 vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
659 const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
660 vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
661 const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
662 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
663 const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
664 vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
665 const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
666 vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
667 const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
668 vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
669 const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
670 vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
671 const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
672 vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
673 const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
674 vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
675 const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
676 vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
677 const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
678 vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
679 const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
680 vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
681 const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
682 vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
683 const int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
684 vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
685 const int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
686 vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
687 }
688 }
689 }
690 }
691 p -= 4 * sizeof(void*);
692 } while (p != 0);
693
694 const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->rndnu_neon.right_pre_shift);
695 const int32x4_t vmultiplier = vld1q_dup_s32(&params->rndnu_neon.multiplier);
696 const int32x4_t vright_post_shift = vld1q_dup_s32(&params->rndnu_neon.right_post_shift);
697
698 vacc0x0123 = vshlq_s32(vacc0x0123, vright_pre_shift);
699 vacc0x4567 = vshlq_s32(vacc0x4567, vright_pre_shift);
700 vacc0x89AB = vshlq_s32(vacc0x89AB, vright_pre_shift);
701 vacc0xCDEF = vshlq_s32(vacc0xCDEF, vright_pre_shift);
702 vacc1x0123 = vshlq_s32(vacc1x0123, vright_pre_shift);
703 vacc1x4567 = vshlq_s32(vacc1x4567, vright_pre_shift);
704 vacc1x89AB = vshlq_s32(vacc1x89AB, vright_pre_shift);
705 vacc1xCDEF = vshlq_s32(vacc1xCDEF, vright_pre_shift);
706 vacc2x0123 = vshlq_s32(vacc2x0123, vright_pre_shift);
707 vacc2x4567 = vshlq_s32(vacc2x4567, vright_pre_shift);
708 vacc2x89AB = vshlq_s32(vacc2x89AB, vright_pre_shift);
709 vacc2xCDEF = vshlq_s32(vacc2xCDEF, vright_pre_shift);
710 vacc3x0123 = vshlq_s32(vacc3x0123, vright_pre_shift);
711 vacc3x4567 = vshlq_s32(vacc3x4567, vright_pre_shift);
712 vacc3x89AB = vshlq_s32(vacc3x89AB, vright_pre_shift);
713 vacc3xCDEF = vshlq_s32(vacc3xCDEF, vright_pre_shift);
714
715 vacc0x0123 = vqdmulhq_s32(vacc0x0123, vmultiplier);
716 vacc0x4567 = vqdmulhq_s32(vacc0x4567, vmultiplier);
717 vacc0x89AB = vqdmulhq_s32(vacc0x89AB, vmultiplier);
718 vacc0xCDEF = vqdmulhq_s32(vacc0xCDEF, vmultiplier);
719 vacc1x0123 = vqdmulhq_s32(vacc1x0123, vmultiplier);
720 vacc1x4567 = vqdmulhq_s32(vacc1x4567, vmultiplier);
721 vacc1x89AB = vqdmulhq_s32(vacc1x89AB, vmultiplier);
722 vacc1xCDEF = vqdmulhq_s32(vacc1xCDEF, vmultiplier);
723 vacc2x0123 = vqdmulhq_s32(vacc2x0123, vmultiplier);
724 vacc2x4567 = vqdmulhq_s32(vacc2x4567, vmultiplier);
725 vacc2x89AB = vqdmulhq_s32(vacc2x89AB, vmultiplier);
726 vacc2xCDEF = vqdmulhq_s32(vacc2xCDEF, vmultiplier);
727 vacc3x0123 = vqdmulhq_s32(vacc3x0123, vmultiplier);
728 vacc3x4567 = vqdmulhq_s32(vacc3x4567, vmultiplier);
729 vacc3x89AB = vqdmulhq_s32(vacc3x89AB, vmultiplier);
730 vacc3xCDEF = vqdmulhq_s32(vacc3xCDEF, vmultiplier);
731
732 vacc0x0123 = vrshlq_s32(vacc0x0123, vright_post_shift);
733 vacc0x4567 = vrshlq_s32(vacc0x4567, vright_post_shift);
734 vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_post_shift);
735 vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_post_shift);
736 vacc1x0123 = vrshlq_s32(vacc1x0123, vright_post_shift);
737 vacc1x4567 = vrshlq_s32(vacc1x4567, vright_post_shift);
738 vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_post_shift);
739 vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_post_shift);
740 vacc2x0123 = vrshlq_s32(vacc2x0123, vright_post_shift);
741 vacc2x4567 = vrshlq_s32(vacc2x4567, vright_post_shift);
742 vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_post_shift);
743 vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_post_shift);
744 vacc3x0123 = vrshlq_s32(vacc3x0123, vright_post_shift);
745 vacc3x4567 = vrshlq_s32(vacc3x4567, vright_post_shift);
746 vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_post_shift);
747 vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_post_shift);
748
749 const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->rndnu_neon.output_zero_point);
750#if XNN_ARCH_ARM64
751 const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
752 const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
753 const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
754 const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
755 const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
756 const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
757 const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
758 const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
759
760 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
761 int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
762 int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
763 int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
764#else
765 const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
766 const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
767 const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
768 const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
769 const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
770 const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
771 const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
772 const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
773
774 int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
775 int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
776 int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
777 int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
778#endif
779 const int8x16_t voutput_min = vld1q_dup_s8(&params->rndnu_neon.output_min);
780 const int8x16_t voutput_max = vld1q_dup_s8(&params->rndnu_neon.output_max);
781
782 vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
783 vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
784 vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
785 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
786
787 vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
788 vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
789 vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
790 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
791
792 if (nc >= 16) {
793 vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
794 vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
795 vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
796 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
797
798 c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
799 c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
800 c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
801 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
802
803 a = (const int8_t**restrict) ((uintptr_t) a - ks);
804
805 nc -= 16;
806 } else {
807 int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
808 int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
809 if (nc & 8) {
810 vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
811 vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
812 vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
813 vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
814 vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
815 vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
816 }
817 if (nc & 4) {
818 vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
819 vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
820 vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
821 vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
822 vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
823 vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
824 }
825 if (nc & 2) {
826 vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
827 vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
828 vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
829 vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
830 vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
831 vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
832 }
833 if (nc & 1) {
834 vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
835 vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
836 vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
837 vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
838 }
839
840 nc = 0;
841 }
842 } while (nc != 0);
843}