// Auto-generated file. Do not edit!
//   Template: src/qs8-igemm/c4-neondot.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/igemm.h>
#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/math.h>

19void xnn_qc8_igemm_minmax_fp32_ukernel_8x16c4__neondot(
20 size_t mr,
21 size_t nc,
22 size_t kc,
23 size_t ks,
24 const int8_t** restrict a,
25 const void* restrict w,
26 int8_t* restrict c,
27 size_t cm_stride,
28 size_t cn_stride,
29 size_t a_offset,
30 const int8_t* zero,
31 const union xnn_qs8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
32{
33 assert(mr != 0);
34 assert(mr <= 8);
35 assert(nc != 0);
36 assert(kc != 0);
37 assert(ks != 0);
38 assert(ks % (8 * sizeof(void*)) == 0);
39 assert(a_offset % sizeof(int8_t) == 0);
40 assert(a != NULL);
41 assert(w != NULL);
42 assert(c != NULL);
43
44 kc = round_up_po2(kc, 4 * sizeof(int8_t));
45 int8_t* c0 = c;
46 int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
47 if XNN_UNPREDICTABLE(mr < 2) {
48 c1 = c0;
49 }
50 int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
51 if XNN_UNPREDICTABLE(mr <= 2) {
52 c2 = c1;
53 }
54 int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
55 if XNN_UNPREDICTABLE(mr < 4) {
56 c3 = c2;
57 }
58 int8_t* c4 = (int8_t*) ((uintptr_t) c3 + cm_stride);
59 if XNN_UNPREDICTABLE(mr <= 4) {
60 c4 = c3;
61 }
62 int8_t* c5 = (int8_t*) ((uintptr_t) c4 + cm_stride);
63 if XNN_UNPREDICTABLE(mr < 6) {
64 c5 = c4;
65 }
66 int8_t* c6 = (int8_t*) ((uintptr_t) c5 + cm_stride);
67 if XNN_UNPREDICTABLE(mr <= 6) {
68 c6 = c5;
69 }
70 int8_t* c7 = (int8_t*) ((uintptr_t) c6 + cm_stride);
71 if XNN_UNPREDICTABLE(mr != 8) {
72 c7 = c6;
73 }
74
75 do {
76 int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
77 int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
78 int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
79 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
80 int32x4_t vacc1x0123 = vacc0x0123;
81 int32x4_t vacc1x4567 = vacc0x4567;
82 int32x4_t vacc1x89AB = vacc0x89AB;
83 int32x4_t vacc1xCDEF = vacc0xCDEF;
84 int32x4_t vacc2x0123 = vacc0x0123;
85 int32x4_t vacc2x4567 = vacc0x4567;
86 int32x4_t vacc2x89AB = vacc0x89AB;
87 int32x4_t vacc2xCDEF = vacc0xCDEF;
88 int32x4_t vacc3x0123 = vacc0x0123;
89 int32x4_t vacc3x4567 = vacc0x4567;
90 int32x4_t vacc3x89AB = vacc0x89AB;
91 int32x4_t vacc3xCDEF = vacc0xCDEF;
92 int32x4_t vacc4x0123 = vacc0x0123;
93 int32x4_t vacc4x4567 = vacc0x4567;
94 int32x4_t vacc4x89AB = vacc0x89AB;
95 int32x4_t vacc4xCDEF = vacc0xCDEF;
96 int32x4_t vacc5x0123 = vacc0x0123;
97 int32x4_t vacc5x4567 = vacc0x4567;
98 int32x4_t vacc5x89AB = vacc0x89AB;
99 int32x4_t vacc5xCDEF = vacc0xCDEF;
100 int32x4_t vacc6x0123 = vacc0x0123;
101 int32x4_t vacc6x4567 = vacc0x4567;
102 int32x4_t vacc6x89AB = vacc0x89AB;
103 int32x4_t vacc6xCDEF = vacc0xCDEF;
104 int32x4_t vacc7x0123 = vacc0x0123;
105 int32x4_t vacc7x4567 = vacc0x4567;
106 int32x4_t vacc7x89AB = vacc0x89AB;
107 int32x4_t vacc7xCDEF = vacc0xCDEF;
108
109 size_t p = ks;
110 do {
111 const int8_t* restrict a0 = a[0];
112 if XNN_UNPREDICTABLE(a0 != zero) {
113 a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
114 }
115 const int8_t* restrict a1 = a[1];
116 if XNN_UNPREDICTABLE(a1 != zero) {
117 a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
118 }
119 const int8_t* restrict a2 = a[2];
120 if XNN_UNPREDICTABLE(a2 != zero) {
121 a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
122 }
123 const int8_t* restrict a3 = a[3];
124 if XNN_UNPREDICTABLE(a3 != zero) {
125 a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
126 }
127 const int8_t* restrict a4 = a[4];
128 if XNN_UNPREDICTABLE(a4 != zero) {
129 a4 = (const int8_t*) ((uintptr_t) a4 + a_offset);
130 }
131 const int8_t* restrict a5 = a[5];
132 if XNN_UNPREDICTABLE(a5 != zero) {
133 a5 = (const int8_t*) ((uintptr_t) a5 + a_offset);
134 }
135 const int8_t* restrict a6 = a[6];
136 if XNN_UNPREDICTABLE(a6 != zero) {
137 a6 = (const int8_t*) ((uintptr_t) a6 + a_offset);
138 }
139 const int8_t* restrict a7 = a[7];
140 if XNN_UNPREDICTABLE(a7 != zero) {
141 a7 = (const int8_t*) ((uintptr_t) a7 + a_offset);
142 }
143 a += 8;
144
145 // Inner accumulation loop along the 16 columns.
146 size_t k = kc;
147 // 2x partial unrolled loop to load 8 bytes at a time.
148 while (k >= 8 * sizeof(int8_t)) {
149 // Load a 8x8 block of activations.
150 const int8x8_t va0x01234567 = vld1_s8(a0); a0 += 8;
151 const int8x8_t va1x01234567 = vld1_s8(a1); a1 += 8;
152 const int8x8_t va2x01234567 = vld1_s8(a2); a2 += 8;
153 const int8x8_t va3x01234567 = vld1_s8(a3); a3 += 8;
154 const int8x8_t va4x01234567 = vld1_s8(a4); a4 += 8;
155 const int8x8_t va5x01234567 = vld1_s8(a5); a5 += 8;
156 const int8x8_t va6x01234567 = vld1_s8(a6); a6 += 8;
157 const int8x8_t va7x01234567 = vld1_s8(a7); a7 += 8;
158
159 // Load a 8x16 block of weights.
160 const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
161 const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
162 const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
163 const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
164 const int8x16_t vb4567x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
165 const int8x16_t vb4567x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
166 const int8x16_t vb4567x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
167 const int8x16_t vb4567xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
168
169 // Multiply-accumulate: 8x8 * 8x16 --> 8x16.
170 vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
171 vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb0123x4567, va0x01234567, 0);
172 vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb0123x89AB, va0x01234567, 0);
173 vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb0123xCDEF, va0x01234567, 0);
174 vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
175 vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
176 vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb0123x89AB, va1x01234567, 0);
177 vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb0123xCDEF, va1x01234567, 0);
178 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0);
179 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0);
180 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0);
181 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0);
182 vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb0123x0123, va3x01234567, 0);
183 vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb0123x4567, va3x01234567, 0);
184 vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0);
185 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0);
186 vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb0123x0123, va4x01234567, 0);
187 vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb0123x4567, va4x01234567, 0);
188 vacc4x89AB = vdotq_lane_s32(vacc4x89AB, vb0123x89AB, va4x01234567, 0);
189 vacc4xCDEF = vdotq_lane_s32(vacc4xCDEF, vb0123xCDEF, va4x01234567, 0);
190 vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb0123x0123, va5x01234567, 0);
191 vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb0123x4567, va5x01234567, 0);
192 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0);
193 vacc5xCDEF = vdotq_lane_s32(vacc5xCDEF, vb0123xCDEF, va5x01234567, 0);
194 vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb0123x0123, va6x01234567, 0);
195 vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb0123x4567, va6x01234567, 0);
196 vacc6x89AB = vdotq_lane_s32(vacc6x89AB, vb0123x89AB, va6x01234567, 0);
197 vacc6xCDEF = vdotq_lane_s32(vacc6xCDEF, vb0123xCDEF, va6x01234567, 0);
198 vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb0123x0123, va7x01234567, 0);
199 vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb0123x4567, va7x01234567, 0);
200 vacc7x89AB = vdotq_lane_s32(vacc7x89AB, vb0123x89AB, va7x01234567, 0);
201 vacc7xCDEF = vdotq_lane_s32(vacc7xCDEF, vb0123xCDEF, va7x01234567, 0);
202 vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb4567x0123, va0x01234567, 1);
203 vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb4567x4567, va0x01234567, 1);
204 vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb4567x89AB, va0x01234567, 1);
205 vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
206 vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb4567x0123, va1x01234567, 1);
207 vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb4567x4567, va1x01234567, 1);
208 vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb4567x89AB, va1x01234567, 1);
209 vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
210 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb4567x0123, va2x01234567, 1);
211 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb4567x4567, va2x01234567, 1);
212 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb4567x89AB, va2x01234567, 1);
213 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb4567xCDEF, va2x01234567, 1);
214 vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb4567x0123, va3x01234567, 1);
215 vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb4567x4567, va3x01234567, 1);
216 vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb4567x89AB, va3x01234567, 1);
217 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb4567xCDEF, va3x01234567, 1);
218 vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb4567x0123, va4x01234567, 1);
219 vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb4567x4567, va4x01234567, 1);
220 vacc4x89AB = vdotq_lane_s32(vacc4x89AB, vb4567x89AB, va4x01234567, 1);
221 vacc4xCDEF = vdotq_lane_s32(vacc4xCDEF, vb4567xCDEF, va4x01234567, 1);
222 vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb4567x0123, va5x01234567, 1);
223 vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb4567x4567, va5x01234567, 1);
224 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb4567x89AB, va5x01234567, 1);
225 vacc5xCDEF = vdotq_lane_s32(vacc5xCDEF, vb4567xCDEF, va5x01234567, 1);
226 vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb4567x0123, va6x01234567, 1);
227 vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb4567x4567, va6x01234567, 1);
228 vacc6x89AB = vdotq_lane_s32(vacc6x89AB, vb4567x89AB, va6x01234567, 1);
229 vacc6xCDEF = vdotq_lane_s32(vacc6xCDEF, vb4567xCDEF, va6x01234567, 1);
230 vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb4567x0123, va7x01234567, 1);
231 vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb4567x4567, va7x01234567, 1);
232 vacc7x89AB = vdotq_lane_s32(vacc7x89AB, vb4567x89AB, va7x01234567, 1);
233 vacc7xCDEF = vdotq_lane_s32(vacc7xCDEF, vb4567xCDEF, va7x01234567, 1);
234
235 k -= 8 * sizeof(int8_t);
236 }
237 // Handle up to 4 final positions of `k`
238 if XNN_UNLIKELY(k != 0) {
239 // Load a 8x4 block of activations.
240 const int8x8_t va0x01234567 = vld1_s8(a0);
241 const int8x8_t va1x01234567 = vld1_s8(a1);
242 const int8x8_t va2x01234567 = vld1_s8(a2);
243 const int8x8_t va3x01234567 = vld1_s8(a3);
244 const int8x8_t va4x01234567 = vld1_s8(a4);
245 const int8x8_t va5x01234567 = vld1_s8(a5);
246 const int8x8_t va6x01234567 = vld1_s8(a6);
247 const int8x8_t va7x01234567 = vld1_s8(a7);
248
249 // Load a 4x16 block of weights.
250 const int8x16_t vb0123x0123 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
251 const int8x16_t vb0123x4567 = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
252 const int8x16_t vb0123x89AB = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
253 const int8x16_t vb0123xCDEF = vld1q_s8(w); w = (const void*) ((const int8_t*) w + 16);
254
255 // Multiply-accumulate: 8x4 * 4x16 --> 8x16.
256 vacc0x0123 = vdotq_lane_s32(vacc0x0123, vb0123x0123, va0x01234567, 0);
257 vacc0x4567 = vdotq_lane_s32(vacc0x4567, vb0123x4567, va0x01234567, 0);
258 vacc0x89AB = vdotq_lane_s32(vacc0x89AB, vb0123x89AB, va0x01234567, 0);
259 vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb0123xCDEF, va0x01234567, 0);
260 vacc1x0123 = vdotq_lane_s32(vacc1x0123, vb0123x0123, va1x01234567, 0);
261 vacc1x4567 = vdotq_lane_s32(vacc1x4567, vb0123x4567, va1x01234567, 0);
262 vacc1x89AB = vdotq_lane_s32(vacc1x89AB, vb0123x89AB, va1x01234567, 0);
263 vacc1xCDEF = vdotq_lane_s32(vacc1xCDEF, vb0123xCDEF, va1x01234567, 0);
264 vacc2x0123 = vdotq_lane_s32(vacc2x0123, vb0123x0123, va2x01234567, 0);
265 vacc2x4567 = vdotq_lane_s32(vacc2x4567, vb0123x4567, va2x01234567, 0);
266 vacc2x89AB = vdotq_lane_s32(vacc2x89AB, vb0123x89AB, va2x01234567, 0);
267 vacc2xCDEF = vdotq_lane_s32(vacc2xCDEF, vb0123xCDEF, va2x01234567, 0);
268 vacc3x0123 = vdotq_lane_s32(vacc3x0123, vb0123x0123, va3x01234567, 0);
269 vacc3x4567 = vdotq_lane_s32(vacc3x4567, vb0123x4567, va3x01234567, 0);
270 vacc3x89AB = vdotq_lane_s32(vacc3x89AB, vb0123x89AB, va3x01234567, 0);
271 vacc3xCDEF = vdotq_lane_s32(vacc3xCDEF, vb0123xCDEF, va3x01234567, 0);
272 vacc4x0123 = vdotq_lane_s32(vacc4x0123, vb0123x0123, va4x01234567, 0);
273 vacc4x4567 = vdotq_lane_s32(vacc4x4567, vb0123x4567, va4x01234567, 0);
274 vacc4x89AB = vdotq_lane_s32(vacc4x89AB, vb0123x89AB, va4x01234567, 0);
275 vacc4xCDEF = vdotq_lane_s32(vacc4xCDEF, vb0123xCDEF, va4x01234567, 0);
276 vacc5x0123 = vdotq_lane_s32(vacc5x0123, vb0123x0123, va5x01234567, 0);
277 vacc5x4567 = vdotq_lane_s32(vacc5x4567, vb0123x4567, va5x01234567, 0);
278 vacc5x89AB = vdotq_lane_s32(vacc5x89AB, vb0123x89AB, va5x01234567, 0);
279 vacc5xCDEF = vdotq_lane_s32(vacc5xCDEF, vb0123xCDEF, va5x01234567, 0);
280 vacc6x0123 = vdotq_lane_s32(vacc6x0123, vb0123x0123, va6x01234567, 0);
281 vacc6x4567 = vdotq_lane_s32(vacc6x4567, vb0123x4567, va6x01234567, 0);
282 vacc6x89AB = vdotq_lane_s32(vacc6x89AB, vb0123x89AB, va6x01234567, 0);
283 vacc6xCDEF = vdotq_lane_s32(vacc6xCDEF, vb0123xCDEF, va6x01234567, 0);
284 vacc7x0123 = vdotq_lane_s32(vacc7x0123, vb0123x0123, va7x01234567, 0);
285 vacc7x4567 = vdotq_lane_s32(vacc7x4567, vb0123x4567, va7x01234567, 0);
286 vacc7x89AB = vdotq_lane_s32(vacc7x89AB, vb0123x89AB, va7x01234567, 0);
287 vacc7xCDEF = vdotq_lane_s32(vacc7xCDEF, vb0123xCDEF, va7x01234567, 0);
288 }
289 p -= 8 * sizeof(void*);
290 } while (p != 0);
291
292 float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
293 float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);
294 float32x4_t vfpacc0x89AB = vcvtq_f32_s32(vacc0x89AB);
295 float32x4_t vfpacc0xCDEF = vcvtq_f32_s32(vacc0xCDEF);
296 float32x4_t vfpacc1x0123 = vcvtq_f32_s32(vacc1x0123);
297 float32x4_t vfpacc1x4567 = vcvtq_f32_s32(vacc1x4567);
298 float32x4_t vfpacc1x89AB = vcvtq_f32_s32(vacc1x89AB);
299 float32x4_t vfpacc1xCDEF = vcvtq_f32_s32(vacc1xCDEF);
300 float32x4_t vfpacc2x0123 = vcvtq_f32_s32(vacc2x0123);
301 float32x4_t vfpacc2x4567 = vcvtq_f32_s32(vacc2x4567);
302 float32x4_t vfpacc2x89AB = vcvtq_f32_s32(vacc2x89AB);
303 float32x4_t vfpacc2xCDEF = vcvtq_f32_s32(vacc2xCDEF);
304 float32x4_t vfpacc3x0123 = vcvtq_f32_s32(vacc3x0123);
305 float32x4_t vfpacc3x4567 = vcvtq_f32_s32(vacc3x4567);
306 float32x4_t vfpacc3x89AB = vcvtq_f32_s32(vacc3x89AB);
307 float32x4_t vfpacc3xCDEF = vcvtq_f32_s32(vacc3xCDEF);
308 float32x4_t vfpacc4x0123 = vcvtq_f32_s32(vacc4x0123);
309 float32x4_t vfpacc4x4567 = vcvtq_f32_s32(vacc4x4567);
310 float32x4_t vfpacc4x89AB = vcvtq_f32_s32(vacc4x89AB);
311 float32x4_t vfpacc4xCDEF = vcvtq_f32_s32(vacc4xCDEF);
312 float32x4_t vfpacc5x0123 = vcvtq_f32_s32(vacc5x0123);
313 float32x4_t vfpacc5x4567 = vcvtq_f32_s32(vacc5x4567);
314 float32x4_t vfpacc5x89AB = vcvtq_f32_s32(vacc5x89AB);
315 float32x4_t vfpacc5xCDEF = vcvtq_f32_s32(vacc5xCDEF);
316 float32x4_t vfpacc6x0123 = vcvtq_f32_s32(vacc6x0123);
317 float32x4_t vfpacc6x4567 = vcvtq_f32_s32(vacc6x4567);
318 float32x4_t vfpacc6x89AB = vcvtq_f32_s32(vacc6x89AB);
319 float32x4_t vfpacc6xCDEF = vcvtq_f32_s32(vacc6xCDEF);
320 float32x4_t vfpacc7x0123 = vcvtq_f32_s32(vacc7x0123);
321 float32x4_t vfpacc7x4567 = vcvtq_f32_s32(vacc7x4567);
322 float32x4_t vfpacc7x89AB = vcvtq_f32_s32(vacc7x89AB);
323 float32x4_t vfpacc7xCDEF = vcvtq_f32_s32(vacc7xCDEF);
324
325 const float32x4_t vscale0123 = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4);
326 vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale0123);
327 vfpacc1x0123 = vmulq_f32(vfpacc1x0123, vscale0123);
328 vfpacc2x0123 = vmulq_f32(vfpacc2x0123, vscale0123);
329 vfpacc3x0123 = vmulq_f32(vfpacc3x0123, vscale0123);
330 vfpacc4x0123 = vmulq_f32(vfpacc4x0123, vscale0123);
331 vfpacc5x0123 = vmulq_f32(vfpacc5x0123, vscale0123);
332 vfpacc6x0123 = vmulq_f32(vfpacc6x0123, vscale0123);
333 vfpacc7x0123 = vmulq_f32(vfpacc7x0123, vscale0123);
334 const float32x4_t vscale4567 = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4);
335 vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale4567);
336 vfpacc1x4567 = vmulq_f32(vfpacc1x4567, vscale4567);
337 vfpacc2x4567 = vmulq_f32(vfpacc2x4567, vscale4567);
338 vfpacc3x4567 = vmulq_f32(vfpacc3x4567, vscale4567);
339 vfpacc4x4567 = vmulq_f32(vfpacc4x4567, vscale4567);
340 vfpacc5x4567 = vmulq_f32(vfpacc5x4567, vscale4567);
341 vfpacc6x4567 = vmulq_f32(vfpacc6x4567, vscale4567);
342 vfpacc7x4567 = vmulq_f32(vfpacc7x4567, vscale4567);
343 const float32x4_t vscale89AB = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4);
344 vfpacc0x89AB = vmulq_f32(vfpacc0x89AB, vscale89AB);
345 vfpacc1x89AB = vmulq_f32(vfpacc1x89AB, vscale89AB);
346 vfpacc2x89AB = vmulq_f32(vfpacc2x89AB, vscale89AB);
347 vfpacc3x89AB = vmulq_f32(vfpacc3x89AB, vscale89AB);
348 vfpacc4x89AB = vmulq_f32(vfpacc4x89AB, vscale89AB);
349 vfpacc5x89AB = vmulq_f32(vfpacc5x89AB, vscale89AB);
350 vfpacc6x89AB = vmulq_f32(vfpacc6x89AB, vscale89AB);
351 vfpacc7x89AB = vmulq_f32(vfpacc7x89AB, vscale89AB);
352 const float32x4_t vscaleCDEF = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4);
353 vfpacc0xCDEF = vmulq_f32(vfpacc0xCDEF, vscaleCDEF);
354 vfpacc1xCDEF = vmulq_f32(vfpacc1xCDEF, vscaleCDEF);
355 vfpacc2xCDEF = vmulq_f32(vfpacc2xCDEF, vscaleCDEF);
356 vfpacc3xCDEF = vmulq_f32(vfpacc3xCDEF, vscaleCDEF);
357 vfpacc4xCDEF = vmulq_f32(vfpacc4xCDEF, vscaleCDEF);
358 vfpacc5xCDEF = vmulq_f32(vfpacc5xCDEF, vscaleCDEF);
359 vfpacc6xCDEF = vmulq_f32(vfpacc6xCDEF, vscaleCDEF);
360 vfpacc7xCDEF = vmulq_f32(vfpacc7xCDEF, vscaleCDEF);
361
362 vacc0x0123 = vcvtnq_s32_f32(vfpacc0x0123);
363 vacc0x4567 = vcvtnq_s32_f32(vfpacc0x4567);
364 vacc0x89AB = vcvtnq_s32_f32(vfpacc0x89AB);
365 vacc0xCDEF = vcvtnq_s32_f32(vfpacc0xCDEF);
366 vacc1x0123 = vcvtnq_s32_f32(vfpacc1x0123);
367 vacc1x4567 = vcvtnq_s32_f32(vfpacc1x4567);
368 vacc1x89AB = vcvtnq_s32_f32(vfpacc1x89AB);
369 vacc1xCDEF = vcvtnq_s32_f32(vfpacc1xCDEF);
370 vacc2x0123 = vcvtnq_s32_f32(vfpacc2x0123);
371 vacc2x4567 = vcvtnq_s32_f32(vfpacc2x4567);
372 vacc2x89AB = vcvtnq_s32_f32(vfpacc2x89AB);
373 vacc2xCDEF = vcvtnq_s32_f32(vfpacc2xCDEF);
374 vacc3x0123 = vcvtnq_s32_f32(vfpacc3x0123);
375 vacc3x4567 = vcvtnq_s32_f32(vfpacc3x4567);
376 vacc3x89AB = vcvtnq_s32_f32(vfpacc3x89AB);
377 vacc3xCDEF = vcvtnq_s32_f32(vfpacc3xCDEF);
378 vacc4x0123 = vcvtnq_s32_f32(vfpacc4x0123);
379 vacc4x4567 = vcvtnq_s32_f32(vfpacc4x4567);
380 vacc4x89AB = vcvtnq_s32_f32(vfpacc4x89AB);
381 vacc4xCDEF = vcvtnq_s32_f32(vfpacc4xCDEF);
382 vacc5x0123 = vcvtnq_s32_f32(vfpacc5x0123);
383 vacc5x4567 = vcvtnq_s32_f32(vfpacc5x4567);
384 vacc5x89AB = vcvtnq_s32_f32(vfpacc5x89AB);
385 vacc5xCDEF = vcvtnq_s32_f32(vfpacc5xCDEF);
386 vacc6x0123 = vcvtnq_s32_f32(vfpacc6x0123);
387 vacc6x4567 = vcvtnq_s32_f32(vfpacc6x4567);
388 vacc6x89AB = vcvtnq_s32_f32(vfpacc6x89AB);
389 vacc6xCDEF = vcvtnq_s32_f32(vfpacc6xCDEF);
390 vacc7x0123 = vcvtnq_s32_f32(vfpacc7x0123);
391 vacc7x4567 = vcvtnq_s32_f32(vfpacc7x4567);
392 vacc7x89AB = vcvtnq_s32_f32(vfpacc7x89AB);
393 vacc7xCDEF = vcvtnq_s32_f32(vfpacc7xCDEF);
394
395 const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
396#if XNN_ARCH_ARM64
397 const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
398 const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
399 const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
400 const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
401 const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
402 const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
403 const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
404 const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
405 const int16x8_t vacc4x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc4x0123), vacc4x4567), voutput_zero_point);
406 const int16x8_t vacc4x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc4x89AB), vacc4xCDEF), voutput_zero_point);
407 const int16x8_t vacc5x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc5x0123), vacc5x4567), voutput_zero_point);
408 const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc5x89AB), vacc5xCDEF), voutput_zero_point);
409 const int16x8_t vacc6x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc6x0123), vacc6x4567), voutput_zero_point);
410 const int16x8_t vacc6x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc6x89AB), vacc6xCDEF), voutput_zero_point);
411 const int16x8_t vacc7x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc7x0123), vacc7x4567), voutput_zero_point);
412 const int16x8_t vacc7x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc7x89AB), vacc7xCDEF), voutput_zero_point);
413
414 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
415 int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
416 int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
417 int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
418 int8x16_t vout4x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc4x01234567), vacc4x89ABCDEF);
419 int8x16_t vout5x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc5x01234567), vacc5x89ABCDEF);
420 int8x16_t vout6x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc6x01234567), vacc6x89ABCDEF);
421 int8x16_t vout7x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc7x01234567), vacc7x89ABCDEF);
422#else
423 const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
424 const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
425 const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
426 const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
427 const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
428 const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
429 const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
430 const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
431 const int16x8_t vacc4x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc4x0123), vqmovn_s32(vacc4x4567)), voutput_zero_point);
432 const int16x8_t vacc4x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc4x89AB), vqmovn_s32(vacc4xCDEF)), voutput_zero_point);
433 const int16x8_t vacc5x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc5x0123), vqmovn_s32(vacc5x4567)), voutput_zero_point);
434 const int16x8_t vacc5x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc5x89AB), vqmovn_s32(vacc5xCDEF)), voutput_zero_point);
435 const int16x8_t vacc6x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc6x0123), vqmovn_s32(vacc6x4567)), voutput_zero_point);
436 const int16x8_t vacc6x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc6x89AB), vqmovn_s32(vacc6xCDEF)), voutput_zero_point);
437 const int16x8_t vacc7x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc7x0123), vqmovn_s32(vacc7x4567)), voutput_zero_point);
438 const int16x8_t vacc7x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc7x89AB), vqmovn_s32(vacc7xCDEF)), voutput_zero_point);
439
440 int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
441 int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
442 int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
443 int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
444 int8x16_t vout4x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc4x01234567), vqmovn_s16(vacc4x89ABCDEF));
445 int8x16_t vout5x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc5x01234567), vqmovn_s16(vacc5x89ABCDEF));
446 int8x16_t vout6x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc6x01234567), vqmovn_s16(vacc6x89ABCDEF));
447 int8x16_t vout7x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc7x01234567), vqmovn_s16(vacc7x89ABCDEF));
448#endif
449 const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
450 const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
451
452 vout7x0123456789ABCDEF = vmaxq_s8(vout7x0123456789ABCDEF, voutput_min);
453 vout6x0123456789ABCDEF = vmaxq_s8(vout6x0123456789ABCDEF, voutput_min);
454 vout5x0123456789ABCDEF = vmaxq_s8(vout5x0123456789ABCDEF, voutput_min);
455 vout4x0123456789ABCDEF = vmaxq_s8(vout4x0123456789ABCDEF, voutput_min);
456 vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
457 vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
458 vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
459 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
460
461 vout7x0123456789ABCDEF = vminq_s8(vout7x0123456789ABCDEF, voutput_max);
462 vout6x0123456789ABCDEF = vminq_s8(vout6x0123456789ABCDEF, voutput_max);
463 vout5x0123456789ABCDEF = vminq_s8(vout5x0123456789ABCDEF, voutput_max);
464 vout4x0123456789ABCDEF = vminq_s8(vout4x0123456789ABCDEF, voutput_max);
465 vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
466 vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
467 vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
468 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
469
470 if (nc >= 16) {
471 vst1q_s8(c7 + 0, vout7x0123456789ABCDEF);
472 vst1q_s8(c6 + 0, vout6x0123456789ABCDEF);
473 vst1q_s8(c5 + 0, vout5x0123456789ABCDEF);
474 vst1q_s8(c4 + 0, vout4x0123456789ABCDEF);
475 vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
476 vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
477 vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
478 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
479
480 c7 = (int8_t*) ((uintptr_t) c7 + cn_stride);
481 c6 = (int8_t*) ((uintptr_t) c6 + cn_stride);
482 c5 = (int8_t*) ((uintptr_t) c5 + cn_stride);
483 c4 = (int8_t*) ((uintptr_t) c4 + cn_stride);
484 c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
485 c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
486 c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
487 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
488
489 a = (const int8_t**restrict) ((uintptr_t) a - ks);
490
491 nc -= 16;
492 } else {
Marat Dukhane76478b2021-06-28 16:35:40 -0700493 int8x16_t vout6x01234567_7x01234567 = vcombine_s8(vget_low_s8(vout6x0123456789ABCDEF), vget_low_s8(vout7x0123456789ABCDEF));
Frank Barchardd460d0b2021-09-08 11:35:37 -0700494 int8x16_t vout4x01234567_5x01234567 = vcombine_s8(vget_low_s8(vout4x0123456789ABCDEF), vget_low_s8(vout5x0123456789ABCDEF));
495 int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
496 int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
Marat Dukhane76478b2021-06-28 16:35:40 -0700497 if (nc & 8) {
498 vst1_s8(c7, vget_high_s8(vout6x01234567_7x01234567)); c7 += 8;
499 vst1_s8(c6, vget_low_s8(vout6x01234567_7x01234567)); c6 += 8;
500 vst1_s8(c5, vget_high_s8(vout4x01234567_5x01234567)); c5 += 8;
501 vst1_s8(c4, vget_low_s8(vout4x01234567_5x01234567)); c4 += 8;
502 vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
503 vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
504 vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
505 vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
506 vout6x01234567_7x01234567 = vcombine_s8(vget_high_s8(vout6x0123456789ABCDEF), vget_high_s8(vout7x0123456789ABCDEF));
507 vout4x01234567_5x01234567 = vcombine_s8(vget_high_s8(vout4x0123456789ABCDEF), vget_high_s8(vout5x0123456789ABCDEF));
508 vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
509 vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
510 }
511 if (nc & 4) {
512 vst1q_lane_u32(__builtin_assume_aligned(c7, 1), vreinterpretq_u32_s8(vout6x01234567_7x01234567), 2); c7 += 4;
513 vst1q_lane_u32(__builtin_assume_aligned(c6, 1), vreinterpretq_u32_s8(vout6x01234567_7x01234567), 0); c6 += 4;
514 vst1q_lane_u32(__builtin_assume_aligned(c5, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 2); c5 += 4;
515 vst1q_lane_u32(__builtin_assume_aligned(c4, 1), vreinterpretq_u32_s8(vout4x01234567_5x01234567), 0); c4 += 4;
516 vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
517 vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
518 vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
519 vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
520 vout6x01234567_7x01234567 = vextq_s8(vout6x01234567_7x01234567, vout6x01234567_7x01234567, 4);
521 vout4x01234567_5x01234567 = vextq_s8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 4);
522 vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
523 vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
524 }
525 if (nc & 2) {
526 vst1q_lane_u16(__builtin_assume_aligned(c7, 1), vreinterpretq_u16_s8(vout6x01234567_7x01234567), 4); c7 += 2;
527 vst1q_lane_u16(__builtin_assume_aligned(c6, 1), vreinterpretq_u16_s8(vout6x01234567_7x01234567), 0); c6 += 2;
528 vst1q_lane_u16(__builtin_assume_aligned(c5, 1), vreinterpretq_u16_s8(vout4x01234567_5x01234567), 4); c5 += 2;
529 vst1q_lane_u16(__builtin_assume_aligned(c4, 1), vreinterpretq_u16_s8(vout4x01234567_5x01234567), 0); c4 += 2;
530 vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
531 vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
532 vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
533 vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
534 vout6x01234567_7x01234567 = vextq_s8(vout6x01234567_7x01234567, vout6x01234567_7x01234567, 2);
535 vout4x01234567_5x01234567 = vextq_s8(vout4x01234567_5x01234567, vout4x01234567_5x01234567, 2);
536 vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
537 vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
538 }
539 if (nc & 1) {
540 vst1q_lane_s8(c7, vout6x01234567_7x01234567, 8);
541 vst1q_lane_s8(c6, vout6x01234567_7x01234567, 0);
542 vst1q_lane_s8(c5, vout4x01234567_5x01234567, 8);
543 vst1q_lane_s8(c4, vout4x01234567_5x01234567, 0);
544 vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
545 vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
546 vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
547 vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
548 }
549
550 nc = 0;
551 }
552 } while (nc != 0);
553}