blob: 41057cb9a89167c0ad8a47fa661fea76ba9c499e [file] [log] [blame]
Frank Barchard40668982021-08-24 11:12:04 -07001// Copyright 2020 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
Marat Dukhan89991902021-12-06 00:54:36 -08006$assert REQUANTIZATION in ["FP32", "RNDNU"]
Frank Barchard40668982021-08-24 11:12:04 -07007
8#include <xnnpack/assembly.h>
9
Marat Dukhan89991902021-12-06 00:54:36 -080010$REWIND_DECREMENT = {"RNDNU": 15, "FP32": 7}[REQUANTIZATION]
Frank Barchard40668982021-08-24 11:12:04 -070011# void xnn_qu8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld128(
12# size_t mr, x0
13# size_t nc, x1
14# size_t kc, x2 / x0
15# const uint8_t* restrict a, x3
16# size_t a_stride, x4
17# const void* restrict w, x5
18# uint8_t* restrict c, x6
19# size_t cm_stride, x7
20# size_t cn_stride, [sp] -> x12
Frank Barchard0c764222021-08-24 16:13:06 -070021# const union xnn_qu8_conv_minmax_params params) [sp + 8] -> x11
Frank Barchard40668982021-08-24 11:12:04 -070022
Frank Barchard40668982021-08-24 11:12:04 -070023$if REQUANTIZATION == "RNDNU":
24 # params structure is 20 bytes
25 # struct {
26 # uint8_t kernel_zero_point[4];
27 # int32_t right_pre_shift;
28 # int32_t multiplier;
29 # int32_t right_post_shift;
30 # int16_t output_zero_point;
31 # uint8_t output_min;
32 # uint8_t output_max;
33 # } rndnu_neon;
34$elif REQUANTIZATION == "FP32":
35 # params structure is 12 bytes
36 # struct {
37 # uint8_t kernel_zero_point[4];
38 # float scale;
39 # int16_t output_zero_point;
40 # uint8_t output_min;
41 # uint8_t output_max;
42 # } fp32_neonv8;
43
44# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
45
46# Register usage
47# A0 x3 v0
48# A1 x15 v1
49# A2 x13 v2
50# A3 x4 v3
51# B x5 v4 v5 v6 v7
52# C0 x6 v16 v20 v24 v28
53# C1 x8 v17 v21 v25 v29
54# C2 x9 v18 v22 v26 v30
55# C3 x7 v19 v23 v27 v31
56# zero_point v8 v12 v13 v14 v15
57# unused v9 v10 v11
58
// QU8 GEMM microkernel producing up to 4 rows x 16 columns of 8-bit unsigned
// output per pass over label 0.
//
// Structure:
//   0:  per 16-column tile - load the 16 int32 biases from w into v16..v31
//       (replicated for each of the 4 rows) and clear v12-v15.
//   1:  main K loop, 16 bytes of each A row per iteration, UDOT against
//       256 bytes of packed weights.
//   3:/4: K remainder of 8 and/or 4 bytes (kc is rounded up to a multiple of 4).
//   2:  subtract the kernel_zero_point * sum(A row) correction held in v12-v15,
//       requantize, add output_zero_point, narrow to 8 bits and clamp.
//   5:..9: partial-width store when fewer than 16 columns remain.
//
// v8 and v12-v15 have callee-saved low halves, so d8 and d12-d15 are spilled
// to a 48-byte stack frame and restored before both RETs.
59BEGIN_FUNCTION xnn_qu8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld128
60
61 # Clamp A and C pointers
62 CMP x0, 2 // if mr < 2
63 LDP x12, x11, [sp] // cn_stride, params
64 ADD x2, x2, 3 // kc = (kc + 3) & ~3
65 ADD x15, x3, x4 // a1 = a0 + a_stride
66 ADD x8, x6, x7 // c1 = c0 + cm_stride
67
Frank Barchard0c764222021-08-24 16:13:06 -070068 # Save d8,d12-d15 on stack
 // Frame layout after the pre-indexed store: [sp] = d8, [sp, 16] = d12/d13,
 // [sp, 32] = d14/d15. The stack arguments were already read by the LDP above.
69 STR d8, [sp, -48]!
Frank Barchard40668982021-08-24 11:12:04 -070070 CSEL x15, x3, x15, LO // a1 = a0
71 CSEL x8, x6, x8, LO // c1 = c0
72 BIC x2, x2, 3
73
74 STP d12, d13, [sp, 16]
75 ADD x13, x15, x4 // a2 = a1 + a_stride
76 ADD x9, x8, x7 // c2 = c1 + cm_stride
 // The flags are still those of CMP x0, 2 above: none of the intervening
 // instructions set flags.
77 // if mr <= 2
78 CSEL x13, x15, x13, LS // a2 = a1
79 CSEL x9, x8, x9, LS // c2 = c1
80
81 STP d14, d15, [sp, 32]
82 CMP x0, 4 // if mr < 4
83 ADD x4, x13, x4 // a3 = a2 + a_stride
84 ADD x7, x9, x7 // c3 = c2 + cm_stride
85
 // Broadcast the 4 kernel_zero_point bytes to every 32-bit lane of v8 and
 // advance x11 past them; x11 stays at this position between column tiles.
86 LD1R {v8.4s}, [x11], 4 // kernel_zero_point
87
88 CSEL x4, x13, x4, LO // a3 = a2
89 CSEL x7, x9, x7, LO // c3 = c2
90
 // Outer loop: one iteration per tile of 16 output columns.
91 .p2align 3
920:
93 # Load initial bias from w into accumulators
94 LDP q16, q20, [x5], 32
95
 // v12-v15 accumulate, per A row, the dot product of the A bytes with the
 // kernel zero point bytes in v8.
96 MOVI v12.4s, 0
97 MOVI v13.4s, 0
98 MOVI v14.4s, 0
99 MOVI v15.4s, 0
100
101 MOV v17.16b, v16.16b
102 MOV v18.16b, v16.16b
103 LDP q24, q28, [x5], 32
104 MOV v19.16b, v16.16b
105 MOV v21.16b, v20.16b
106 SUBS x0, x2, 16 // k = kc - 16
107 MOV v22.16b, v20.16b
108 MOV v23.16b, v20.16b
109 MOV v25.16b, v24.16b
110 MOV v26.16b, v24.16b
111 MOV v27.16b, v24.16b
112 MOV v29.16b, v28.16b
113 MOV v30.16b, v28.16b
114 MOV v31.16b, v28.16b
115
116 # Is there at least 16 bytes?
117 B.LO 3f
118
119 # Main loop - 16 bytes of A
 // Each iteration consumes 16 bytes from each of the 4 A rows and 256 bytes
 // of weights (4 K-groups x 16 columns x 4 bytes). Weight loads are
 // interleaved with the UDOTs that use the previously loaded registers.
120 .p2align 3
1211:
122 LDR q0, [x3], 16
123 LDR q4, [x5], 16
124 LDR q1, [x15], 16
125 LDR q2, [x13], 16
126 LDR q3, [x4], 16
127 LDR q5, [x5], 16
128
129 UDOT v12.4s, v8.16b, v0.16b // update zero point
130 UDOT v13.4s, v8.16b, v1.16b
131 UDOT v14.4s, v8.16b, v2.16b
132 UDOT v15.4s, v8.16b, v3.16b
133
134 UDOT v16.4s, v4.16b, v0.4b[0]
135 UDOT v17.4s, v4.16b, v1.4b[0]
136 LDP q6, q7, [x5], 32
137 UDOT v18.4s, v4.16b, v2.4b[0]
138 UDOT v19.4s, v4.16b, v3.4b[0]
139 UDOT v20.4s, v5.16b, v0.4b[0]
140 UDOT v21.4s, v5.16b, v1.4b[0]
141 UDOT v22.4s, v5.16b, v2.4b[0]
142 UDOT v23.4s, v5.16b, v3.4b[0]
143 UDOT v24.4s, v6.16b, v0.4b[0]
144 UDOT v25.4s, v6.16b, v1.4b[0]
145 LDP q4, q5, [x5], 32
146 UDOT v26.4s, v6.16b, v2.4b[0]
147 UDOT v27.4s, v6.16b, v3.4b[0]
148 UDOT v28.4s, v7.16b, v0.4b[0]
149 UDOT v29.4s, v7.16b, v1.4b[0]
150 UDOT v30.4s, v7.16b, v2.4b[0]
151 UDOT v31.4s, v7.16b, v3.4b[0]
152
153 UDOT v16.4s, v4.16b, v0.4b[1]
154 UDOT v17.4s, v4.16b, v1.4b[1]
155 LDP q6, q7, [x5], 32
156 UDOT v18.4s, v4.16b, v2.4b[1]
157 UDOT v19.4s, v4.16b, v3.4b[1]
158 UDOT v20.4s, v5.16b, v0.4b[1]
159 UDOT v21.4s, v5.16b, v1.4b[1]
160 UDOT v22.4s, v5.16b, v2.4b[1]
161 UDOT v23.4s, v5.16b, v3.4b[1]
162 UDOT v24.4s, v6.16b, v0.4b[1]
163 UDOT v25.4s, v6.16b, v1.4b[1]
164 LDP q4, q5, [x5], 32
165 UDOT v26.4s, v6.16b, v2.4b[1]
166 UDOT v27.4s, v6.16b, v3.4b[1]
167 UDOT v28.4s, v7.16b, v0.4b[1]
168 UDOT v29.4s, v7.16b, v1.4b[1]
169 UDOT v30.4s, v7.16b, v2.4b[1]
170 UDOT v31.4s, v7.16b, v3.4b[1]
171
172 UDOT v16.4s, v4.16b, v0.4b[2]
173 UDOT v17.4s, v4.16b, v1.4b[2]
174 LDP q6, q7, [x5], 32
175 UDOT v18.4s, v4.16b, v2.4b[2]
176 UDOT v19.4s, v4.16b, v3.4b[2]
177 UDOT v20.4s, v5.16b, v0.4b[2]
178 UDOT v21.4s, v5.16b, v1.4b[2]
179 UDOT v22.4s, v5.16b, v2.4b[2]
180 UDOT v23.4s, v5.16b, v3.4b[2]
181 UDOT v24.4s, v6.16b, v0.4b[2]
182 UDOT v25.4s, v6.16b, v1.4b[2]
183 LDP q4, q5, [x5], 32
184 UDOT v26.4s, v6.16b, v2.4b[2]
185 UDOT v27.4s, v6.16b, v3.4b[2]
186 UDOT v28.4s, v7.16b, v0.4b[2]
187 UDOT v29.4s, v7.16b, v1.4b[2]
188 UDOT v30.4s, v7.16b, v2.4b[2]
189 UDOT v31.4s, v7.16b, v3.4b[2]
190
191 UDOT v16.4s, v4.16b, v0.4b[3]
192 UDOT v17.4s, v4.16b, v1.4b[3]
193 LDP q6, q7, [x5], 32
194 UDOT v18.4s, v4.16b, v2.4b[3]
195 UDOT v19.4s, v4.16b, v3.4b[3]
196 UDOT v20.4s, v5.16b, v0.4b[3]
197 UDOT v21.4s, v5.16b, v1.4b[3]
198 UDOT v22.4s, v5.16b, v2.4b[3]
199 UDOT v23.4s, v5.16b, v3.4b[3]
200 UDOT v24.4s, v6.16b, v0.4b[3]
201 UDOT v25.4s, v6.16b, v1.4b[3]
202 UDOT v26.4s, v6.16b, v2.4b[3]
203 UDOT v27.4s, v6.16b, v3.4b[3]
204 SUBS x0, x0, 16 // k -= 16; flags are consumed by B.HS below
205 UDOT v28.4s, v7.16b, v0.4b[3]
206 UDOT v29.4s, v7.16b, v1.4b[3]
207 UDOT v30.4s, v7.16b, v2.4b[3]
208 UDOT v31.4s, v7.16b, v3.4b[3]
209 B.HS 1b
210
211 # Is there a remainder?- 4 to 12 bytes of A
 // x0 has gone below zero here, but subtracting multiples of 16 leaves its
 // low 4 bits equal to kc & 15, so bits 3 and 2 select the 8- and 4-byte
 // remainder steps.
212 TST x0, 15
213 B.NE 3f
214
 // Horizontal reduction: two pairwise adds leave the sum of all four lanes
 // in every lane of v12-v15, i.e. one zero-point correction per A row.
2152:
216 ADDP v0.4s, v12.4s, v12.4s
217 ADDP v1.4s, v13.4s, v13.4s
218 ADDP v2.4s, v14.4s, v14.4s
219 ADDP v3.4s, v15.4s, v15.4s
220 ADDP v12.4s, v0.4s, v0.4s
221 ADDP v13.4s, v1.4s, v1.4s
222 ADDP v14.4s, v2.4s, v2.4s
223 ADDP v15.4s, v3.4s, v3.4s
224
225 # Subtract zero point from accumulators
226 SUB v16.4s, v16.4s, v12.4s
227 SUB v17.4s, v17.4s, v13.4s
228 SUB v18.4s, v18.4s, v14.4s
229 SUB v19.4s, v19.4s, v15.4s
230 SUB v20.4s, v20.4s, v12.4s
231 SUB v21.4s, v21.4s, v13.4s
232 SUB v22.4s, v22.4s, v14.4s
233 SUB v23.4s, v23.4s, v15.4s
234 SUB v24.4s, v24.4s, v12.4s
235 SUB v25.4s, v25.4s, v13.4s
236 SUB v26.4s, v26.4s, v14.4s
237 SUB v27.4s, v27.4s, v15.4s
238 SUB v28.4s, v28.4s, v12.4s
239 SUB v29.4s, v29.4s, v13.4s
240 SUB v30.4s, v30.4s, v14.4s
241 SUB v31.4s, v31.4s, v15.4s
242
Marat Dukhan89991902021-12-06 00:54:36 -0800243 $if REQUANTIZATION == "RNDNU":
Frank Barchard40668982021-08-24 11:12:04 -0700244 # Apply params - preshift, scale, postshift, bias and clamp
245 LD1R {v4.4s}, [x11], 4
246 SSHL v16.4s, v16.4s, v4.4s // pre-shift by right_pre_shift; SSHL shifts right for a negative count - presumably the field is stored negated, confirm against params init
247 SSHL v17.4s, v17.4s, v4.4s
248 SSHL v18.4s, v18.4s, v4.4s
249 SSHL v19.4s, v19.4s, v4.4s
250 SSHL v20.4s, v20.4s, v4.4s
251 SSHL v21.4s, v21.4s, v4.4s
252 SSHL v22.4s, v22.4s, v4.4s
253 SSHL v23.4s, v23.4s, v4.4s
254 LD1R {v5.4s}, [x11], 4 // multiplier
255 SSHL v24.4s, v24.4s, v4.4s
256 SSHL v25.4s, v25.4s, v4.4s
257 SSHL v26.4s, v26.4s, v4.4s
258 SSHL v27.4s, v27.4s, v4.4s
259 SSHL v28.4s, v28.4s, v4.4s
260 SSHL v29.4s, v29.4s, v4.4s
261 SSHL v30.4s, v30.4s, v4.4s
262 SSHL v31.4s, v31.4s, v4.4s
263 LD1R {v6.4s}, [x11], 4 // right_post_shift
264 SQDMULH v16.4s, v16.4s, v5.4s // scale without rounding
265 SQDMULH v17.4s, v17.4s, v5.4s
266 SQDMULH v18.4s, v18.4s, v5.4s
267 SQDMULH v19.4s, v19.4s, v5.4s
268 SQDMULH v20.4s, v20.4s, v5.4s
269 SQDMULH v21.4s, v21.4s, v5.4s
270 SQDMULH v22.4s, v22.4s, v5.4s
271 SQDMULH v23.4s, v23.4s, v5.4s
272 SQDMULH v24.4s, v24.4s, v5.4s
273 SQDMULH v25.4s, v25.4s, v5.4s
274 SQDMULH v26.4s, v26.4s, v5.4s
275 SQDMULH v27.4s, v27.4s, v5.4s
276 SQDMULH v28.4s, v28.4s, v5.4s
277 SQDMULH v29.4s, v29.4s, v5.4s
278 SQDMULH v30.4s, v30.4s, v5.4s
279 SQDMULH v31.4s, v31.4s, v5.4s
280 SRSHL v16.4s, v16.4s, v6.4s // signed rounding shift left
281 SRSHL v17.4s, v17.4s, v6.4s
282 SRSHL v18.4s, v18.4s, v6.4s
283 SRSHL v19.4s, v19.4s, v6.4s
284 SRSHL v20.4s, v20.4s, v6.4s
285 SRSHL v21.4s, v21.4s, v6.4s
286 SRSHL v22.4s, v22.4s, v6.4s
287 SRSHL v23.4s, v23.4s, v6.4s
288 SRSHL v24.4s, v24.4s, v6.4s
289 SRSHL v25.4s, v25.4s, v6.4s
290 SRSHL v26.4s, v26.4s, v6.4s
291 SRSHL v27.4s, v27.4s, v6.4s
292 SRSHL v28.4s, v28.4s, v6.4s
293 SRSHL v29.4s, v29.4s, v6.4s
294 SRSHL v30.4s, v30.4s, v6.4s
295 SRSHL v31.4s, v31.4s, v6.4s
296 $elif REQUANTIZATION == "FP32":
297 SCVTF v16.4s, v16.4s // int32 accumulators -> float
298 SCVTF v17.4s, v17.4s
Frank Barchard0c764222021-08-24 16:13:06 -0700299 # Apply params - scale, bias and clamp
300 LD1R {v4.4s}, [x11], 4 // scale
301 SCVTF v18.4s, v18.4s
302 SCVTF v19.4s, v19.4s
Frank Barchard40668982021-08-24 11:12:04 -0700303 SCVTF v20.4s, v20.4s
304 SCVTF v21.4s, v21.4s
305 SCVTF v22.4s, v22.4s
306 SCVTF v23.4s, v23.4s
307 SCVTF v24.4s, v24.4s
308 SCVTF v25.4s, v25.4s
309 SCVTF v26.4s, v26.4s
310 SCVTF v27.4s, v27.4s
311 SCVTF v28.4s, v28.4s
312 SCVTF v29.4s, v29.4s
313 SCVTF v30.4s, v30.4s
314 SCVTF v31.4s, v31.4s
315
Frank Barchard0c764222021-08-24 16:13:06 -0700316 FMUL v16.4s, v16.4s, v4.4s // multiply by scale
317 FMUL v17.4s, v17.4s, v4.4s
318 FMUL v18.4s, v18.4s, v4.4s
319 FMUL v19.4s, v19.4s, v4.4s
320 FMUL v20.4s, v20.4s, v4.4s
321 FMUL v21.4s, v21.4s, v4.4s
322 FMUL v22.4s, v22.4s, v4.4s
323 FMUL v23.4s, v23.4s, v4.4s
324 FMUL v24.4s, v24.4s, v4.4s
325 FMUL v25.4s, v25.4s, v4.4s
326 FMUL v26.4s, v26.4s, v4.4s
327 FMUL v27.4s, v27.4s, v4.4s
328 FMUL v28.4s, v28.4s, v4.4s
329 FMUL v29.4s, v29.4s, v4.4s
330 FMUL v30.4s, v30.4s, v4.4s
331 FMUL v31.4s, v31.4s, v4.4s
Frank Barchard40668982021-08-24 11:12:04 -0700332
333 FCVTNS v16.4s, v16.4s // float -> int32, round to nearest, ties to even
334 FCVTNS v17.4s, v17.4s
335 FCVTNS v18.4s, v18.4s
336 FCVTNS v19.4s, v19.4s
337 FCVTNS v20.4s, v20.4s
338 FCVTNS v21.4s, v21.4s
339 FCVTNS v22.4s, v22.4s
340 FCVTNS v23.4s, v23.4s
341 FCVTNS v24.4s, v24.4s
342 FCVTNS v25.4s, v25.4s
343 FCVTNS v26.4s, v26.4s
344 FCVTNS v27.4s, v27.4s
345 FCVTNS v28.4s, v28.4s
346 FCVTNS v29.4s, v29.4s
347 FCVTNS v30.4s, v30.4s
348 FCVTNS v31.4s, v31.4s
349
350 SQXTN v16.4h, v16.4s // saturating narrow int32 -> int16, columns 0-3
351 SQXTN v17.4h, v17.4s
352 SQXTN v18.4h, v18.4s
353 SQXTN v19.4h, v19.4s
354 SQXTN v24.4h, v24.4s // columns 8-11
355 SQXTN v25.4h, v25.4s
356 SQXTN v26.4h, v26.4s
357 SQXTN v27.4h, v27.4s
358 LD1R {v6.8h}, [x11], 2 // output_zero_point, added by SQADD below
359
360 SQXTN2 v16.8h, v20.4s // columns 4-7 into the upper half
361 SQXTN2 v17.8h, v21.4s
362 SQXTN2 v18.8h, v22.4s
363 SQXTN2 v19.8h, v23.4s
364 SQXTN2 v24.8h, v28.4s // columns 12-15 into the upper half
365 SQXTN2 v25.8h, v29.4s
366 SQXTN2 v26.8h, v30.4s
367 SQXTN2 v27.8h, v31.4s
368
369 SQADD v16.8h, v16.8h, v6.8h
370 SQADD v17.8h, v17.8h, v6.8h
371 SQADD v18.8h, v18.8h, v6.8h
372 SQADD v19.8h, v19.8h, v6.8h
373 SQADD v24.8h, v24.8h, v6.8h
374 SQADD v25.8h, v25.8h, v6.8h
375 SQADD v26.8h, v26.8h, v6.8h
376 SQADD v27.8h, v27.8h, v6.8h
377 LD1R {v4.16b}, [x11], 1 // clamp min value
378
 // Saturating narrow int16 -> unsigned 8-bit; v0..v3 end up holding the 16
 // output bytes of rows 0..3.
379 SQXTUN v0.8b, v16.8h
380 SQXTUN v1.8b, v17.8h
381 SQXTUN v2.8b, v18.8h
382 SQXTUN v3.8b, v19.8h
383 LD1R {v5.16b}, [x11] // clamp max value
384 SQXTUN2 v0.16b, v24.8h
385 SQXTUN2 v1.16b, v25.8h
386 SQXTUN2 v2.16b, v26.8h
387 SQXTUN2 v3.16b, v27.8h
388
 // Undo the post-increments applied since the kernel_zero_point load so that
 // x11 again points just past kernel_zero_point for the next column tile:
 // RNDNU advanced 4 + 4 + 4 + 2 + 1 = 15 bytes, FP32 advanced 4 + 2 + 1 = 7.
389 SUB x11, x11, ${REWIND_DECREMENT} // rewind params pointer
390
391 UMAX v0.16b, v0.16b, v4.16b
392 UMAX v1.16b, v1.16b, v4.16b
393 UMAX v2.16b, v2.16b, v4.16b
394 UMAX v3.16b, v3.16b, v4.16b
395 SUBS x1, x1, 16 // nc -= 16; flags feed both B.LO 5f and B.NE 0b
396 UMIN v0.16b, v0.16b, v5.16b
397 UMIN v1.16b, v1.16b, v5.16b
398 UMIN v2.16b, v2.16b, v5.16b
399 UMIN v3.16b, v3.16b, v5.16b
400 B.LO 5f
401
402 # Store full 4 x 16
 // Each C pointer advances by cn_stride (x12); each A pointer is moved back
 // by the rounded-up kc so the next tile re-reads the same rows.
403 ST1 {v0.16b}, [x6], x12
404 SUB x3, x3, x2 // a0 -= kc
405 ST1 {v1.16b}, [x8], x12
406 SUB x15, x15, x2 // a1 -= kc
407 ST1 {v2.16b}, [x9], x12
408 SUB x13, x13, x2 // a2 -= kc
409 ST1 {v3.16b}, [x7], x12
410 SUB x4, x4, x2 // a3 -= kc
 // ST1 and SUB do not set flags, so this still tests the SUBS x1 result:
 // loop while columns remain.
411 B.NE 0b
412
Frank Barchard0c764222021-08-24 16:13:06 -0700413 # Restore d8,d12-d15 from stack
Frank Barchard40668982021-08-24 11:12:04 -0700414 LDP d14, d15, [sp, 32]
415 LDP d12, d13, [sp, 16]
Frank Barchard0c764222021-08-24 16:13:06 -0700416 LDR d8, [sp], 48
Frank Barchard40668982021-08-24 11:12:04 -0700417 RET
418
419 # Remainder- 8 bytes of A
420 .p2align 3
4213:
422 # Is there a remainder?- 8 bytes of A
423 TBZ x0, 3, 4f
424
 // LDR d writes zeros to the upper 64 bits of the vector register, so the
 // full-width zero-point UDOTs below only accumulate the 8 loaded bytes.
425 LDR d0, [x3], 8
426 LDR q4, [x5], 16
427 LDR d1, [x15], 8
428 LDR d2, [x13], 8
429 LDR d3, [x4], 8
430 LDR q5, [x5], 16
431
432 UDOT v12.4s, v8.16b, v0.16b // update zero point
433 UDOT v13.4s, v8.16b, v1.16b
434 UDOT v14.4s, v8.16b, v2.16b
435 UDOT v15.4s, v8.16b, v3.16b
436
437 UDOT v16.4s, v4.16b, v0.4b[0]
438 UDOT v17.4s, v4.16b, v1.4b[0]
439 LDP q6, q7, [x5], 32
440 UDOT v18.4s, v4.16b, v2.4b[0]
441 UDOT v19.4s, v4.16b, v3.4b[0]
442 UDOT v20.4s, v5.16b, v0.4b[0]
443 UDOT v21.4s, v5.16b, v1.4b[0]
444 UDOT v22.4s, v5.16b, v2.4b[0]
445 UDOT v23.4s, v5.16b, v3.4b[0]
446 UDOT v24.4s, v6.16b, v0.4b[0]
447 UDOT v25.4s, v6.16b, v1.4b[0]
448 LDP q4, q5, [x5], 32
449 UDOT v26.4s, v6.16b, v2.4b[0]
450 UDOT v27.4s, v6.16b, v3.4b[0]
451 UDOT v28.4s, v7.16b, v0.4b[0]
452 UDOT v29.4s, v7.16b, v1.4b[0]
453 UDOT v30.4s, v7.16b, v2.4b[0]
454 UDOT v31.4s, v7.16b, v3.4b[0]
455 UDOT v16.4s, v4.16b, v0.4b[1]
456 UDOT v17.4s, v4.16b, v1.4b[1]
457 LDP q6, q7, [x5], 32
458 UDOT v18.4s, v4.16b, v2.4b[1]
459 UDOT v19.4s, v4.16b, v3.4b[1]
460 UDOT v20.4s, v5.16b, v0.4b[1]
461 UDOT v21.4s, v5.16b, v1.4b[1]
462 UDOT v22.4s, v5.16b, v2.4b[1]
463 UDOT v23.4s, v5.16b, v3.4b[1]
464 UDOT v24.4s, v6.16b, v0.4b[1]
465 UDOT v25.4s, v6.16b, v1.4b[1]
466 UDOT v26.4s, v6.16b, v2.4b[1]
467 UDOT v27.4s, v6.16b, v3.4b[1]
468 UDOT v28.4s, v7.16b, v0.4b[1]
469 UDOT v29.4s, v7.16b, v1.4b[1]
470 UDOT v30.4s, v7.16b, v2.4b[1]
471 UDOT v31.4s, v7.16b, v3.4b[1]
Frank Barchard40668982021-08-24 11:12:04 -0700472 # Is there a remainder?- 4 bytes of A
473 TBZ x0, 2, 2b
474
Frank Barchard6b30b732021-08-27 12:17:04 -0700475 # Remainder- 4 bytes of A
 // LDR s zero-fills the rest of the vector register, so only the 4 loaded
 // bytes contribute to the zero-point sums.
4764:
Frank Barchard40668982021-08-24 11:12:04 -0700477 LDR s0, [x3], 4
478 LDR q4, [x5], 16
479 LDR s1, [x15], 4
480 LDR s2, [x13], 4
481 LDR s3, [x4], 4
482 LDR q5, [x5], 16
483
484 UDOT v12.4s, v8.16b, v0.16b // update zero point
485 UDOT v13.4s, v8.16b, v1.16b
486 UDOT v14.4s, v8.16b, v2.16b
487 UDOT v15.4s, v8.16b, v3.16b
488
489 UDOT v16.4s, v4.16b, v0.4b[0]
490 UDOT v17.4s, v4.16b, v1.4b[0]
491 UDOT v18.4s, v4.16b, v2.4b[0]
492 UDOT v19.4s, v4.16b, v3.4b[0]
493 LDP q6, q7, [x5], 32
494 UDOT v20.4s, v5.16b, v0.4b[0]
495 UDOT v21.4s, v5.16b, v1.4b[0]
496 UDOT v22.4s, v5.16b, v2.4b[0]
497 UDOT v23.4s, v5.16b, v3.4b[0]
498 UDOT v24.4s, v6.16b, v0.4b[0]
499 UDOT v25.4s, v6.16b, v1.4b[0]
500 UDOT v26.4s, v6.16b, v2.4b[0]
501 UDOT v27.4s, v6.16b, v3.4b[0]
502 UDOT v28.4s, v7.16b, v0.4b[0]
503 UDOT v29.4s, v7.16b, v1.4b[0]
504 UDOT v30.4s, v7.16b, v2.4b[0]
505 UDOT v31.4s, v7.16b, v3.4b[0]
506 B 2b
507
508 # Store odd width
 // Reached when fewer than 16 columns remained. x1 is below zero after the
 // SUBS, but its low 4 bits still equal the remaining column count, so bits
 // 3..0 select stores of 8, 4, 2 and 1 bytes. After each partial store the
 // not-yet-stored bytes are moved down to the bottom of v0..v3.
509 .p2align 3
5105:
511 TBZ x1, 3, 6f
512 STR d0, [x6], 8
Frank Barchard40668982021-08-24 11:12:04 -0700513 STR d1, [x8], 8
Frank Barchardb7a7c302021-09-01 16:24:25 -0700514 DUP d0, v0.d[1]
Frank Barchard40668982021-08-24 11:12:04 -0700515 DUP d1, v1.d[1]
516 STR d2, [x9], 8
Frank Barchard40668982021-08-24 11:12:04 -0700517 STR d3, [x7], 8
Frank Barchardb7a7c302021-09-01 16:24:25 -0700518 DUP d2, v2.d[1]
Frank Barchard40668982021-08-24 11:12:04 -0700519 DUP d3, v3.d[1]
5206:
521 TBZ x1, 2, 7f
522 STR s0, [x6], 4
Frank Barchard40668982021-08-24 11:12:04 -0700523 STR s1, [x8], 4
Frank Barchardb7a7c302021-09-01 16:24:25 -0700524 DUP s0, v0.s[1]
Frank Barchard40668982021-08-24 11:12:04 -0700525 DUP s1, v1.s[1]
526 STR s2, [x9], 4
Frank Barchard40668982021-08-24 11:12:04 -0700527 STR s3, [x7], 4
Frank Barchardb7a7c302021-09-01 16:24:25 -0700528 DUP s2, v2.s[1]
Frank Barchard40668982021-08-24 11:12:04 -0700529 DUP s3, v3.s[1]
5307:
531 TBZ x1, 1, 8f
Frank Barchard29833fd2021-09-01 00:36:02 -0700532 STR h0, [x6], 2
Frank Barchard29833fd2021-09-01 00:36:02 -0700533 STR h1, [x8], 2
Frank Barchardb7a7c302021-09-01 16:24:25 -0700534 DUP h0, v0.h[1]
Frank Barchard40668982021-08-24 11:12:04 -0700535 DUP h1, v1.h[1]
Frank Barchard29833fd2021-09-01 00:36:02 -0700536 STR h2, [x9], 2
Frank Barchard29833fd2021-09-01 00:36:02 -0700537 STR h3, [x7], 2
Frank Barchardb7a7c302021-09-01 16:24:25 -0700538 DUP h2, v2.h[1]
Frank Barchard40668982021-08-24 11:12:04 -0700539 DUP h3, v3.h[1]
5408:
541 TBZ x1, 0, 9f
Frank Barchard29833fd2021-09-01 00:36:02 -0700542 STR b0, [x6]
543 STR b1, [x8]
544 STR b2, [x9]
545 STR b3, [x7]
Frank Barchard40668982021-08-24 11:12:04 -07005469:
Frank Barchard0c764222021-08-24 16:13:06 -0700547 # Restore d8,d12-d15 from stack
Frank Barchard40668982021-08-24 11:12:04 -0700548 LDP d14, d15, [sp, 32]
549 LDP d12, d13, [sp, 16]
Frank Barchard0c764222021-08-24 16:13:06 -0700550 LDR d8, [sp], 48
Frank Barchard40668982021-08-24 11:12:04 -0700551 RET
552
553END_FUNCTION xnn_qu8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld128
554
555#ifdef __ELF__
556.section ".note.GNU-stack","",%progbits
557#endif