// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// void xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55(
//     size_t mr,                 x0
//     size_t nc,                 x1
//     size_t kc,                 x2 / x0
//     const uint8_t*restrict a,  x3
//     size_t a_stride,           x4
//     const void*restrict w,     x5
//     uint8_t*restrict c,        x6
//     size_t cm_stride,          x7
//     size_t cn_stride,          [sp] -> (x0), reloaded before each full store
//     const float*restrict acc,  [sp + 8] -> x15
//     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 16] -> x8
//
// Computes a 6x8 tile of C per pass: the accumulators are initialised from
// acc, updated with A*B over kc bytes of each A row, clamped to the two
// values loaded from params, and stored. Loops over nc in steps of 8 columns.
//
// NOTE: this file is generated; the register-allocation change described
// below (x18 no longer used) should also be applied to the template.

// d8-d15 need to be preserved if used.
// x19-30 need to be preserved if used.
// x18 is the platform register and is deliberately NOT used.

// A pointers
//  x3 a0
//  x9 a1
// x10 a2
// x11 a3
// x12 a4
//  x4 a5

// C pointers
//  x6 c0
// x16 c1
// x17 c2
// x14 c3
// x13 c4
//  x7 c5

// x8 temporary vector shadow register
// x0 k counter inside the K loop; cn_stride while storing a full tile

// Stack layout after the d12-d15 spill (sp lowered by 32 bytes):
//   [sp +  0] d12, d13
//   [sp + 16] d14, d15
//   [sp + 32] cn_stride
//   [sp + 40] acc
//   [sp + 48] params

// Vector register usage
// A0   v0     v3
// A1   v0[1]  v3[1]
// A2   v1     v4
// A3   v1[1]  v4[1]
// A4   v2     v5
// A5   v2[1]  v5[1]
// B   v12 v13 v14 v15 second set of B
// B   v16 v17 v18 v19 first set
// C   v20 v21
// C   v22 v23
// C   v24 v25
// C   v26 v27
// C   v28 v29
// C   v30 v31
// Clamp v6 v7
// unused A   v8 v9 v10 v11

BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55

        // Clamp A and C pointers: rows at or beyond mr alias the previous row
        CMP     x0, 2                   // if mr < 2
        ADD     x9, x3, x4              // a1 = a0 + a_stride
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO          //   a1 = a0
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x10, x9, x4             // a2 = a1 + a_stride
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x10, x9, x10, LS        //   a2 = a1
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x11, x10, x4            // a3 = a2 + a_stride
        ADD     x14, x17, x7            // c3 = c2 + cm_stride
        CSEL    x11, x10, x11, LO       //   a3 = a2
        CSEL    x14, x17, x14, LO       //   c3 = c2

        ADD     x12, x11, x4            // a4 = a3 + a_stride
        ADD     x13, x14, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4 (flags still from CMP x0, 4)
        CSEL    x12, x11, x12, LS       //   a4 = a3
        CSEL    x13, x14, x13, LS       //   c4 = c3

        // Load acc, params pointer
        LDP     x15, x8, [sp, 8]

        CMP     x0, 6                   // if mr < 6
        ADD     x4, x12, x4             // a5 = a4 + a_stride
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x4, x12, x4, LO         //   a5 = a4
        CSEL    x7, x13, x7, LO         //   c5 = c4

        // Load min/max values: v6 is applied with FMAX, v7 with FMIN
        LD2R    {v6.4s, v7.4s}, [x8]

        // Save d12-d15 on stack (callee-saved halves of v12-v15)
        STP     d12, d13, [sp, -32]!
        STP     d14, d15, [sp, 16]

0:
        // Load initial accumulators
        LDP     q20, q21, [x15], 32
        LDP     q22, q23, [x15], 32
        LDP     q24, q25, [x15], 32
        LDP     q26, q27, [x15], 32
        LDP     q28, q29, [x15], 32
        LDP     q30, q31, [x15], 32
        SUBS    x0, x2, 16              // k = kc - 16
        PRFM    PLDL1KEEP, [x3, 0]      // Prefetch A
        PRFM    PLDL1KEEP, [x3, 64]
        PRFM    PLDL1KEEP, [x9, 0]
        PRFM    PLDL1KEEP, [x9, 64]
        PRFM    PLDL1KEEP, [x10, 0]
        PRFM    PLDL1KEEP, [x10, 64]
        PRFM    PLDL1KEEP, [x11, 0]
        PRFM    PLDL1KEEP, [x11, 64]
        PRFM    PLDL1KEEP, [x12, 0]
        PRFM    PLDL1KEEP, [x12, 64]
        PRFM    PLDL1KEEP, [x4, 0]
        PRFM    PLDL1KEEP, [x4, 64]
        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
        PRFM    PLDL1KEEP, [x5, 64]
        PRFM    PLDL1KEEP, [x5, 128]
        PRFM    PLDL1KEEP, [x5, 192]
        PRFM    PLDL1KEEP, [x5, 256]
        PRFM    PLDL1KEEP, [x5, 320]

        // Is there at least 4 floats (16 bytes) for prologue + epilogue?
        // (PRFM leaves the flags from SUBS x0, x2, 16 intact.)
        B.LO    5f

        // Prologue - First group loads, no FMA
        LDR     d0, [x3], 8             // a0
        LDP     q16, q17, [x5], 32      // b
        LDR     d1, [x10], 8            // a2
        LDR     d2, [x12], 8            // a4
        LD1     {v0.d}[1], [x9], 8      // a1
        LD1     {v1.d}[1], [x11], 8     // a3
        LD1     {v2.d}[1], [x4], 8      // a5
        SUBS    x0, x0, 16
        LDR     q18, [x5], 16
        LDR     d19, [x5], 8
        LDR     x8, [x5], 8             // upper half of v19; ins is in BLOCK 0

        // Is there at least 4 floats (16 bytes) for main loop?
        B.LO    2f

        // Main loop - 4 floats of A (16 bytes)
        // 48 FMA + 12 LD64 A + 8 LDR B
1:
        // First group of 24 FMA, Second group loads
        // BLOCK 0
        FMLA    v20.4s, v16.4s, v0.s[0]
        LDR     d3, [x3], 8             // a0
        FMLA    v22.4s, v16.4s, v0.s[2]
        INS     v19.d[1], x8            // b from second group
        FMLA    v24.4s, v16.4s, v1.s[0]
        LDR     x8, [x9], 8             // a1

        // BLOCK 1
        FMLA    v26.4s, v16.4s, v1.s[2]
        LDR     d12, [x5]
        FMLA    v28.4s, v16.4s, v2.s[0]
        INS     v3.d[1], x8             // a1 ins
        FMLA    v30.4s, v16.4s, v2.s[2]
        LDR     x8, [x5, 8]             // b

        // BLOCK 2
        FMLA    v21.4s, v17.4s, v0.s[0]
        LDR     d4, [x10], 8            // a2
        FMLA    v23.4s, v17.4s, v0.s[2]
        INS     v12.d[1], x8            // b ins
        FMLA    v25.4s, v17.4s, v1.s[0]
        LDR     x8, [x11], 8            // a3

        // BLOCK 3
        FMLA    v27.4s, v17.4s, v1.s[2]
        LDR     d5, [x12], 8            // a4
        FMLA    v29.4s, v17.4s, v2.s[0]
        INS     v4.d[1], x8             // a3 ins
        FMLA    v31.4s, v17.4s, v2.s[2]
        LDR     x8, [x4], 8             // a5

        // BLOCK 4
        FMLA    v20.4s, v18.4s, v0.s[1]
        LDR     d13, [x5, 16]
        FMLA    v22.4s, v18.4s, v0.s[3]
        INS     v5.d[1], x8             // a5 ins
        FMLA    v24.4s, v18.4s, v1.s[1]
        LDR     x8, [x5, 24]

        // BLOCK 5
        FMLA    v26.4s, v18.4s, v1.s[3]
        LDR     d14, [x5, 32]
        FMLA    v28.4s, v18.4s, v2.s[1]
        INS     v13.d[1], x8            // b
        FMLA    v30.4s, v18.4s, v2.s[3]
        LDR     x8, [x5, 40]

        // BLOCK 6
        FMLA    v21.4s, v19.4s, v0.s[1]
        LDR     d15, [x5, 48]
        FMLA    v23.4s, v19.4s, v0.s[3]
        INS     v14.d[1], x8            // b
        FMLA    v25.4s, v19.4s, v1.s[1]
        LDR     x8, [x5, 56]

        // BLOCK 7
        FMLA    v27.4s, v19.4s, v1.s[3]
        FMLA    v29.4s, v19.4s, v2.s[1]
        INS     v15.d[1], x8
        FMLA    v31.4s, v19.4s, v2.s[3]

        // Second group of 24 FMA, First group of loads
        // BLOCK 0
        FMLA    v20.4s, v12.4s, v3.s[0]
        LDR     d0, [x3], 8             // a0
        FMLA    v22.4s, v12.4s, v3.s[2]
        FMLA    v24.4s, v12.4s, v4.s[0]
        LDR     x8, [x9], 8             // a1

        // BLOCK 1
        FMLA    v26.4s, v12.4s, v4.s[2]
        LDR     d16, [x5, 64]
        FMLA    v28.4s, v12.4s, v5.s[0]
        INS     v0.d[1], x8             // a1 ins
        FMLA    v30.4s, v12.4s, v5.s[2]
        LDR     x8, [x5, 72]            // b

        // BLOCK 2
        FMLA    v21.4s, v13.4s, v3.s[0]
        LDR     d1, [x10], 8            // a2
        FMLA    v23.4s, v13.4s, v3.s[2]
        INS     v16.d[1], x8            // b
        FMLA    v25.4s, v13.4s, v4.s[0]
        LDR     x8, [x11], 8            // a3

        // BLOCK 3
        FMLA    v27.4s, v13.4s, v4.s[2]
        LDR     d2, [x12], 8            // a4
        FMLA    v29.4s, v13.4s, v5.s[0]
        INS     v1.d[1], x8             // a3 ins
        FMLA    v31.4s, v13.4s, v5.s[2]
        LDR     x8, [x4], 8             // a5

        // BLOCK 4
        FMLA    v20.4s, v14.4s, v3.s[1]
        LDR     d17, [x5, 80]
        FMLA    v22.4s, v14.4s, v3.s[3]
        INS     v2.d[1], x8             // a5 ins
        FMLA    v24.4s, v14.4s, v4.s[1]
        LDR     x8, [x5, 88]

        // BLOCK 5
        FMLA    v26.4s, v14.4s, v4.s[3]
        LDR     d18, [x5, 96]
        FMLA    v28.4s, v14.4s, v5.s[1]
        INS     v17.d[1], x8            // b
        FMLA    v30.4s, v14.4s, v5.s[3]
        LDR     x8, [x5, 104]

        // BLOCK 6
        FMLA    v21.4s, v15.4s, v3.s[1]
        LDR     d19, [x5, 112]
        FMLA    v23.4s, v15.4s, v3.s[3]
        INS     v18.d[1], x8            // b
        FMLA    v25.4s, v15.4s, v4.s[1]
        LDR     x8, [x5, 120]           // upper half of v19; ins at top of loop

        // BLOCK 7
        FMLA    v27.4s, v15.4s, v4.s[3]
        SUBS    x0, x0, 16
        FMLA    v29.4s, v15.4s, v5.s[1]
        ADD     x5, x5, 128
        FMLA    v31.4s, v15.4s, v5.s[3]
        B.HS    1b

        // Epilogue - 4 floats of A (16 bytes)
        // 48 FMA + 12 LD64 A + 8 LDR B
2:
        // First group of 24 FMA, Second group loads
        // BLOCK 0
        FMLA    v20.4s, v16.4s, v0.s[0]
        LDR     d3, [x3], 8             // a0
        FMLA    v22.4s, v16.4s, v0.s[2]
        INS     v19.d[1], x8            // b from second group
        FMLA    v24.4s, v16.4s, v1.s[0]
        LDR     x8, [x9], 8             // a1

        // BLOCK 1
        FMLA    v26.4s, v16.4s, v1.s[2]
        LDR     d12, [x5]
        FMLA    v28.4s, v16.4s, v2.s[0]
        INS     v3.d[1], x8             // a1 ins
        FMLA    v30.4s, v16.4s, v2.s[2]
        LDR     x8, [x5, 8]             // b

        // BLOCK 2
        FMLA    v21.4s, v17.4s, v0.s[0]
        LDR     d4, [x10], 8            // a2
        FMLA    v23.4s, v17.4s, v0.s[2]
        INS     v12.d[1], x8            // b ins
        FMLA    v25.4s, v17.4s, v1.s[0]
        LDR     x8, [x11], 8            // a3

        // BLOCK 3
        FMLA    v27.4s, v17.4s, v1.s[2]
        LDR     d5, [x12], 8            // a4
        FMLA    v29.4s, v17.4s, v2.s[0]
        INS     v4.d[1], x8             // a3 ins
        FMLA    v31.4s, v17.4s, v2.s[2]
        LDR     x8, [x4], 8             // a5

        // BLOCK 4
        FMLA    v20.4s, v18.4s, v0.s[1]
        LDR     d13, [x5, 16]
        FMLA    v22.4s, v18.4s, v0.s[3]
        INS     v5.d[1], x8             // a5 ins
        FMLA    v24.4s, v18.4s, v1.s[1]
        LDR     x8, [x5, 24]

        // BLOCK 5
        FMLA    v26.4s, v18.4s, v1.s[3]
        LDR     d14, [x5, 32]
        FMLA    v28.4s, v18.4s, v2.s[1]
        INS     v13.d[1], x8            // b
        FMLA    v30.4s, v18.4s, v2.s[3]
        LDR     x8, [x5, 40]

        // BLOCK 6
        FMLA    v21.4s, v19.4s, v0.s[1]
        LDR     d15, [x5, 48]
        FMLA    v23.4s, v19.4s, v0.s[3]
        INS     v14.d[1], x8            // b
        FMLA    v25.4s, v19.4s, v1.s[1]
        LDR     x8, [x5, 56]

        // BLOCK 7
        FMLA    v27.4s, v19.4s, v1.s[3]
        FMLA    v29.4s, v19.4s, v2.s[1]
        INS     v15.d[1], x8            // b
        FMLA    v31.4s, v19.4s, v2.s[3]

        // Second group of 24 FMA, no further loads; prefetch C for the stores
        // BLOCK 0
        FMLA    v20.4s, v12.4s, v3.s[0]
        PRFM    PSTL1KEEP, [x6]         // Prefetch C0
        FMLA    v22.4s, v12.4s, v3.s[2]
        PRFM    PSTL1KEEP, [x16]        // Prefetch C1
        FMLA    v24.4s, v12.4s, v4.s[0]
        PRFM    PSTL1KEEP, [x17]        // Prefetch C2

        // BLOCK 1
        FMLA    v26.4s, v12.4s, v4.s[2]
        PRFM    PSTL1KEEP, [x14]        // Prefetch C3
        FMLA    v28.4s, v12.4s, v5.s[0]
        PRFM    PSTL1KEEP, [x13]        // Prefetch C4
        FMLA    v30.4s, v12.4s, v5.s[2]
        PRFM    PSTL1KEEP, [x7]         // Prefetch C5

        // BLOCK 2
        FMLA    v21.4s, v13.4s, v3.s[0]
        FMLA    v23.4s, v13.4s, v3.s[2]
        FMLA    v25.4s, v13.4s, v4.s[0]

        // BLOCK 3
        FMLA    v27.4s, v13.4s, v4.s[2]
        FMLA    v29.4s, v13.4s, v5.s[0]
        FMLA    v31.4s, v13.4s, v5.s[2]

        // BLOCK 4
        FMLA    v20.4s, v14.4s, v3.s[1]
        FMLA    v22.4s, v14.4s, v3.s[3]
        FMLA    v24.4s, v14.4s, v4.s[1]

        // BLOCK 5
        FMLA    v26.4s, v14.4s, v4.s[3]
        FMLA    v28.4s, v14.4s, v5.s[1]
        FMLA    v30.4s, v14.4s, v5.s[3]
        TST     x0, 15                  // any bytes of K left? consumed by B.NE below

        // BLOCK 6
        FMLA    v21.4s, v15.4s, v3.s[1]
        FMLA    v23.4s, v15.4s, v3.s[3]
        FMLA    v25.4s, v15.4s, v4.s[1]
        ADD     x5, x5, 64              // skip the second set of B read via offsets

        // BLOCK 7
        FMLA    v27.4s, v15.4s, v4.s[3]
        FMLA    v29.4s, v15.4s, v5.s[1]
        FMLA    v31.4s, v15.4s, v5.s[3]

        // Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE    5f
4:
        // Clamp
        FMAX    v20.4s, v20.4s, v6.4s
        // Load cn_stride. x0 (k counter) is dead here and is reinitialised at
        // label 0. LDR does not modify the flags set by SUBS below.
        LDR     x0, [sp, 32]
        SUBS    x1, x1, 8               // nc -= 8; flags used by B.LO / B.HI
        FMAX    v21.4s, v21.4s, v6.4s
        FMAX    v22.4s, v22.4s, v6.4s
        FMAX    v23.4s, v23.4s, v6.4s
        FMAX    v24.4s, v24.4s, v6.4s
        FMAX    v25.4s, v25.4s, v6.4s
        FMAX    v26.4s, v26.4s, v6.4s
        FMAX    v27.4s, v27.4s, v6.4s
        FMAX    v28.4s, v28.4s, v6.4s
        FMAX    v29.4s, v29.4s, v6.4s
        FMAX    v30.4s, v30.4s, v6.4s
        FMAX    v31.4s, v31.4s, v6.4s
        FMIN    v20.4s, v20.4s, v7.4s
        FMIN    v21.4s, v21.4s, v7.4s
        FMIN    v22.4s, v22.4s, v7.4s
        FMIN    v23.4s, v23.4s, v7.4s
        FMIN    v24.4s, v24.4s, v7.4s
        FMIN    v25.4s, v25.4s, v7.4s
        FMIN    v26.4s, v26.4s, v7.4s
        FMIN    v27.4s, v27.4s, v7.4s
        FMIN    v28.4s, v28.4s, v7.4s
        FMIN    v29.4s, v29.4s, v7.4s
        FMIN    v30.4s, v30.4s, v7.4s
        FMIN    v31.4s, v31.4s, v7.4s

        // Store full 6 x 8 (fewer than 8 columns left -> partial store at 8)
        B.LO    8f

        ST1     {v30.16b, v31.16b}, [x7], x0
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v28.16b, v29.16b}, [x13], x0
        SUB     x9, x9, x2              // a1 -= kc
        ST1     {v26.16b, v27.16b}, [x14], x0
        SUB     x10, x10, x2            // a2 -= kc
        ST1     {v24.16b, v25.16b}, [x17], x0
        SUB     x11, x11, x2            // a3 -= kc
        ST1     {v22.16b, v23.16b}, [x16], x0
        SUB     x12, x12, x2            // a4 -= kc
        ST1     {v20.16b, v21.16b}, [x6], x0
        SUB     x4, x4, x2              // a5 -= kc

        B.HI    0b                      // more columns remain (nc > 0)

        // Restore d12-d15 from stack
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 32
        RET

5:
        // Is there a remainder?- 2 floats of A (8 bytes)
        TBZ     x0, 3, 6f

        // Remainder- 2 floats of A (8 bytes)
        LDR     d0, [x3], 8
        LDR     q16, [x5], 16
        LD1     {v0.d}[1], [x9], 8
        LDR     d1, [x10], 8
        LD1     {v1.d}[1], [x11], 8
        LDR     d2, [x12], 8
        LD1     {v2.d}[1], [x4], 8
        LDR     q17, [x5], 16
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16

        FMLA    v20.4s, v16.4s, v0.s[0]
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]
        FMLA    v26.4s, v16.4s, v1.s[2]
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v2.s[2]
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[2]
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v31.4s, v17.4s, v2.s[2]

        FMLA    v20.4s, v18.4s, v0.s[1]
        FMLA    v22.4s, v18.4s, v0.s[3]
        FMLA    v24.4s, v18.4s, v1.s[1]
        FMLA    v26.4s, v18.4s, v1.s[3]
        FMLA    v28.4s, v18.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v2.s[3]
        FMLA    v21.4s, v19.4s, v0.s[1]
        FMLA    v23.4s, v19.4s, v0.s[3]
        FMLA    v25.4s, v19.4s, v1.s[1]
        FMLA    v27.4s, v19.4s, v1.s[3]
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v31.4s, v19.4s, v2.s[3]

        // Is there a remainder?- 1 floats of A (4 bytes)
        TBZ     x0, 2, 4b
6:
        // Remainder- 1 floats of A (4 bytes)
        LDR     s0, [x3], 4
        LDR     q16, [x5], 16
        LD1     {v0.s}[2], [x9], 4
        LDR     s1, [x10], 4
        LD1     {v1.s}[2], [x11], 4
        LDR     s2, [x12], 4
        LD1     {v2.s}[2], [x4], 4
        LDR     q17, [x5], 16

        FMLA    v20.4s, v16.4s, v0.s[0]
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]
        FMLA    v26.4s, v16.4s, v1.s[2]
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v2.s[2]
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[2]
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v31.4s, v17.4s, v2.s[2]
        B       4b

        // Store odd width: bits 2, 1 and 0 of x1 select 4, 2 and 1 columns
8:
        TBZ     x1, 2, 9f
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x14], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20, [x6], 16
        MOV     v20.16b, v21.16b

9:
        TBZ     x1, 1, 10f
        STR     d30, [x7], 8
        DUP     d30, v30.d[1]
        STR     d28, [x13], 8
        DUP     d28, v28.d[1]
        STR     d26, [x14], 8
        DUP     d26, v26.d[1]
        STR     d24, [x17], 8
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        DUP     d22, v22.d[1]
        STR     d20, [x6], 8
        DUP     d20, v20.d[1]

10:
        TBZ     x1, 0, 11f
        STR     s30, [x7]
        STR     s28, [x13]
        STR     s26, [x14]
        STR     s24, [x17]
        STR     s22, [x16]
        STR     s20, [x6]
11:
        // Restore d12-d15 from stack
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif