// blob: d26451314f37647b041cab8fabf426b6785debad [file] [log] [blame]
// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

// Project assembly helpers (presumably the source of the BEGIN_FUNCTION /
// END_FUNCTION macros used in this file -- confirm against the header).
#include <xnnpack/assembly.h>

# void xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 8] -> x8

// Overview (derived from the code below):
//   * Computes up to 6 rows x 8 columns of f32 output per pass of the outer
//     loop (label 0), accumulating along k with FMLA and then clamping.
//   * kc is consumed in bytes: the main loop and epilogue each handle 16 bytes
//     (4 floats) of every A row; remainders of 8 and 4 bytes are handled at
//     labels 5 and 6.
//   * w is read sequentially: 32 bytes that initialise the accumulators, then
//     32 bytes of B per float of k.
//   * x0 holds mr only during pointer setup; afterwards it is reused as the
//     running k counter.
//   * x8 holds the params pointer only until the LD2R; afterwards it is the
//     general-purpose staging register for 64-bit loads that are inserted into
//     the upper half of a vector register with INS.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
// This kernel uses v12-v15, so d12-d15 are saved/restored on the stack.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x18 c3
# x13 c4
#  x7 c5
// NOTE(review): x18 is the AAPCS64 platform register and is reserved on some
// platforms (e.g. Apple and Windows). Confirm that every target this file is
// built for permits using it as a scratch register.

# x8 temporary vector shadow register

# Vector register usage
# A0   v0     v3
# A1   v0[1]  v3[1]
# A2   v1     v4
# A3   v1[1]  v4[1]
# A4   v2     v5
# A5   v2[1]  v5[1]
# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11
// Each A register packs two rows: the low 64 bits hold two consecutive floats
// of the even row, the high 64 bits two floats of the odd row. Lanes s[0]/s[1]
// therefore feed the even row and s[2]/s[3] the odd row.

BEGIN_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55

        # Clamp A and C pointers
        // Rows at index >= mr alias the previous row, so they read valid A
        // data and rewrite the same C row instead of going out of bounds.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x18, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x18, x17, x18, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x18, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x18, x13, LS   //   c4 = c3

        # Load params pointer
        // Stack arguments are read before sp is adjusted by the STP below.
        LDR x8, [sp, 8]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load min/max values
        // v6 = first float at params broadcast to all lanes (lower bound,
        // applied with FMAX); v7 = second float (upper bound, applied with FMIN).
        LD2R {v6.4s, v7.4s}, [x8]

        # Load cn_stride
        LDR x14, [sp]

        // Save d12-d15 on stack
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

        // Outer loop: one iteration per block of up to 8 output columns.
0:
        # Load initial bias from w into accumulators
        // PRFM and MOV do not modify flags, so the result of the SUBS below
        // is still valid at the B.LO that follows the prefetch sequence.
        LDP q20, q21, [x5], 32
        SUBS x0, x2, 16          // k = kc - 16
        PRFM PLDL1KEEP, [x3,   0]  // Prefetch A
        PRFM PLDL1KEEP, [x3,  64]
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x9,   0]
        PRFM PLDL1KEEP, [x9,  64]
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x10,  0]
        PRFM PLDL1KEEP, [x10, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x11,  0]
        PRFM PLDL1KEEP, [x11, 64]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x12,  0]
        PRFM PLDL1KEEP, [x12, 64]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x4,   0]
        PRFM PLDL1KEEP, [x4,  64]
        PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5,  64]
        MOV v28.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v29.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v30.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 256]
        MOV v31.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 320]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        B.LO 5f

        # Prologue - First group loads, no FMA
        // Loads 2 floats of each A row and the matching 64 bytes of B. The
        // upper half of v19 is left in x8 and inserted by BLOCK 0 of the
        // main loop or epilogue.
        LDR d0, [x3], 8             // a0
        LDP q16, q17, [x5], 32      // b
        LDR d1, [x10], 8            // a2
        LDR d2, [x12], 8            // a4
        LD1 {v0.d}[1], [x9], 8      // a1
        LD1 {v1.d}[1], [x11], 8     // a3
        LD1 {v2.d}[1], [x4], 8      // a5
        SUBS x0, x0, 16
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x8, [x5], 8             // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
        // Each B vector is assembled from two 64-bit loads (LDR d + LDR x8 +
        // INS) interleaved between the FMLAs. NOTE(review): presumably chosen
        // to suit Cortex-A55 issue rules -- see the template for rationale.
        // The instruction order is hand-scheduled; do not reorder.
1:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR d3, [x3], 8             // a0
        FMLA v22.4s, v16.4s, v0.s[2]
        INS v19.d[1], x8            // b from second group
        FMLA v24.4s, v16.4s, v1.s[0]
        LDR x8, [x9], 8             // a1

        // BLOCK 1
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR d12, [x5]
        FMLA v28.4s, v16.4s, v2.s[0]
        INS v3.d[1], x8             // a1 ins
        FMLA v30.4s, v16.4s, v2.s[2]
        LDR x8, [x5, 8]             // b

        // BLOCK 2
        FMLA v21.4s, v17.4s, v0.s[0]
        LDR d4, [x10], 8            // a2
        FMLA v23.4s, v17.4s, v0.s[2]
        INS v12.d[1], x8            // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x8, [x11], 8            // a3

        // BLOCK 3
        FMLA v27.4s, v17.4s, v1.s[2]
        LDR d5, [x12], 8            // a4
        FMLA v29.4s, v17.4s, v2.s[0]
        INS v4.d[1], x8             // a3 ins
        FMLA v31.4s, v17.4s, v2.s[2]
        LDR x8, [x4], 8             // a5

        // BLOCK 4
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR d13, [x5, 16]
        FMLA v22.4s, v18.4s, v0.s[3]
        INS v5.d[1], x8             // a5 ins
        FMLA v24.4s, v18.4s, v1.s[1]
        LDR x8, [x5, 24]

        // BLOCK 5
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR d14, [x5, 32]
        FMLA v28.4s, v18.4s, v2.s[1]
        INS v13.d[1], x8            // b
        FMLA v30.4s, v18.4s, v2.s[3]
        LDR x8, [x5, 40]

        // BLOCK 6
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR d15, [x5, 48]
        FMLA v23.4s, v19.4s, v0.s[3]
        INS v14.d[1], x8            // b
        FMLA v25.4s, v19.4s, v1.s[1]
        LDR x8, [x5, 56]

        // BLOCK 7
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        INS v15.d[1], x8
        FMLA v31.4s, v19.4s, v2.s[3]

        # Second group of 24 FMA, First group of loads
        // BLOCK 0
        FMLA v20.4s, v12.4s, v3.s[0]
        LDR d0, [x3], 8             // a0
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]
        LDR x8, [x9], 8             // a1

        // BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        LDR d16, [x5, 64]
        FMLA v28.4s, v12.4s, v5.s[0]
        INS v0.d[1], x8             // a1 ins
        FMLA v30.4s, v12.4s, v5.s[2]
        LDR x8, [x5, 72]            // b

        // BLOCK 2
        FMLA v21.4s, v13.4s, v3.s[0]
        LDR d1, [x10], 8            // a2
        FMLA v23.4s, v13.4s, v3.s[2]
        INS v16.d[1], x8            // b
        FMLA v25.4s, v13.4s, v4.s[0]
        LDR x8, [x11], 8            // a3

        // BLOCK 3
        FMLA v27.4s, v13.4s, v4.s[2]
        LDR d2, [x12], 8            // a4
        FMLA v29.4s, v13.4s, v5.s[0]
        INS v1.d[1], x8             // a3 ins
        FMLA v31.4s, v13.4s, v5.s[2]
        LDR x8, [x4], 8             // a5

        // BLOCK 4
        FMLA v20.4s, v14.4s, v3.s[1]
        LDR d17, [x5, 80]
        FMLA v22.4s, v14.4s, v3.s[3]
        INS v2.d[1], x8             // a5 ins
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR x8, [x5, 88]

        // BLOCK 5
        FMLA v26.4s, v14.4s, v4.s[3]
        LDR d18, [x5, 96]
        FMLA v28.4s, v14.4s, v5.s[1]
        INS v17.d[1], x8            // b
        FMLA v30.4s, v14.4s, v5.s[3]
        LDR x8, [x5, 104]

        // BLOCK 6
        FMLA v21.4s, v15.4s, v3.s[1]
        LDR d19, [x5, 112]
        FMLA v23.4s, v15.4s, v3.s[3]
        INS v18.d[1], x8            // b
        FMLA v25.4s, v15.4s, v4.s[1]
        LDR x8, [x5, 120]           // upper half of v19, inserted in next BLOCK 0

        // BLOCK 7
        FMLA v27.4s, v15.4s, v4.s[3]
        SUBS x0, x0, 16
        FMLA v29.4s, v15.4s, v5.s[1]
        ADD x5, x5, 128             // w advances 128 bytes per main-loop iteration
        FMLA v31.4s, v15.4s, v5.s[3]
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 48 FMA + 6 LD64 A + 8 LD64 B (second set only; no loads for a next iteration)
2:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR d3, [x3], 8             // a0
        FMLA v22.4s, v16.4s, v0.s[2]
        INS v19.d[1], x8            // b from second group
        FMLA v24.4s, v16.4s, v1.s[0]
        LDR x8, [x9], 8             // a1

        // BLOCK 1
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR d12, [x5]
        FMLA v28.4s, v16.4s, v2.s[0]
        INS v3.d[1], x8             // a1 ins
        FMLA v30.4s, v16.4s, v2.s[2]
        LDR x8, [x5, 8]             // b

        // BLOCK 2
        FMLA v21.4s, v17.4s, v0.s[0]
        LDR d4, [x10], 8            // a2
        FMLA v23.4s, v17.4s, v0.s[2]
        INS v12.d[1], x8            // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x8, [x11], 8            // a3

        // BLOCK 3
        FMLA v27.4s, v17.4s, v1.s[2]
        LDR d5, [x12], 8            // a4
        FMLA v29.4s, v17.4s, v2.s[0]
        INS v4.d[1], x8             // a3 ins
        FMLA v31.4s, v17.4s, v2.s[2]
        LDR x8, [x4], 8             // a5

        // BLOCK 4
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR d13, [x5, 16]
        FMLA v22.4s, v18.4s, v0.s[3]
        INS v5.d[1], x8             // a5 ins
        FMLA v24.4s, v18.4s, v1.s[1]
        LDR x8, [x5, 24]

        // BLOCK 5
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR d14, [x5, 32]
        FMLA v28.4s, v18.4s, v2.s[1]
        INS v13.d[1], x8            // b
        FMLA v30.4s, v18.4s, v2.s[3]
        LDR x8, [x5, 40]

        // BLOCK 6
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR d15, [x5, 48]
        FMLA v23.4s, v19.4s, v0.s[3]
        INS v14.d[1], x8            // b
        FMLA v25.4s, v19.4s, v1.s[1]
        LDR x8, [x5, 56]

        // BLOCK 7
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        INS v15.d[1], x8            // b
        FMLA v31.4s, v19.4s, v2.s[3]

        # Second group of 24 FMA, no further loads; prefetch C instead
        // BLOCK 0
        FMLA v20.4s, v12.4s, v3.s[0]
        PRFM PSTL1KEEP, [x6]        // Prefetch C0
        FMLA v22.4s, v12.4s, v3.s[2]
        PRFM PSTL1KEEP, [x16]       // Prefetch C1
        FMLA v24.4s, v12.4s, v4.s[0]
        PRFM PSTL1KEEP, [x17]       // Prefetch C2

        // BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        PRFM PSTL1KEEP, [x18]       // Prefetch C3
        FMLA v28.4s, v12.4s, v5.s[0]
        PRFM PSTL1KEEP, [x13]       // Prefetch C4
        FMLA v30.4s, v12.4s, v5.s[2]
        PRFM PSTL1KEEP, [x7]        // Prefetch C5

        // BLOCK 2
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]
        FMLA v25.4s, v13.4s, v4.s[0]

        // BLOCK 3
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v29.4s, v13.4s, v5.s[0]
        FMLA v31.4s, v13.4s, v5.s[2]

        // BLOCK 4
        FMLA v20.4s, v14.4s, v3.s[1]
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]

        // BLOCK 5
        FMLA v26.4s, v14.4s, v4.s[3]
        FMLA v28.4s, v14.4s, v5.s[1]
        FMLA v30.4s, v14.4s, v5.s[3]
        TST x0, 15                  // any of the low 4 bits of k set => remainder bytes

        // BLOCK 6
        FMLA v21.4s, v15.4s, v3.s[1]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        ADD x5, x5, 64              // skip the 64 bytes of B consumed by the epilogue

        // BLOCK 7
        FMLA v27.4s, v15.4s, v4.s[3]
        FMLA v29.4s, v15.4s, v5.s[1]
        FMLA v31.4s, v15.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        // Uses the flags from the TST above (ADD and FMLA leave them intact).
        B.NE 5f
4:
        # Clamp
        // The flags set by SUBS are consumed by B.LO below and, on the
        // full-store path, again by B.HI; nothing in between modifies them.
        FMAX v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8              // nc -= 8
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 8f                     // fewer than 8 columns left: partial store

        // Each store post-increments its C pointer by cn_stride; each A
        // pointer is rewound by kc so the next column block rereads the rows.
        ST1 {v20.16b, v21.16b},  [x6], x14
        SUB  x3,  x3, x2            // a0 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x14
        SUB  x9,  x9, x2            // a1 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x14
        SUB x10, x10, x2            // a2 -= kc
        ST1 {v26.16b, v27.16b}, [x18], x14
        SUB x11, x11, x2            // a3 -= kc
        ST1 {v28.16b, v29.16b}, [x13], x14
        SUB x12, x12, x2            // a4 -= kc
        ST1 {v30.16b, v31.16b},  [x7], x14
        SUB  x4,  x4, x2            // a5 -= kc

        B.HI 0b                     // more columns remain (nc > 0)

        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x3], 8
        LDR q16, [x5], 16
        LD1 {v0.d}[1], [x9], 8
        LDR d1, [x10], 8
        LD1 {v1.d}[1], [x11], 8
        LDR d2, [x12], 8
        LD1 {v2.d}[1], [x4], 8
        LDR q17, [x5], 16
        LDR q18, [x5], 16
        LDR q19, [x5], 16

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v2.s[3]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v31.4s, v19.4s, v2.s[3]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 4b
6:
        # Remainder- 1 floats of A (4 bytes)
        // Odd rows are loaded into lane 2 so the s[0]/s[2] lane convention of
        // the other paths still applies.
        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LD1 {v0.s}[2], [x9], 4
        LDR s1, [x10], 4
        LD1 {v1.s}[2], [x11], 4
        LDR s2, [x12], 4
        LD1 {v2.s}[2], [x4], 4
        LDR q17, [x5], 16

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]
        B 4b

        # Store odd width
        // x1 holds nc - 8 here; bits 2, 1 and 0 select stores of 4, 2 and 1
        // floats per row. After each partial store the remaining lanes are
        // shifted down into the low part of the even-numbered accumulator.
8:
        TBZ x1, 2, 9f
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x18], 16
        MOV v26.16b, v27.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b

9:
        TBZ x1, 1, 10f
        STR d20,  [x6], 8
        DUP d20, v20.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26, [x18], 8
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d30,  [x7], 8
        DUP d30, v30.d[1]

10:
        TBZ x1, 0, 11f
        STR s20,  [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x18]
        STR s28, [x13]
        STR s30,  [x7]
11:
        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55

// On ELF targets, emit an empty .note.GNU-stack section (no "x" flag); by GNU
// toolchain convention this marks the object as not needing an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif