// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# Computes a tile of up to 4 rows x 12 columns of C per pass over kc.
# The accumulators start from the bias stored at the front of w (or from acc
# in the "inc" variant), are updated with FMLA of packed B (from w) by A,
# clamped against the two params values and stored to C. The outer loop
# repeats while nc columns remain.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is the AAPCS64 platform register and is deliberately not used.

# A pointers
#  x3 a0
# x11 a1
# x12 a2
#  x4 a3 / a_stride

# C pointers
#  x6 c0
#  x9 c1
# x10 c2
#  x7 c3 / cm_stride

# Vector register usage and GPR shadows
# a0  v0           first set of A
# a1  v0[1]        x13
# a2  v1
# a3  v1[1]        x8
# a0  v2           second set of A
# a1  v2[1]        x13
# a2  v3
# a3  v3[1]        x8
# B   v6  v7  v8   x20 x21 x16  first set of B
# B   v9 v10 v11   x17 x22 x19
# B  v14 v15 v16   x20 x21 x16  second set of B (same x as first set)
# B  v17 v18 v19   x17 x22 x19
# C  v20 v21 v22
# C  v23 v24 v25
# C  v26 v27 v28
# C  v29 v30 v31
# Clamp v4 v5
# v12 to v13 unused.

# Stack frame (80 bytes, allocated by the first STP below)
# [sp,  0] x21      [sp,  8] x22
# [sp, 16] x19      [sp, 24] x20
# [sp, 32] d8       [sp, 40] d9
# [sp, 48] d10      [sp, 56] d11
# [sp, 64] d14      [sp, 72] d15

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x12__aarch64_neonfma_cortex_a53

        # Stack arguments are read before sp is moved by the register save.
        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load clamping_params values
        # v4 = params[0] broadcast (upper bound, used by FMIN)
        # v5 = params[1] broadcast (lower bound, used by FMAX)
        # x8 is free for reuse as the a3 shadow after this load.
        LD2R {v4.4s, v5.4s}, [x8]

        # Save x19-x22 on stack
        STP x21, x22, [sp, -80]!
        STP x19, x20, [sp, 16]

        # Save d8-d11,d14,d15 on stack
        STP d8, d9, [sp, 32]
        STP d10, d11, [sp, 48]
        STP d14, d15, [sp, 64]

        # Clamp A and C pointers
        # Rows at or beyond mr alias the previous row, so they read and write
        # the same memory as a valid row instead of going out of bounds.
        ADD x11, x3, x4           // a1 = a0 + a_stride
        ADD x9, x6, x7            // c1 = c0 + cm_stride
        CMP x0, 2                 // if mr < 2
        CSEL x11, x3, x11, LO     //   a1 = a0
        CSEL x9, x6, x9, LO       //   c1 = c0
        ADD x12, x11, x4          // a2 = a1 + a_stride
        ADD x10, x9, x7           // c2 = c1 + cm_stride
                                  // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x12, x11, x12, LS    //   a2 = a1
        CSEL x10, x9, x10, LS     //   c2 = c1
        ADD x4, x12, x4           // a3 = a2 + a_stride
        ADD x7, x10, x7           // c3 = c2 + cm_stride
        CMP x0, 4                 // if mr < 4
        CSEL x4, x12, x4, LO      //   a3 = a2
        CSEL x7, x10, x7, LO      //   c3 = c2

0:
        $if INC:
          # Load initial accumulators
          LD1 {v20.16b, v21.16b, v22.16b}, [x15], 48
          LD1 {v23.16b, v24.16b, v25.16b}, [x15], 48
          LD1 {v26.16b, v27.16b, v28.16b}, [x15], 48
          LD1 {v29.16b, v30.16b, v31.16b}, [x15], 48
        $else:
          # Load initial bias from w into accumulators
          LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48
          MOV v23.16b, v20.16b
          MOV v24.16b, v21.16b
          MOV v25.16b, v22.16b
          MOV v26.16b, v20.16b
          MOV v27.16b, v21.16b
          MOV v28.16b, v22.16b
          MOV v29.16b, v20.16b
          MOV v30.16b, v21.16b
          MOV v31.16b, v22.16b

        PRFM PLDL1KEEP, [x5]
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]
        PRFM PLDL1KEEP, [x5, 256]
        PRFM PLDL1KEEP, [x5, 320]

        # Is there at least 4 floats (16 bytes)?
        SUBS x0, x2, 16           // k = kc - 16
        B.LO 3f

        # Flags from this SUBS are consumed by the B.LO 2f after the prologue;
        # the loads, INS and ADD in between do not modify flags.
        SUBS x0, x0, 16

        # Prologue - loads for first group of 24 FMA

        # Read first block of 4 A.
        LDR d0, [x3], 8           // a0
        LDR x13, [x11], 8         // a1
        LDR d1, [x12], 8          // a2
        LDR x8, [x4], 8           // a3

        LDR d6, [x5]              // vb0x0123
        LDR x20, [x5, 8]

        LDR d7, [x5, 16]          // vb0x4567
        LDR x21, [x5, 24]

        LDR d8, [x5, 32]          // vb0x89AB
        LDR x16, [x5, 40]

        LDR d9, [x5, 48]          // vb1x0123
        INS v0.d[1], x13
        LDR x17, [x5, 56]

        LDR d10, [x5, 64]         // vb1x4567
        INS v1.d[1], x8
        LDR x22, [x5, 72]

        LDR d11, [x5, 80]         // vb1x89AB
        LDR x19, [x5, 88]
        INS v6.d[1], x20
        ADD x5, x5, 96

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
1:
        # First group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
        # A is loaded for 2nd group into v2/v3
        # INS is 4 blocks (16 cycles) after load

        # BLOCK 0
        LDR d2, [x3], 8           // a0
        INS v7.d[1], x21
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x13, [x11], 8         // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        PRFM PLDL1KEEP, [x5, 192]
        FMLA v26.4s, v6.4s, v1.s[0]

        # BLOCK 1
        LDR d3, [x12], 8          // a2
        INS v8.d[1], x16
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x4], 8           // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v24.4s, v7.4s, v0.s[2]

        # BLOCK 2
        LDR d14, [x5]             // vb0x0123
        INS v9.d[1], x17
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x20, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        PRFM PLDL1KEEP, [x5, 320]
        FMLA v22.4s, v8.4s, v0.s[0]

        # BLOCK 3
        LDR d15, [x5, 16]         // vb0x4567
        INS v10.d[1], x22
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x21, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]

        # BLOCK 4
        LDR d16, [x5, 32]         // vb0x89AB
        INS v11.d[1], x19
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x16, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]

        # BLOCK 5
        LDR d17, [x5, 48]         // vb1x0123
        INS v2.d[1], x13          // a1 was loaded in block 0
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x17, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]

        # BLOCK 6
        LDR d18, [x5, 64]         // vb1x4567
        INS v3.d[1], x8           // a3 was loaded in block 1
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x22, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]         // vb1x89AB
        INS v14.d[1], x20         // v14 was loaded in block 2
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x19, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
        # A is loaded for 1st group into v0/v1

        # BLOCK 0
        LDR d0, [x3], 8           // a0
        INS v15.d[1], x21
        FMLA v20.4s, v14.4s, v2.s[0]
        LDR x13, [x11], 8         // a1
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        LDR d1, [x12], 8          // a2
        INS v16.d[1], x16
        FMLA v29.4s, v14.4s, v3.s[2]
        LDR x8, [x4], 8           // a3
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        LDR d6, [x5, 96]          // vb0x0123
        INS v17.d[1], x17
        FMLA v27.4s, v15.4s, v3.s[0]
        LDR x20, [x5, 104]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        LDR d7, [x5, 112]         // vb0x4567
        INS v18.d[1], x22
        FMLA v25.4s, v16.4s, v2.s[2]
        LDR x21, [x5, 120]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        LDR d8, [x5, 128]         // vb0x89AB
        INS v19.d[1], x19
        FMLA v20.4s, v17.4s, v2.s[1]
        LDR x16, [x5, 136]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        LDR d9, [x5, 144]         // vb1x0123
        INS v0.d[1], x13          // a1
        FMLA v29.4s, v17.4s, v3.s[3]
        LDR x17, [x5, 152]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        LDR d10, [x5, 160]        // vb1x4567
        INS v1.d[1], x8           // a3
        FMLA v27.4s, v18.4s, v3.s[1]
        LDR x22, [x5, 168]
        FMLA v30.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 16           // k -= 16; flags consumed by B.HS below
        FMLA v22.4s, v19.4s, v2.s[1]

        # BLOCK 7
        LDR d11, [x5, 176]        // vb1x89AB
        INS v6.d[1], x20
        FMLA v25.4s, v19.4s, v2.s[3]
        LDR x19, [x5, 184]
        FMLA v28.4s, v19.4s, v3.s[1]
        ADD x5, x5, 192
        FMLA v31.4s, v19.4s, v3.s[3]
        B.HS 1b

        # Epilogue
        # First block same as main loop.  Second block has no loads.
2:
        # BLOCK 0
        LDR d2, [x3], 8           // a0
        INS v7.d[1], x21
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x13, [x11], 8         // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        PRFM PLDL1KEEP, [x5, 192]
        FMLA v26.4s, v6.4s, v1.s[0]

        # BLOCK 1
        LDR d3, [x12], 8          // a2
        INS v8.d[1], x16
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x4], 8           // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v24.4s, v7.4s, v0.s[2]

        # BLOCK 2
        LDR d14, [x5]             // vb0x0123
        INS v9.d[1], x17
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x20, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        PRFM PLDL1KEEP, [x5, 320]
        FMLA v22.4s, v8.4s, v0.s[0]

        # BLOCK 3
        LDR d15, [x5, 16]         // vb0x4567
        INS v10.d[1], x22
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x21, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]

        # BLOCK 4
        LDR d16, [x5, 32]         // vb0x89AB
        INS v11.d[1], x19
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x16, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]

        # BLOCK 5
        LDR d17, [x5, 48]         // vb1x0123
        INS v2.d[1], x13          // a1 was loaded in block 0
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x17, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]

        # BLOCK 6
        LDR d18, [x5, 64]         // vb1x4567
        INS v3.d[1], x8           // a3 was loaded in block 1
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x22, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]         // vb1x89AB
        INS v14.d[1], x20         // v14 was loaded in block 2
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x19, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        ADD x5, x5, 96
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  8 blocks of 4 cycles.
        # Epilogue version does no loads

        # BLOCK 0
        INS v15.d[1], x21
        FMLA v20.4s, v14.4s, v2.s[0]
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        INS v16.d[1], x16
        FMLA v29.4s, v14.4s, v3.s[2]
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        INS v17.d[1], x17
        FMLA v27.4s, v15.4s, v3.s[0]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        INS v18.d[1], x22
        FMLA v25.4s, v16.4s, v2.s[2]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        INS v19.d[1], x19
        FMLA v20.4s, v17.4s, v2.s[1]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        FMLA v29.4s, v17.4s, v3.s[3]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        FMLA v27.4s, v18.4s, v3.s[1]
        FMLA v30.4s, v18.4s, v3.s[3]
        FMLA v22.4s, v19.4s, v2.s[1]

        # BLOCK 7
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v28.4s, v19.4s, v3.s[1]
        FMLA v31.4s, v19.4s, v3.s[3]

3:
        # x0 has only had multiples of 16 subtracted from kc, so its bits 3
        # and 2 still equal those of kc and select the 8 and 4 byte tails.
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 5f
        # Is there a remainder?- 1 floats of A (4 bytes)
        TBNZ x0, 2, 6f

4:
        # Clamp
        FMIN v20.4s, v20.4s, v4.4s
        FMIN v21.4s, v21.4s, v4.4s
        FMIN v22.4s, v22.4s, v4.4s
        FMIN v23.4s, v23.4s, v4.4s
        FMIN v24.4s, v24.4s, v4.4s
        FMIN v25.4s, v25.4s, v4.4s
        FMIN v26.4s, v26.4s, v4.4s
        FMIN v27.4s, v27.4s, v4.4s
        FMIN v28.4s, v28.4s, v4.4s
        FMIN v29.4s, v29.4s, v4.4s
        FMIN v30.4s, v30.4s, v4.4s
        FMIN v31.4s, v31.4s, v4.4s
        FMAX v20.4s, v20.4s, v5.4s
        FMAX v21.4s, v21.4s, v5.4s
        FMAX v22.4s, v22.4s, v5.4s
        FMAX v23.4s, v23.4s, v5.4s
        FMAX v24.4s, v24.4s, v5.4s
        FMAX v25.4s, v25.4s, v5.4s
        FMAX v26.4s, v26.4s, v5.4s
        FMAX v27.4s, v27.4s, v5.4s
        FMAX v28.4s, v28.4s, v5.4s
        FMAX v29.4s, v29.4s, v5.4s
        FMAX v30.4s, v30.4s, v5.4s
        FMAX v31.4s, v31.4s, v5.4s

        # Store full 4 x 12
        CMP x1, 12
        B.LO 7f

        # Each C pointer advances by cn_stride to the next column tile.
        ST1 {v29.16b, v30.16b, v31.16b}, [x7], x14
        ST1 {v26.16b, v27.16b, v28.16b}, [x10], x14
        ST1 {v23.16b, v24.16b, v25.16b}, [x9], x14
        ST1 {v20.16b, v21.16b, v22.16b}, [x6], x14

        # Rewind the A pointers so the next column tile rereads the same rows.
        SUB x3, x3, x2            // a0 -= kc
        SUB x11, x11, x2          // a1 -= kc
        SUB x12, x12, x2          // a2 -= kc
        SUB x4, x4, x2            // a3 -= kc

        SUBS x1, x1, 12
        B.HI 0b

        # Restore d8-d11,d14,d15 from stack
        LDP d14, d15, [sp, 64]
        LDP d10, d11, [sp, 48]
        LDP d8, d9, [sp, 32]

        # Restore x19-x22 from stack
        LDP x19, x20, [sp, 16]
        LDP x21, x22, [sp], 80
        RET

5:
        # Remainder - 2 floats of A (8 bytes)
        # Read first block of 4 A.
        LDR d0, [x3], 8           // a0
        LDR d1, [x11], 8          // a1
        LDR d2, [x12], 8          // a2
        LDR d3, [x4], 8           // a3
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LD1 {v9.16b, v10.16b, v11.16b}, [x5], 48

        # First block of 3 B
        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]

        # Second block of 3 B
        FMLA v20.4s, v9.4s, v0.s[1]
        FMLA v23.4s, v9.4s, v1.s[1]
        FMLA v26.4s, v9.4s, v2.s[1]
        FMLA v29.4s, v9.4s, v3.s[1]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v1.s[1]
        FMLA v27.4s, v10.4s, v2.s[1]
        FMLA v30.4s, v10.4s, v3.s[1]
        FMLA v22.4s, v11.4s, v0.s[1]
        FMLA v25.4s, v11.4s, v1.s[1]
        FMLA v28.4s, v11.4s, v2.s[1]
        FMLA v31.4s, v11.4s, v3.s[1]

        # No single trailing float: go clamp and store.
        TBZ x0, 2, 4b
6:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x3], 4           // a0
        LDR s1, [x11], 4          // a1
        LDR s2, [x12], 4          // a2
        LDR s3, [x4], 4           // a3
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48

        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]
        B 4b

7:
        # Store odd channels
        # nc < 12 here.  Bits 3..0 of nc select stores of 8, 4, 2 and 1
        # floats per row; after each store the remaining columns are moved
        # down into the lowest accumulator of the row.
        TBZ x1, 3, 8f
        STP q29, q30, [x7]
        ADD x7, x7, 32
        MOV v29.16b, v31.16b
        STP q26, q27, [x10]
        ADD x10, x10, 32
        MOV v26.16b, v28.16b
        STP q23, q24, [x9]
        ADD x9, x9, 32
        MOV v23.16b, v25.16b
        STP q20, q21, [x6]
        ADD x6, x6, 32
        MOV v20.16b, v22.16b

8:
        TBZ x1, 2, 9f
        STR q29, [x7], 16
        MOV v29.16b, v30.16b
        STR q26, [x10], 16
        MOV v26.16b, v27.16b
        STR q23, [x9], 16
        MOV v23.16b, v24.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b

9:
        TBZ x1, 1, 10f
        STR d29, [x7], 8
        DUP d29, v29.d[1]
        STR d26, [x10], 8
        DUP d26, v26.d[1]
        STR d23, [x9], 8
        DUP d23, v23.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s29, [x7]
        STR s26, [x10]
        STR s23, [x9]
        STR s20, [x6]
11:
        # Restore d8-d11,d14,d15 from stack
        LDP d14, d15, [sp, 64]
        LDP d10, d11, [sp, 48]
        LDP d8, d9, [sp, 32]

        # Restore x19-x22 from stack
        LDP x19, x20, [sp, 16]
        LDP x21, x22, [sp], 80
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x12__aarch64_neonfma_cortex_a53

// On ELF targets, emit an empty .note.GNU-stack section so the linker does
// not mark the stack executable because of this object file.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif