// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

// Computes up to 6 rows x 8 columns of C per pass over nc:
//   - the accumulators start from 8 floats read from w,
//   - then, for every float of the kc bytes of each A row, a packed
//     8-float row of w is multiply-accumulated (FMLA by element),
//   - the result is clamped with the two floats at params
//     (first float -> FMIN bound, second float -> FMAX bound),
//   - and stored to the C rows, full width (8) or the nc & 7 remainder.
// Rows beyond mr alias the previous row (pointers clamped with CSEL), so
// they redo and re-store that row's result instead of touching new memory.
//
// NOTE(review): this file is generated; mirror any change made here in the
// template named in the file header.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
// x18 is the AAPCS64 platform register and is deliberately left untouched;
// c3 lives in x15 (caller-saved scratch) instead.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x15 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75

        # Clamp A and C pointers / Save d8-d15 on stack
        // The pre-indexed STP moves sp down by 64, so the two stack
        // arguments are read below at [sp, 64] and [sp, 72].
        STP d8, d9, [sp, -64]!
        CMP x0, 2                       // if mr < 2
        ADD x9, x3, x4                  // a1 = a0 + a_stride
        ADD x16, x6, x7                 // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO             //   a1 = a0
        CSEL x16, x6, x16, LO           //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4                 // a2 = a1 + a_stride
        ADD x17, x16, x7                // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags from CMP x0, 2)
        CSEL x10, x9, x10, LS           //   a2 = a1
        CSEL x17, x16, x17, LS          //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                       // if mr < 4
        ADD x11, x10, x4                // a3 = a2 + a_stride
        ADD x15, x17, x7                // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO          //   a3 = a2
        CSEL x15, x17, x15, LO          //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4                // a4 = a3 + a_stride
        ADD x13, x15, x7                // c4 = c3 + cm_stride
                                        // if mr <= 4 (flags from CMP x0, 4)
        CSEL x12, x11, x12, LS          //   a4 = a3
        CSEL x13, x15, x13, LS          //   c4 = c3

        # Load params pointer
        LDR x8, [sp, 72]

        CMP x0, 6                       // if mr < 6
        ADD x4, x12, x4                 // a5 = a4 + a_stride
        ADD x7, x13, x7                 // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO            //   a5 = a4
        CSEL x7, x13, x7, LO            //   c5 = c4

        # Load cn_stride
        LDR x14, [sp, 64]

0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 0]         // Prefetch B
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x3]            // Prefetch A
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x9]
        MOV v28.16b, v20.16b
        PRFM PLDL1KEEP, [x10]
        MOV v29.16b, v21.16b
        PRFM PLDL1KEEP, [x11]
        MOV v30.16b, v20.16b
        PRFM PLDL1KEEP, [x12]
        MOV v31.16b, v21.16b
        PRFM PLDL1KEEP, [x4]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        // mr is no longer needed, so x0 is reused as the k counter.
        SUBS x0, x2, 32                 // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 96 FMA
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        LDP q12, q13, [x5], 32          // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 12 LDR A + 8 LDP B
1:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32          // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 128]       // Prefetch B
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16                // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32          // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        LDR q0, [x3], 16                // Load next 6 A
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR q1, [x9], 16
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        LDR q2, [x10], 16
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR q3, [x11], 16

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        LDR q4, [x12], 16
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR q5, [x4], 16
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        LDP q12, q13, [x5], 32          // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        SUBS x0, x0, 32
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]
        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32          // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 128]       // Prefetch B
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16                // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32          // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]

        # Load clamping_params values
        // v6/v7 are free from here on: the remaining FMLAs read only v8-v11.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        // Only multiples of 32 were subtracted from k, so its low 5 bits
        // still equal kc % 32.
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        # Clamp
3:
        FMIN v20.4s, v20.4s, v6.4s
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        CMP x1, 8
        B.LO 7f

        // Each C pointer advances by cn_stride; each A pointer is rewound by
        // kc bytes so the next group of 8 columns re-reads the same A rows.
        STP q20, q21, [x6]
        ADD x6, x6, x14
        SUB x3, x3, x2                  // a0 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x14
        SUB x9, x9, x2                  // a1 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x14
        SUB x10, x10, x2                // a2 -= kc
        STP q26, q27, [x15]
        ADD x15, x15, x14
        SUB x11, x11, x2                // a3 -= kc
        STP q28, q29, [x13]
        ADD x13, x13, x14
        SUB x12, x12, x2                // a4 -= kc
        STP q30, q31, [x7]
        ADD x7, x7, x14
        SUB x4, x4, x2                  // a5 -= kc

        SUBS x1, x1, 8
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

4:
        # Load clamping_params values
        // Needed here because the kc < 32 path reaches this label without
        // having executed the load in the epilogue.
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B 3b

        # Store odd width
        // nc < 8 here: bits 2, 1 and 0 of nc select a 4-, 2- and 1-float
        // store; after each store the not-yet-written lanes are moved down
        // to the low end of the even-numbered accumulator.
7:
        TBZ x1, 2, 8f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x15], 16
        MOV v26.16b, v27.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
8:
        TBZ x1, 1, 9f
        STR d20, [x6], 8
        DUP d20, v20.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26, [x15], 8
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d30, [x7], 8
        DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        STR s20, [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x15]
        STR s28, [x13]
        STR s30, [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

// The name must match BEGIN_FUNCTION exactly (including the xnn_ prefix).
END_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif