// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-ios.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 16] -> x8

# d8-d15 need to be preserved if used.
# x19-x30 need to be preserved if used.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

# IOS microkernel is based on Cortex-A75 kernel but avoids X18 by
# using X14 instead of X18, and reloading cn_stride into x0.

BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios

        # Overview of the control flow below:
        #   setup  - save d8-d15, derive a1..a5 / c1..c5 from a0 / c0,
        #            aliasing row i to row i-1 when mr <= i.
        #   0:     - per 8-column tile: load the 6x8 accumulators from acc (x15).
        #   1:/2:  - software-pipelined main loop / epilogue, 8 floats of K each.
        #   4:-6:  - K remainders of 4, 2 and 1 floats.
        #   3:     - clamp to [min, max] and store a full 6x8 tile.
        #   7:-10: - store a partial tile (nc remainder of 4, 2, 1 columns).
        # x0 holds mr only during setup; afterwards it is the K byte counter
        # and, from label 3 on, cn_stride reloaded from the stack.

        # Clamp A and C pointers / Save d8-d15 on stack
        STP d8, d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        # Load acc, params pointer
        # (stack args sit 64 bytes higher because of the d8-d15 save area:
        #  cn_stride at [sp, 64], acc at [sp, 72], params at [sp, 80])
        LDP x15, x8, [sp, 72]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

0:
        # Load initial accumulators
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        LDP q28, q29, [x15], 32
        LDP q30, q31, [x15], 32

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32          // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 96 FMA
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        LDP q12, q13, [x5], 32   // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 12 LDR A + 8 LDP B
1:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16         // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        LDR q0, [x3], 16         // Load next 6 A
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR q1, [x9], 16
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        LDR q2, [x10], 16
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR q3, [x11], 16

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        LDR q4, [x12], 16
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR q5, [x4], 16
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        LDP q12, q13, [x5], 32   // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        SUBS x0, x0, 32          // k -= 32; flags consumed by B.HS below
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]
        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16         // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]

        # Load min/max values
        # (v6/v7 were last read as A0/A1 by the two FMLAs just above, so they
        #  can be reused as clamp bounds while the remaining FMLAs retire)
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        # Clamp
3:
        FMAX v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8           // nc -= 8; flags consumed by B.LO / B.HI below
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        # Load cn_stride
        LDR x0, [sp, 64]
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 7f                  // fewer than 8 columns left -> partial store

        STP q30, q31, [x7]
        ADD x7, x7, x0
        SUB x3, x3, x2           // a0 -= kc
        STP q28, q29, [x13]
        ADD x13, x13, x0
        SUB x9, x9, x2           // a1 -= kc
        STP q26, q27, [x14]
        ADD x14, x14, x0
        SUB x10, x10, x2         // a2 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x0
        SUB x11, x11, x2         // a3 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x0
        SUB x12, x12, x2         // a4 -= kc
        STP q20, q21, [x6]
        ADD x6, x6, x0
        SUB x4, x4, x2           // a5 -= kc

        B.HI 0b                  // more columns remain (flags from SUBS x1 above)

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

4:
        # Load min/max values
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B 3b

        # Store odd width
        # (bits 2, 1, 0 of x1 select a 4-, 2- and 1-column store; after each
        #  store the upper part of the row is shifted down into the low lanes)
7:
        TBZ x1, 2, 8f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
8:
        TBZ x1, 1, 9f
        STR d30, [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26, [x14], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

9:
        TBZ x1, 0, 10f
        STR s30, [x7]
        STR s28, [x13]
        STR s26, [x14]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios

// On ELF targets, emit an empty .note.GNU-stack section (the GNU toolchain
// convention for declaring that this object needs no executable stack).
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif