blob: 28e114d68c9671d9223262ca6e01d3435a458ea0 [file] [log] [blame]
Frank Barchard0d1052c2020-03-23 17:28:13 -07001// Auto-generated file. Do not edit!
2// Template: src/f32-igemm/6x8-aarch64-neonfma-ios.S.in
3// Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios(
13# size_t mr, x0
14# size_t nc, x1
15# size_t kc, x2 / x0
16# size_t ks, x3 / x9
17# const float**restrict a, x4
18# const void*restrict w, x5
19# uint8_t*restrict c, x6
20# size_t cm_stride, x7
21# size_t cn_stride, [sp] -> (x0)
22# size_t a_offset, [sp + 8] -> x11
23# const float* zero, [sp + 16] -> x12
Marat Dukhaneb09a6b2020-04-08 17:34:32 -070024# const xnn_f32_minmax_params* params, [sp + 24] -> x8)
Frank Barchard0d1052c2020-03-23 17:28:13 -070025
26# d8-d15 need to be preserved if used.
27# x19-30 need to be preserved if used.
28
29# A pointers
30# x14 a0
31# x15 a1
32# x20 a2
33# x21 a3
34# x22 a4
35# x23 a5
36
37# C pointers
38# x6 c0
39# x16 c1
40# x17 c2
41# x10 c3
42# x13 c4
43# x7 c5
44
45# Vector register usage
46# A0 v0 v6
47# A1 v1 v7
48# A2 v2 v8
49# A3 v3 v9
50# A4 v4 v10
51# A5 v5 v11
52# B v12 v13 v14 v15
53# B v16 v17 v18 v19
54# C v20 v21
55# C v22 v23
56# C v24 v25
57# C v26 v27
58# C v28 v29
59# C v30 v31
60# Clamp v6 v7
61
62BEGIN_FUNCTION xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios
63
# Overview (derived from the code below):
# - Each pass of the nc loop (label 0) produces one tile of up to 6 rows x 8
#   floats, held in accumulators v20-v31 (two q registers per row).
# - The accumulators start from 8 floats read from w. Then, for every group of
#   6 A pointers (label 1, the ks loop), kc bytes of each row are multiplied
#   against the B data that follows in w and accumulated with FMLA.
# - The result is clamped with v6 (lower bound) and v7 (upper bound) and stored
#   through the six C pointers.
64 # Clamp C pointers / Save d8-d15 on stack
# A row whose index is >= mr reuses the previous row's C pointer (CSEL below).
65 STP d8, d9, [sp, -96]! // reserve 96 bytes; stack arguments are now at sp + 96 and up
66 CMP x0, 2 // if mr < 2
67 ADD x16, x6, x7 // c1 = c0 + cm_stride
68 CSEL x16, x6, x16, LO // c1 = c0
69
70 STP d10, d11, [sp, 16]
71 ADD x17, x16, x7 // c2 = c1 + cm_stride
72 // if mr <= 2 (reuses flags from CMP x0, 2)
73 CSEL x17, x16, x17, LS // c2 = c1
74
75 STP d12, d13, [sp, 32]
76 CMP x0, 4 // if mr < 4
77 ADD x10, x17, x7 // c3 = c2 + cm_stride
78 CSEL x10, x17, x10, LO // c3 = c2
79
80 STP d14, d15, [sp, 48]
81 ADD x13, x10, x7 // c4 = c3 + cm_stride
82 // if mr <= 4 (reuses flags from CMP x0, 4)
83 CSEL x13, x10, x13, LS // c4 = c3
84
85 # Save x20,x21,x22,x23 on stack
86 STP x20, x21, [sp, 64]
87 STP x22, x23, [sp, 80]
88
89 CMP x0, 6 // if mr < 6
90 ADD x7, x13, x7 // c5 = c4 + cm_stride
91 CSEL x7, x13, x7, LO // c5 = c4
92
93 # Load a_offset
94 LDR x11, [sp, 104]
95
96 # Load zero, clamping params pointer
97 LDP x12, x8, [sp, 112]
98
990:
100 # Load initial bias from w into accumulators
# All six rows start from the same 8 bias floats (v20/v21 copied to v22-v31).
101 LDP q20, q21, [x5], 32
102 MOV v22.16b, v20.16b
103 MOV v23.16b, v21.16b
104 MOV v24.16b, v20.16b
105 MOV v25.16b, v21.16b
106 MOV v26.16b, v20.16b
107 MOV v27.16b, v21.16b
108 MOV v28.16b, v20.16b
109 MOV v29.16b, v21.16b
110 MOV v30.16b, v20.16b
111 MOV v31.16b, v21.16b
112
113 MOV x9, x3 // p = ks
114
1151:
116 # Load next 6 A pointers
117 LDP x14, x15, [x4], 16
118 LDP x20, x21, [x4], 16
119 LDP x22, x23, [x4], 16
120
# A pointer equal to `zero` is used as-is; every other pointer gets a_offset
# added. CMP runs before ADD, so EQ refers to the pointer as loaded.
121 CMP x14, x12 // if a0 == zero
122 ADD x14, x14, x11 // a0 += a_offset
123 CSEL x14, x12, x14, EQ // a0 = (a0 == zero) ? zero : a0 + a_offset
124 CMP x15, x12 // if a1 == zero
125 ADD x15, x15, x11 // a1 += a_offset
126 CSEL x15, x12, x15, EQ // a1 = (a1 == zero) ? zero : a1 + a_offset
127 CMP x20, x12 // if a2 == zero
128 ADD x20, x20, x11 // a2 += a_offset
129 CSEL x20, x12, x20, EQ // a2 = (a2 == zero) ? zero : a2 + a_offset
130 CMP x21, x12 // if a3 == zero
131 ADD x21, x21, x11 // a3 += a_offset
132 CSEL x21, x12, x21, EQ // a3 = (a3 == zero) ? zero : a3 + a_offset
133 CMP x22, x12 // if a4 == zero
134 ADD x22, x22, x11 // a4 += a_offset
135 CSEL x22, x12, x22, EQ // a4 = (a4 == zero) ? zero : a4 + a_offset
136 CMP x23, x12 // if a5 == zero
137 ADD x23, x23, x11 // a5 += a_offset
138 CSEL x23, x12, x23, EQ // a5 = (a5 == zero) ? zero : a5 + a_offset
139
140 # Is there at least 8 floats (32 bytes) for prologue + epilogue?
141 SUBS x0, x2, 32 // k = kc - 32
142 B.LO 5f // kc < 32 bytes: remainder path only
143
144 # Prologue - loads for main loop of 96 FMA
145 LDR q0, [x14], 16
146 LDR q1, [x15], 16
147 LDR q2, [x20], 16
148 LDR q3, [x21], 16
149 LDR q4, [x22], 16
150 LDR q5, [x23], 16
151 LDP q12, q13, [x5], 32 // Fetch 3 B (4th deferred)
152 LDP q14, q15, [x5], 32
153 LDP q16, q17, [x5], 32
154
155 # Is there at least 8 floats (32 bytes) for main loop?
156 SUBS x0, x0, 32
157 B.LO 3f // kc < 64 bytes: go straight to the epilogue
158
159 # Main loop - 8 floats of A (32 bytes)
160 # 96 FMA + 12 LDR A + 8 LDP B
# On entry v0-v5 and v12-v17 are already loaded; each pass reloads them for
# the next pass (or for the epilogue) while it computes.
1612:
162 # First group of 4 A. 48 FMA.
163 FMLA v20.4s, v12.4s, v0.s[0]
164 LDP q18, q19, [x5], 32 // Load last B
165 FMLA v22.4s, v12.4s, v1.s[0]
166 FMLA v24.4s, v12.4s, v2.s[0]
167 FMLA v26.4s, v12.4s, v3.s[0]
168 FMLA v28.4s, v12.4s, v4.s[0]
169 FMLA v30.4s, v12.4s, v5.s[0]
170 FMLA v21.4s, v13.4s, v0.s[0]
171 FMLA v23.4s, v13.4s, v1.s[0]
172 FMLA v25.4s, v13.4s, v2.s[0]
173 FMLA v27.4s, v13.4s, v3.s[0]
174 FMLA v29.4s, v13.4s, v4.s[0]
175
176 FMLA v31.4s, v13.4s, v5.s[0]
177 FMLA v20.4s, v14.4s, v0.s[1]
178 FMLA v22.4s, v14.4s, v1.s[1]
179 FMLA v24.4s, v14.4s, v2.s[1]
180 FMLA v26.4s, v14.4s, v3.s[1]
181 FMLA v28.4s, v14.4s, v4.s[1]
182 FMLA v30.4s, v14.4s, v5.s[1]
183 FMLA v21.4s, v15.4s, v0.s[1]
184 FMLA v23.4s, v15.4s, v1.s[1]
185 FMLA v25.4s, v15.4s, v2.s[1]
186 LDR q6, [x14], 16 // Load next 6 A
187 FMLA v27.4s, v15.4s, v3.s[1]
188 FMLA v29.4s, v15.4s, v4.s[1]
189 FMLA v31.4s, v15.4s, v5.s[1]
190 LDR q7, [x15], 16
191
192 FMLA v20.4s, v16.4s, v0.s[2]
193 FMLA v22.4s, v16.4s, v1.s[2]
194 FMLA v24.4s, v16.4s, v2.s[2]
195 LDR q8, [x20], 16
196 FMLA v26.4s, v16.4s, v3.s[2]
197 FMLA v28.4s, v16.4s, v4.s[2]
198 FMLA v30.4s, v16.4s, v5.s[2]
199 LDR q9, [x21], 16
200 FMLA v21.4s, v17.4s, v0.s[2]
201 FMLA v23.4s, v17.4s, v1.s[2]
202 FMLA v25.4s, v17.4s, v2.s[2]
203 LDR q10, [x22], 16
204 FMLA v27.4s, v17.4s, v3.s[2]
205 FMLA v29.4s, v17.4s, v4.s[2]
206 FMLA v31.4s, v17.4s, v5.s[2]
207 LDR q11, [x23], 16
208
209 FMLA v20.4s, v18.4s, v0.s[3]
210 FMLA v22.4s, v18.4s, v1.s[3]
211 FMLA v24.4s, v18.4s, v2.s[3]
212 LDP q12, q13, [x5], 32 // Load 4 B
213 FMLA v26.4s, v18.4s, v3.s[3]
214 FMLA v28.4s, v18.4s, v4.s[3]
215 FMLA v30.4s, v18.4s, v5.s[3]
216 LDP q14, q15, [x5], 32
217 FMLA v21.4s, v19.4s, v0.s[3]
218 FMLA v23.4s, v19.4s, v1.s[3]
219 FMLA v25.4s, v19.4s, v2.s[3]
220 LDP q16, q17, [x5], 32
221 FMLA v27.4s, v19.4s, v3.s[3]
222 FMLA v29.4s, v19.4s, v4.s[3]
223 FMLA v31.4s, v19.4s, v5.s[3]
224 LDP q18, q19, [x5], 32
225
226 # Second group of 4 A. 48 FMA.
227 FMLA v20.4s, v12.4s, v6.s[0]
228 FMLA v22.4s, v12.4s, v7.s[0]
229 FMLA v24.4s, v12.4s, v8.s[0]
230 LDR q0, [x14], 16 // Load next 6 A
231 FMLA v26.4s, v12.4s, v9.s[0]
232 FMLA v28.4s, v12.4s, v10.s[0]
233 FMLA v30.4s, v12.4s, v11.s[0]
234 LDR q1, [x15], 16
235 FMLA v21.4s, v13.4s, v6.s[0]
236 FMLA v23.4s, v13.4s, v7.s[0]
237 FMLA v25.4s, v13.4s, v8.s[0]
238 LDR q2, [x20], 16
239 FMLA v27.4s, v13.4s, v9.s[0]
240 FMLA v29.4s, v13.4s, v10.s[0]
241 FMLA v31.4s, v13.4s, v11.s[0]
242 LDR q3, [x21], 16
243
244 FMLA v20.4s, v14.4s, v6.s[1]
245 FMLA v22.4s, v14.4s, v7.s[1]
246 FMLA v24.4s, v14.4s, v8.s[1]
247 LDR q4, [x22], 16
248 FMLA v26.4s, v14.4s, v9.s[1]
249 FMLA v28.4s, v14.4s, v10.s[1]
250 FMLA v30.4s, v14.4s, v11.s[1]
251 LDR q5, [x23], 16
252 FMLA v21.4s, v15.4s, v6.s[1]
253 FMLA v23.4s, v15.4s, v7.s[1]
254 FMLA v25.4s, v15.4s, v8.s[1]
255 LDP q12, q13, [x5], 32 // Load next 3 B (not last)
256 FMLA v27.4s, v15.4s, v9.s[1]
257 FMLA v29.4s, v15.4s, v10.s[1]
258 FMLA v31.4s, v15.4s, v11.s[1]
259 LDP q14, q15, [x5], 32
260
261 FMLA v20.4s, v16.4s, v6.s[2]
262 FMLA v22.4s, v16.4s, v7.s[2]
263 FMLA v24.4s, v16.4s, v8.s[2]
264 FMLA v26.4s, v16.4s, v9.s[2]
265 FMLA v28.4s, v16.4s, v10.s[2]
266 FMLA v30.4s, v16.4s, v11.s[2]
267 FMLA v21.4s, v17.4s, v6.s[2]
268 FMLA v23.4s, v17.4s, v7.s[2]
269 FMLA v25.4s, v17.4s, v8.s[2]
270 FMLA v27.4s, v17.4s, v9.s[2]
271 FMLA v29.4s, v17.4s, v10.s[2]
272 FMLA v31.4s, v17.4s, v11.s[2]
273 LDP q16, q17, [x5], 32
274
275 FMLA v20.4s, v18.4s, v6.s[3]
276 FMLA v22.4s, v18.4s, v7.s[3]
277 SUBS x0, x0, 32 // k -= 32; these flags are consumed by B.HS below
278 FMLA v24.4s, v18.4s, v8.s[3]
279 FMLA v26.4s, v18.4s, v9.s[3]
280 FMLA v28.4s, v18.4s, v10.s[3]
281 FMLA v30.4s, v18.4s, v11.s[3]
282 FMLA v21.4s, v19.4s, v6.s[3]
283 FMLA v23.4s, v19.4s, v7.s[3]
284 FMLA v25.4s, v19.4s, v8.s[3]
285 FMLA v27.4s, v19.4s, v9.s[3]
286 FMLA v29.4s, v19.4s, v10.s[3]
287 FMLA v31.4s, v19.4s, v11.s[3]
288 B.HS 2b // loop while the SUBS above did not borrow
289
290 # Epilogue - 8 floats of A (32 bytes)
291 # 96 FMA + 6 LDR A + 5 LDP B
292 # First block same as main loop. Second block has no preloads.
2933:
294 # First group of 4 A. 48 FMA.
295 FMLA v20.4s, v12.4s, v0.s[0]
296 LDP q18, q19, [x5], 32 // Load last B
297 FMLA v22.4s, v12.4s, v1.s[0]
298 FMLA v24.4s, v12.4s, v2.s[0]
299 FMLA v26.4s, v12.4s, v3.s[0]
300 FMLA v28.4s, v12.4s, v4.s[0]
301 FMLA v30.4s, v12.4s, v5.s[0]
302 FMLA v21.4s, v13.4s, v0.s[0]
303 FMLA v23.4s, v13.4s, v1.s[0]
304 FMLA v25.4s, v13.4s, v2.s[0]
305 FMLA v27.4s, v13.4s, v3.s[0]
306 FMLA v29.4s, v13.4s, v4.s[0]
307
308 FMLA v31.4s, v13.4s, v5.s[0]
309 FMLA v20.4s, v14.4s, v0.s[1]
310 FMLA v22.4s, v14.4s, v1.s[1]
311 FMLA v24.4s, v14.4s, v2.s[1]
312 FMLA v26.4s, v14.4s, v3.s[1]
313 FMLA v28.4s, v14.4s, v4.s[1]
314 FMLA v30.4s, v14.4s, v5.s[1]
315 FMLA v21.4s, v15.4s, v0.s[1]
316 FMLA v23.4s, v15.4s, v1.s[1]
317 FMLA v25.4s, v15.4s, v2.s[1]
318 LDR q6, [x14], 16 // Load next 6 A
319 FMLA v27.4s, v15.4s, v3.s[1]
320 FMLA v29.4s, v15.4s, v4.s[1]
321 FMLA v31.4s, v15.4s, v5.s[1]
322 LDR q7, [x15], 16
323
324 FMLA v20.4s, v16.4s, v0.s[2]
325 FMLA v22.4s, v16.4s, v1.s[2]
326 FMLA v24.4s, v16.4s, v2.s[2]
327 LDR q8, [x20], 16
328 FMLA v26.4s, v16.4s, v3.s[2]
329 FMLA v28.4s, v16.4s, v4.s[2]
330 FMLA v30.4s, v16.4s, v5.s[2]
331 LDR q9, [x21], 16
332 FMLA v21.4s, v17.4s, v0.s[2]
333 FMLA v23.4s, v17.4s, v1.s[2]
334 FMLA v25.4s, v17.4s, v2.s[2]
335 LDR q10, [x22], 16
336 FMLA v27.4s, v17.4s, v3.s[2]
337 FMLA v29.4s, v17.4s, v4.s[2]
338 FMLA v31.4s, v17.4s, v5.s[2]
339 LDR q11, [x23], 16
340
341 FMLA v20.4s, v18.4s, v0.s[3]
342 FMLA v22.4s, v18.4s, v1.s[3]
343 FMLA v24.4s, v18.4s, v2.s[3]
344 LDP q12, q13, [x5], 32 // Load 4 B
345 FMLA v26.4s, v18.4s, v3.s[3]
346 FMLA v28.4s, v18.4s, v4.s[3]
347 FMLA v30.4s, v18.4s, v5.s[3]
348 LDP q14, q15, [x5], 32
349 FMLA v21.4s, v19.4s, v0.s[3]
350 FMLA v23.4s, v19.4s, v1.s[3]
351 FMLA v25.4s, v19.4s, v2.s[3]
352 LDP q16, q17, [x5], 32
353 FMLA v27.4s, v19.4s, v3.s[3]
354 FMLA v29.4s, v19.4s, v4.s[3]
355 FMLA v31.4s, v19.4s, v5.s[3]
356 LDP q18, q19, [x5], 32
357
358 # Second group of 4 A. 48 FMA.
359 FMLA v20.4s, v12.4s, v6.s[0]
360 FMLA v22.4s, v12.4s, v7.s[0]
361 FMLA v24.4s, v12.4s, v8.s[0]
362 FMLA v26.4s, v12.4s, v9.s[0]
363 FMLA v28.4s, v12.4s, v10.s[0]
364 FMLA v30.4s, v12.4s, v11.s[0]
365 FMLA v21.4s, v13.4s, v6.s[0]
366 FMLA v23.4s, v13.4s, v7.s[0]
367 FMLA v25.4s, v13.4s, v8.s[0]
368 FMLA v27.4s, v13.4s, v9.s[0]
369 FMLA v29.4s, v13.4s, v10.s[0]
370 FMLA v31.4s, v13.4s, v11.s[0]
371
372 FMLA v20.4s, v14.4s, v6.s[1]
373 FMLA v22.4s, v14.4s, v7.s[1]
374 FMLA v24.4s, v14.4s, v8.s[1]
375 FMLA v26.4s, v14.4s, v9.s[1]
376 FMLA v28.4s, v14.4s, v10.s[1]
377 FMLA v30.4s, v14.4s, v11.s[1]
378 FMLA v21.4s, v15.4s, v6.s[1]
379 FMLA v23.4s, v15.4s, v7.s[1]
380 FMLA v25.4s, v15.4s, v8.s[1]
381 FMLA v27.4s, v15.4s, v9.s[1]
382 FMLA v29.4s, v15.4s, v10.s[1]
383 FMLA v31.4s, v15.4s, v11.s[1]
384
385 FMLA v20.4s, v16.4s, v6.s[2]
386 FMLA v22.4s, v16.4s, v7.s[2]
387 FMLA v24.4s, v16.4s, v8.s[2]
388 FMLA v26.4s, v16.4s, v9.s[2]
389 FMLA v28.4s, v16.4s, v10.s[2]
390 FMLA v30.4s, v16.4s, v11.s[2]
391 FMLA v21.4s, v17.4s, v6.s[2]
392 FMLA v23.4s, v17.4s, v7.s[2]
393 FMLA v25.4s, v17.4s, v8.s[2]
394 FMLA v27.4s, v17.4s, v9.s[2]
395 FMLA v29.4s, v17.4s, v10.s[2]
396 FMLA v31.4s, v17.4s, v11.s[2]
397
398 FMLA v20.4s, v18.4s, v6.s[3]
399 FMLA v22.4s, v18.4s, v7.s[3]
400 FMLA v24.4s, v18.4s, v8.s[3]
401 FMLA v26.4s, v18.4s, v9.s[3]
402 FMLA v28.4s, v18.4s, v10.s[3]
403 FMLA v30.4s, v18.4s, v11.s[3]
404 FMLA v21.4s, v19.4s, v6.s[3]
405 FMLA v23.4s, v19.4s, v7.s[3]
406
Marat Dukhaneb09a6b2020-04-08 17:34:32 -0700407 # Load min/max values: v6 = lower bound, v7 = upper bound (see Clamp)
# v6/v7 were last read as A0/A1 by the two FMLA just above, so they are free
# to be overwritten; the remaining FMLA only read v8-v11.
Frank Barchard0d1052c2020-03-23 17:28:13 -0700408 LD2R {v6.4s, v7.4s}, [x8]
409
410 FMLA v25.4s, v19.4s, v8.s[3]
411 FMLA v27.4s, v19.4s, v9.s[3]
412 # Is there a remainder?- 4 floats of A (16 bytes) or less
413 TST x0, 31 // low 5 bits of k equal kc mod 32 (only multiples of 32 were subtracted)
414 FMLA v29.4s, v19.4s, v10.s[3]
415 FMLA v31.4s, v19.4s, v11.s[3]
416 B.NE 5f
417
4184:
419 # ks loop
420 SUBS x9, x9, 48 // ks -= MR * sizeof(void*)
421 B.HI 1b
422
423 # Clamp
Marat Dukhana51cf482020-04-08 16:16:19 -0700424 FMAX v20.4s, v20.4s, v6.4s
425 FMAX v21.4s, v21.4s, v6.4s
426 FMAX v22.4s, v22.4s, v6.4s
427 FMAX v23.4s, v23.4s, v6.4s
428 FMAX v24.4s, v24.4s, v6.4s
429 FMAX v25.4s, v25.4s, v6.4s
430 FMAX v26.4s, v26.4s, v6.4s
431 FMAX v27.4s, v27.4s, v6.4s
432 FMAX v28.4s, v28.4s, v6.4s
433 FMAX v29.4s, v29.4s, v6.4s
434 FMAX v30.4s, v30.4s, v6.4s
435 FMAX v31.4s, v31.4s, v6.4s
Frank Barchard0d1052c2020-03-23 17:28:13 -0700436 # Load cn_stride
437 LDR x0, [sp, 96]
Marat Dukhana51cf482020-04-08 16:16:19 -0700438 FMIN v20.4s, v20.4s, v7.4s
439 FMIN v21.4s, v21.4s, v7.4s
440 FMIN v22.4s, v22.4s, v7.4s
441 FMIN v23.4s, v23.4s, v7.4s
442 FMIN v24.4s, v24.4s, v7.4s
443 FMIN v25.4s, v25.4s, v7.4s
444 FMIN v26.4s, v26.4s, v7.4s
445 FMIN v27.4s, v27.4s, v7.4s
446 FMIN v28.4s, v28.4s, v7.4s
447 FMIN v29.4s, v29.4s, v7.4s
448 FMIN v30.4s, v30.4s, v7.4s
449 FMIN v31.4s, v31.4s, v7.4s
Frank Barchard0d1052c2020-03-23 17:28:13 -0700450
451 # Store full 6 x 8
452 SUBS x1, x1, 8 // nc -= 8
453 B.LO 8f // fewer than 8 columns left: partial store
454
455 STP q30, q31, [x7]
456 ADD x7, x7, x0
457 STP q28, q29, [x13]
458 ADD x13, x13, x0
459 STP q26, q27, [x10]
460 ADD x10, x10, x0
461 STP q24, q25, [x17]
462 ADD x17, x17, x0
463 STP q22, q23, [x16]
464 ADD x16, x16, x0
465 STP q20, q21, [x6]
466 ADD x6, x6, x0
467
468 SUB x4, x4, x3 // a -= ks
469
470 # nc loop
# Flags are still those of SUBS x1, x1, 8 above: the STP/ADD/SUB in between
# do not set flags.
471 B.HI 0b
472
473 # Restore x20,x21,x22,x23 from stack
474 LDP x22, x23, [sp, 80]
475 LDP x20, x21, [sp, 64]
476
477 # Restore d8-d15 from stack
478 LDP d14, d15, [sp, 48]
479 LDP d12, d13, [sp, 32]
480 LDP d10, d11, [sp, 16]
481 LDP d8, d9, [sp], 96
482 RET
483
# Remainder handling: reached when kc < 32 bytes or when kc is not a multiple
# of 32 bytes. Bits 4, 3 and 2 of x0 select the 16-, 8- and 4-byte steps.
4845:
Marat Dukhaneb09a6b2020-04-08 17:34:32 -0700485 # Load min/max values
Frank Barchard0d1052c2020-03-23 17:28:13 -0700486 LD2R {v6.4s, v7.4s}, [x8]
487
488 # Is there a remainder?- 4 floats of A (16 bytes)
489 TBZ x0, 4, 6f
490
491 # Remainder- 4 floats of A (16 bytes)
492 # Load A
493 LDR q0, [x14], 16
494 LDR q1, [x15], 16
495 LDR q2, [x20], 16
496 LDR q3, [x21], 16
497 LDR q4, [x22], 16
498 LDR q5, [x23], 16
499 # Load B
500 LDP q12, q13, [x5], 32
501 LDP q14, q15, [x5], 32
502 LDP q16, q17, [x5], 32
503 LDP q18, q19, [x5], 32
504
505 FMLA v20.4s, v12.4s, v0.s[0]
506 FMLA v22.4s, v12.4s, v1.s[0]
507 FMLA v24.4s, v12.4s, v2.s[0]
508 FMLA v26.4s, v12.4s, v3.s[0]
509 FMLA v28.4s, v12.4s, v4.s[0]
510 FMLA v30.4s, v12.4s, v5.s[0]
511 FMLA v21.4s, v13.4s, v0.s[0]
512 FMLA v23.4s, v13.4s, v1.s[0]
513 FMLA v25.4s, v13.4s, v2.s[0]
514 FMLA v27.4s, v13.4s, v3.s[0]
515 FMLA v29.4s, v13.4s, v4.s[0]
516 FMLA v31.4s, v13.4s, v5.s[0]
517
518 FMLA v20.4s, v14.4s, v0.s[1]
519 FMLA v22.4s, v14.4s, v1.s[1]
520 FMLA v24.4s, v14.4s, v2.s[1]
521 FMLA v26.4s, v14.4s, v3.s[1]
522 FMLA v28.4s, v14.4s, v4.s[1]
523 FMLA v30.4s, v14.4s, v5.s[1]
524 FMLA v21.4s, v15.4s, v0.s[1]
525 FMLA v23.4s, v15.4s, v1.s[1]
526 FMLA v25.4s, v15.4s, v2.s[1]
527 FMLA v27.4s, v15.4s, v3.s[1]
528 FMLA v29.4s, v15.4s, v4.s[1]
529 FMLA v31.4s, v15.4s, v5.s[1]
530
531 FMLA v20.4s, v16.4s, v0.s[2]
532 FMLA v22.4s, v16.4s, v1.s[2]
533 FMLA v24.4s, v16.4s, v2.s[2]
534 FMLA v26.4s, v16.4s, v3.s[2]
535 FMLA v28.4s, v16.4s, v4.s[2]
536 FMLA v30.4s, v16.4s, v5.s[2]
537 FMLA v21.4s, v17.4s, v0.s[2]
538 FMLA v23.4s, v17.4s, v1.s[2]
539 FMLA v25.4s, v17.4s, v2.s[2]
540 FMLA v27.4s, v17.4s, v3.s[2]
541 FMLA v29.4s, v17.4s, v4.s[2]
542 FMLA v31.4s, v17.4s, v5.s[2]
543
544 FMLA v20.4s, v18.4s, v0.s[3]
545 FMLA v22.4s, v18.4s, v1.s[3]
546 FMLA v24.4s, v18.4s, v2.s[3]
547 FMLA v26.4s, v18.4s, v3.s[3]
548 FMLA v28.4s, v18.4s, v4.s[3]
549 FMLA v30.4s, v18.4s, v5.s[3]
550 FMLA v21.4s, v19.4s, v0.s[3]
551 FMLA v23.4s, v19.4s, v1.s[3]
552 FMLA v25.4s, v19.4s, v2.s[3]
553 FMLA v27.4s, v19.4s, v3.s[3]
554 FMLA v29.4s, v19.4s, v4.s[3]
555 FMLA v31.4s, v19.4s, v5.s[3]
556
557 # Is there a remainder?- 2 floats of A (8 bytes)
5586:
559 TBZ x0, 3, 7f
560
561 # Remainder- 2 floats of A (8 bytes)
562 # Load A
563 LDR d0, [x14], 8
564 LDR d1, [x15], 8
565 LDR d2, [x20], 8
566 LDR d3, [x21], 8
567 LDR d4, [x22], 8
568 LDR d5, [x23], 8
569 # Load B
570 LDP q12, q13, [x5], 32
571 LDP q14, q15, [x5], 32
572
573 FMLA v20.4s, v12.4s, v0.s[0]
574 FMLA v22.4s, v12.4s, v1.s[0]
575 FMLA v24.4s, v12.4s, v2.s[0]
576 FMLA v26.4s, v12.4s, v3.s[0]
577 FMLA v28.4s, v12.4s, v4.s[0]
578 FMLA v30.4s, v12.4s, v5.s[0]
579 FMLA v21.4s, v13.4s, v0.s[0]
580 FMLA v23.4s, v13.4s, v1.s[0]
581 FMLA v25.4s, v13.4s, v2.s[0]
582 FMLA v27.4s, v13.4s, v3.s[0]
583 FMLA v29.4s, v13.4s, v4.s[0]
584 FMLA v31.4s, v13.4s, v5.s[0]
585
586 FMLA v20.4s, v14.4s, v0.s[1]
587 FMLA v22.4s, v14.4s, v1.s[1]
588 FMLA v24.4s, v14.4s, v2.s[1]
589 FMLA v26.4s, v14.4s, v3.s[1]
590 FMLA v28.4s, v14.4s, v4.s[1]
591 FMLA v30.4s, v14.4s, v5.s[1]
592 FMLA v21.4s, v15.4s, v0.s[1]
593 FMLA v23.4s, v15.4s, v1.s[1]
594 FMLA v25.4s, v15.4s, v2.s[1]
595 FMLA v27.4s, v15.4s, v3.s[1]
596 FMLA v29.4s, v15.4s, v4.s[1]
597 FMLA v31.4s, v15.4s, v5.s[1]
598
599 # Is there a remainder?- 1 float of A (4 bytes)
6007:
601 TBZ x0, 2, 4b
602
603 # Remainder- 1 float of A (4 bytes)
604 # Load A
605 LDR s0, [x14], 4
606 LDR s1, [x15], 4
607 LDR s2, [x20], 4
608 LDR s3, [x21], 4
609 LDR s4, [x22], 4
610 LDR s5, [x23], 4
611 # Load B
612 LDP q12, q13, [x5], 32
613
614 FMLA v20.4s, v12.4s, v0.s[0]
615 FMLA v22.4s, v12.4s, v1.s[0]
616 FMLA v24.4s, v12.4s, v2.s[0]
617 FMLA v26.4s, v12.4s, v3.s[0]
618 FMLA v28.4s, v12.4s, v4.s[0]
619 FMLA v30.4s, v12.4s, v5.s[0]
620 FMLA v21.4s, v13.4s, v0.s[0]
621 FMLA v23.4s, v13.4s, v1.s[0]
622 FMLA v25.4s, v13.4s, v2.s[0]
623 FMLA v27.4s, v13.4s, v3.s[0]
624 FMLA v29.4s, v13.4s, v4.s[0]
625 FMLA v31.4s, v13.4s, v5.s[0]
626 B 4b
627
628 # Store odd width
# x1 holds nc - 8 here; its low three bits equal those of nc, so bits 2, 1
# and 0 select stores of 4, 2 and 1 floats per row.
6298:
630 TBZ x1, 2, 9f
631 STR q30, [x7], 16
632 MOV v30.16b, v31.16b // remaining columns move into the register stored next
633 STR q28, [x13], 16
634 MOV v28.16b, v29.16b
635 STR q26, [x10], 16
636 MOV v26.16b, v27.16b
637 STR q24, [x17], 16
638 MOV v24.16b, v25.16b
639 STR q22, [x16], 16
640 MOV v22.16b, v23.16b
641 STR q20, [x6], 16
642 MOV v20.16b, v21.16b
6439:
644 TBZ x1, 1, 10f
645 STR d30, [x7], 8
646 DUP d30, v30.d[1] // upper two floats move down for the 1-float store
647 STR d28, [x13], 8
648 DUP d28, v28.d[1]
649 STR d26, [x10], 8
650 DUP d26, v26.d[1]
651 STR d24, [x17], 8
652 DUP d24, v24.d[1]
653 STR d22, [x16], 8
654 DUP d22, v22.d[1]
655 STR d20, [x6], 8
656 DUP d20, v20.d[1]
657
65810:
659 TBZ x1, 0, 11f
660 STR s30, [x7]
661 STR s28, [x13]
662 STR s26, [x10]
663 STR s24, [x17]
664 STR s22, [x16]
665 STR s20, [x6]
66611:
667 # Restore x20,x21,x22,x23 from stack
668 LDP x22, x23, [sp, 80]
669 LDP x20, x21, [sp, 64]
670
671 # Restore d8-d15 from stack
672 LDP d14, d15, [sp, 48]
673 LDP d12, d13, [sp, 32]
674 LDP d10, d11, [sp, 16]
675 LDP d8, d9, [sp], 96
676 RET
677
678END_FUNCTION xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios
679
680#ifdef __ELF__
681.section ".note.GNU-stack","",%progbits
682#endif