// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-ios.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 8] -> x8

# Notes on the arguments, as used by the code below:
#   kc is a byte count: it is decremented in steps of 32 per 8 floats and
#   subtracted directly from the A pointers.
#   nc is a column (float) count: it is decremented by 8 per 6x8 tile stored.
#   x0 holds mr only while the row pointers are clamped; afterwards it is
#   reused as the remaining-k byte counter and finally reloaded with cn_stride.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7   (v6 is the FMAX operand, v7 is the FMIN operand)

# IOS microkernel is based on Cortex-A75 kernel but avoids X18 by
# using X14 instead of X18, and reloading cn_stride into x0.

BEGIN_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios

        # Clamp A and C pointers / Save d8-d15 on stack
        # Rows at index >= mr alias the previous row, so the kernel always
        # processes 6 rows without touching memory outside the mr valid rows.
        STP d8, d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        # Load params pointer
        # (caller's [sp + 8]; sp was lowered by 64 above, hence offset 72)
        LDR x8, [sp, 72]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32          // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 96 FMA
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        LDP q12, q13, [x5], 32   // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 12 LDR A + 8 LDP B
1:
        # First group of 4 A. 48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16         // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A. 48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        LDR q0, [x3], 16         // Load next 6 A
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR q1, [x9], 16
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        LDR q2, [x10], 16
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR q3, [x11], 16

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        LDR q4, [x12], 16
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR q5, [x4], 16
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        LDP q12, q13, [x5], 32   // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        SUBS x0, x0, 32          // k -= 32; the FMLAs below leave the flags intact
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]
        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop. Second block has no preloads.
2:
        # First group of 4 A. 48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16         // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A. 48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]

        # Load min/max values
        # v6 and v7 are no longer needed as A0/A1 operands past this point,
        # so they are reused for the two clamp bounds.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        # Clamp
3:
        FMAX v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8           // nc -= 8; flags are consumed by B.LO / B.HI below
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        # Load cn_stride
        # (caller's [sp]; sp was lowered by 64 on entry, hence offset 64)
        LDR x0, [sp, 64]
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 7f                  // fewer than 8 columns left: partial store

        STP q20, q21, [x6]
        ADD x6, x6, x0
        SUB x3, x3, x2           // a0 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x0
        SUB x9, x9, x2           // a1 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x0
        SUB x10, x10, x2         // a2 -= kc
        STP q26, q27, [x14]
        ADD x14, x14, x0
        SUB x11, x11, x2         // a3 -= kc
        STP q28, q29, [x13]
        ADD x13, x13, x0
        SUB x12, x12, x2         // a4 -= kc
        STP q30, q31, [x7]
        ADD x7, x7, x0
        SUB x4, x4, x2           // a5 -= kc

        B.HI 0b                  // columns remain (flags from SUBS x1): next tile

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

4:
        # Load min/max values
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B 3b

        # Store odd width
        # x1 holds (remaining nc - 8) here; its low three bits equal those of
        # the remaining column count, so bits 2/1/0 select 4/2/1 float stores.
7:
        TBZ x1, 2, 8f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
8:
        TBZ x1, 1, 9f
        STR d20, [x6], 8
        DUP d20, v20.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26, [x14], 8
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d30, [x7], 8
        DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        STR s20, [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x14]
        STR s28, [x13]
        STR s30, [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios

// Mark the stack as non-executable on ELF targets.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif