blob: b27691c7e69654b959160ff075c43b7d49cf1a37 [file] [log] [blame]
// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/5x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
XNNPACK Teamb455b122019-09-27 18:10:33 -07009
10#include <xnnpack/assembly.h>
11
# void xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8
XNNPACK Teamb455b122019-09-27 18:10:33 -070025
# 5x8 strips the following out of 6x8
#  x23 a5
#  x7 c5  x13 unused
#  A5  v10 v11
#  C   v30 v31
31
# d8-d15 need to be preserved if used.
# x19-x30 need to be preserved if used.  x18 is reserved for OS.
34
# A pointers
# x14 a0
# x15 a1
# x20 a2
# x21 a3
#  x8 a4

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x13 c3
#  x7 c4
48
# Vector register usage
# A0   v0  v1
# A1   v2  v3
# A2   v4  v5
# A3   v6  v7
# A4   v8  v9
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C0  v20 v21
# C1  v22 v23
# C2  v24 v25
# C3  v26 v27
# C4  v28 v29
# Clamp v30 (lower bound, FMAX) v31 (upper bound, FMIN)
63
BEGIN_FUNCTION xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75

        # Overview
        #   Computes a 5-row x 8-column tile of C as bias + sum(A * B), clamps it
        #   to [v30, v31] and stores it.  Three nested loops:
        #     0:  nc loop - one 8-column tile of C per iteration
        #     1:  ks loop - one group of 5 A row pointers per iteration
        #     2:  kc loop - 8 floats of every A row per iteration
        #   Stack frame (64 bytes): [sp, 0] d8,d9   [sp, 16] d12,d13
        #                           [sp, 32] d14,d15  [sp, 48] x20,x21
        #   so the caller's stack arguments start at [sp, 64].

        # Clamp C pointers / Save d8-d15 on stack
        # Rows >= mr alias the previous row, so out-of-range rows store to valid memory.
        STP d8, d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d12, d13, [sp, 16]
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d14, d15, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x13, x17, x7         // c3 = c2 + cm_stride
        CSEL x13, x17, x13, LO   //   c3 = c2

        # Load zero, clamping params pointer
        LDP x12, x8, [sp, 80]
        ADD x7, x13, x7          // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x7, x13, x7, LS     //   c4 = c3

        # Save x20,x21 on stack
        STP x20, x21, [sp, 48]

        # Load clamp values
        # Two consecutive floats at [x8], each broadcast to all 4 lanes:
        # v30 = first (lower bound, used by FMAX), v31 = second (upper bound, used by FMIN).
        # x8 is free for reuse as the a4 pointer after this load.
        LD2R {v30.4s, v31.4s}, [x8]

        # Load cn_stride, a_offset
        LDP x10, x11, [sp, 64]

0:
        # Load initial bias from w into accumulators
        # All 5 rows start from the same 8 bias values.
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 64]
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 192]

        MOV x9, x3               // p = ks

1:
        # Load next 5 A pointers
        LDP x14, x15, [x4], 16
        LDP x20, x21, [x4], 16
        LDR x8, [x4], 8

        # Pointers equal to `zero` are used as-is; all others get a_offset added.
        CMP x14, x12             // if a0 == zero
        ADD x14, x14, x11        // a0 += a_offset
        CSEL x14, x12, x14, EQ   //   a0 = zero, else a0 + a_offset
        CMP x15, x12             // if a1 == zero
        ADD x15, x15, x11        // a1 += a_offset
        CSEL x15, x12, x15, EQ   //   a1 = zero, else a1 + a_offset
        CMP x20, x12             // if a2 == zero
        ADD x20, x20, x11        // a2 += a_offset
        CSEL x20, x12, x20, EQ   //   a2 = zero, else a2 + a_offset
        CMP x21, x12             // if a3 == zero
        ADD x21, x21, x11        // a3 += a_offset
        CSEL x21, x12, x21, EQ   //   a3 = zero, else a3 + a_offset
        CMP x8, x12              // if a4 == zero
        ADD x8, x8, x11          // a4 += a_offset
        CSEL x8, x12, x8, EQ     //   a4 = zero, else a4 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32          // k = kc - 32
        B.LO 5f

        # Prologue - loads for main loop of 80 FMA
        LDR q0, [x14], 16
        LDR q2, [x15], 16
        LDR q4, [x20], 16
        LDR q6, [x21], 16
        LDR q8, [x8], 16
        LDP q12, q13, [x5], 32   // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A (32 bytes)
        # 80 FMA + 10 LDR A + 8 LDP B
        # Loads for the next step are interleaved with the FMAs of the current one.
2:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        PRFM PLDL1KEEP, [x5, 128]  // Prefetch B
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x14], 16        // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x15], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x20], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x21], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x8], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        LDP q14, q15, [x5], 32
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        LDR q0, [x14], 16        // Load next 5 A
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        LDR q2, [x15], 16
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        LDR q4, [x20], 16
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        LDR q6, [x21], 16
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        LDR q8, [x8], 16
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        LDP q12, q13, [x5], 32   // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        LDP q14, q15, [x5], 32
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 32          // k -= 32; flags consumed by B.HS below
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        B.HS 2b

        # Epilogue - 8 floats of A (32 bytes)
        # 80 FMA + 5 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
3:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        PRFM PLDL1KEEP, [x5, 128]  // Prefetch B
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x14], 16        // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x15], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x20], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x21], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x8], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        LDP q14, q15, [x5], 32
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        # Only multiples of 32 were subtracted from kc, so x0 & 31 == kc & 31.
        TST x0, 31
        B.NE 5f

4:
        # ks loop
        SUBS x9, x9, 40          // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        FMAX v20.4s, v20.4s, v30.4s
        FMAX v21.4s, v21.4s, v30.4s
        FMAX v22.4s, v22.4s, v30.4s
        FMAX v23.4s, v23.4s, v30.4s
        FMAX v24.4s, v24.4s, v30.4s
        FMAX v25.4s, v25.4s, v30.4s
        FMAX v26.4s, v26.4s, v30.4s
        FMAX v27.4s, v27.4s, v30.4s
        FMAX v28.4s, v28.4s, v30.4s
        FMAX v29.4s, v29.4s, v30.4s
        FMIN v20.4s, v20.4s, v31.4s
        FMIN v21.4s, v21.4s, v31.4s
        FMIN v22.4s, v22.4s, v31.4s
        FMIN v23.4s, v23.4s, v31.4s
        FMIN v24.4s, v24.4s, v31.4s
        FMIN v25.4s, v25.4s, v31.4s
        FMIN v26.4s, v26.4s, v31.4s
        FMIN v27.4s, v27.4s, v31.4s
        FMIN v28.4s, v28.4s, v31.4s
        FMIN v29.4s, v29.4s, v31.4s

        # Store full 5 x 8
        # Flags from this SUBS also drive the nc-loop B.HI below; nothing in between sets flags.
        SUBS x1, x1, 8
        B.LO 8f

        STP q28, q29, [x7]
        ADD x7, x7, x10
        STP q26, q27, [x13]
        ADD x13, x13, x10
        STP q24, q25, [x17]
        ADD x17, x17, x10
        STP q22, q23, [x16]
        ADD x16, x16, x10
        STP q20, q21, [x6]
        ADD x6, x6, x10

        SUB x4, x4, x3           // a -= ks

        # nc loop
        B.HI 0b

        # Restore x20,x21 from stack
        LDP x20, x21, [sp, 48]

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

5:
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 6f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x14], 16
        LDR q2, [x15], 16
        LDR q4, [x20], 16
        LDR q6, [x21], 16
        LDR q8, [x8], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
6:
        TBZ x0, 3, 7f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x14], 8
        LDR d2, [x15], 8
        LDR d4, [x20], 8
        LDR d6, [x21], 8
        LDR d8, [x8], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
7:
        TBZ x0, 2, 4b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x14], 4
        LDR s2, [x15], 4
        LDR s4, [x20], 4
        LDR s6, [x21], 4
        LDR s8, [x8], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        B 4b

        # Store odd width
        # x1 = nc - 8 here (negative); its low 3 bits still equal the remaining
        # column count, so bits 2, 1, 0 select 4-, 2- and 1-column stores.
8:
        TBZ x1, 2, 9f
        STR q28, [x7], 16
        MOV v28.16b, v29.16b     // move columns 4-7 down for the next partial store
        STR q26, [x13], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
9:
        TBZ x1, 1, 10f
        STR d28, [x7], 8
        DUP d28, v28.d[1]        // move the upper 2 floats down for the 1-column store
        STR d26, [x13], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s28, [x7]
        STR s26, [x13]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
11:
        # Restore x20,x21 from stack
        LDP x20, x21, [sp, 48]

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75
603
#ifdef __ELF__
// Empty-flag .note.GNU-stack section: tells GNU toolchains this object does
// not need an executable stack.
.section ".note.GNU-stack","",%progbits
#endif