// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# unused compared to 6x8
#  x4 a5
#  x7 c5
# A5  v10 v11
# C   v30 v31

# d8-d15 need to be preserved if used.
# x19-x30 need to be preserved if used.  x18 is reserved for the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x13 c3
#  x7 c4

# Vector register usage
# A0   v0  v1
# A1   v2  v3
# A2   v4  v5
# A3   v6  v7
# A4   v8  v9
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# Clamp v30 v31

// xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75
//
// Produces a tile of up to 5 rows x 8 columns of C per iteration of the
// outer nc loop.  For each tile the ten accumulators (v20-v29, two per row)
// are seeded with the 8 floats at w, updated with FMLA over kc bytes of each
// A row against the packed B data that follows in w, clamped with
// FMIN v30 / FMAX v31, and stored through the row pointers c0-c4.
//
// Row pointers for rows at or beyond mr are aliased to the previous row, so
// all five rows are always computed and stored; the surplus rows simply
// repeat the last valid row.
//
// Register roles inside the body:
//   x0        k counter (bytes of A still to process), derived from kc (x2)
//   x1        nc, decremented by 8 per stored tile
//   x3, x9-x12   a0-a4       x6, x16, x17, x13, x7   c0-c4
//   x5        w (bias + packed B), post-incremented
//   x14       cn_stride      x8   params pointer
// Stack: 48 bytes holding d8-d9 and d12-d15.
BEGIN_FUNCTION xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75

        # Clamp A and C pointers / Save d8-d9, d12-d15 on stack
        # (v10/v11 are never written by this kernel, so d10-d11 are not saved)
        STP d8, d9, [sp, -48]!
        CMP x0, 2                       // if mr < 2
        ADD x9, x3, x4                  // a1 = a0 + a_stride
        ADD x16, x6, x7                 // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO             //   a1 = a0
        CSEL x16, x6, x16, LO           //   c1 = c0

        STP d12, d13, [sp, 16]
        ADD x10, x9, x4                 // a2 = a1 + a_stride
        ADD x17, x16, x7                // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS           //   a2 = a1
        CSEL x17, x16, x17, LS          //   c2 = c1

        STP d14, d15, [sp, 32]
        CMP x0, 4                       // if mr < 4
        ADD x11, x10, x4                // a3 = a2 + a_stride
        ADD x13, x17, x7                // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO          //   a3 = a2
        CSEL x13, x17, x13, LO          //   c3 = c2

        # Load params pointer
        # (caller's [sp + 8]; sp was lowered by 48 above, LDR leaves flags intact)
        LDR x8, [sp, 56]

        ADD x12, x11, x4                // a4 = a3 + a_stride
        ADD x7, x13, x7                 // c4 = c3 + cm_stride
                                        // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS          //   a4 = a3
        CSEL x7, x13, x7, LS            //   c4 = c3

        # Load clamp values
        # v30 = first float at params (used by FMIN), v31 = second (used by FMAX)
        LD2R {v30.4s, v31.4s}, [x8]

        # Load cn_stride (caller's [sp])
        LDR x14, [sp, 48]

0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 0]         // Prefetch B
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x3]            // Prefetch A
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x9]
        MOV v28.16b, v20.16b
        PRFM PLDL1KEEP, [x10]
        MOV v29.16b, v21.16b
        PRFM PLDL1KEEP, [x11]
        PRFM PLDL1KEEP, [x12]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32                 // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 80 FMA
        LDR q0, [x3], 16
        LDR q2, [x9], 16
        LDR q4, [x10], 16
        LDR q6, [x11], 16
        LDR q8, [x12], 16
        LDP q12, q13, [x5], 32          // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 80 FMA + 10 LDR A + 8 LDP B
1:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32          // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        PRFM PLDL1KEEP, [x5, 128]       // Prefetch B
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x3], 16                // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x9], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x10], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x11], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x12], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32          // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        LDP q14, q15, [x5], 32
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        LDP q16, q17, [x5], 32
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        LDR q0, [x3], 16                // Load next 5 A
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        LDR q2, [x9], 16
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        LDR q4, [x10], 16
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        LDR q6, [x11], 16
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        LDR q8, [x12], 16
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        LDP q12, q13, [x5], 32          // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        LDP q14, q15, [x5], 32
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 32                 // k -= 32; flags consumed by B.HS below
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 80 FMA + 5 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32          // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        PRFM PLDL1KEEP, [x5, 128]       // Prefetch B
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x3], 16                // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x9], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x10], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x11], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x12], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32          // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        LDP q14, q15, [x5], 32
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        LDP q16, q17, [x5], 32
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]
        TST x0, 31                      // any kc remainder (low 5 bits of k)?

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        B.NE 4f

        # Clamp
3:
        FMIN v20.4s, v20.4s, v30.4s
        FMIN v21.4s, v21.4s, v30.4s
        FMIN v22.4s, v22.4s, v30.4s
        FMIN v23.4s, v23.4s, v30.4s
        FMIN v24.4s, v24.4s, v30.4s
        FMIN v25.4s, v25.4s, v30.4s
        FMIN v26.4s, v26.4s, v30.4s
        FMIN v27.4s, v27.4s, v30.4s
        FMIN v28.4s, v28.4s, v30.4s
        FMIN v29.4s, v29.4s, v30.4s
        FMAX v20.4s, v20.4s, v31.4s
        FMAX v21.4s, v21.4s, v31.4s
        FMAX v22.4s, v22.4s, v31.4s
        FMAX v23.4s, v23.4s, v31.4s
        FMAX v24.4s, v24.4s, v31.4s
        FMAX v25.4s, v25.4s, v31.4s
        FMAX v26.4s, v26.4s, v31.4s
        FMAX v27.4s, v27.4s, v31.4s
        FMAX v28.4s, v28.4s, v31.4s
        FMAX v29.4s, v29.4s, v31.4s

        # Store full 5 x 8
        CMP x1, 8
        B.LO 7f

        STP q20, q21, [x6]
        ADD x6, x6, x14                 // c0 += cn_stride
        SUB x3, x3, x2                  // a0 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x14               // c1 += cn_stride
        SUB x9, x9, x2                  // a1 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x14               // c2 += cn_stride
        SUB x10, x10, x2                // a2 -= kc
        STP q26, q27, [x13]
        ADD x13, x13, x14               // c3 += cn_stride
        SUB x11, x11, x2                // a3 -= kc
        STP q28, q29, [x7]
        ADD x7, x7, x14                 // c4 += cn_stride
        SUB x12, x12, x2                // a4 -= kc

        SUBS x1, x1, 8                  // nc -= 8
        B.HI 0b

        # Restore d8-d9, d12-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

        # kc remainder.  Reached with k = kc - 32 (kc < 32) or after the
        # epilogue; in both cases the low 5 bits of x0 equal kc mod 32.
4:
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q2, [x9], 16
        LDR q4, [x10], 16
        LDR q6, [x11], 16
        LDR q8, [x12], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d2, [x9], 8
        LDR d4, [x10], 8
        LDR d6, [x11], 8
        LDR d8, [x12], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s2, [x9], 4
        LDR s4, [x10], 4
        LDR s6, [x11], 4
        LDR s8, [x12], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        B 3b

        # Store odd width (nc < 8): bits 2, 1, 0 of nc select 4, 2, 1 floats
7:
        TBZ x1, 2, 8f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b            // shift upper 4 columns down for the next store
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x13], 16
        MOV v26.16b, v27.16b
        STR q28, [x7], 16
        MOV v28.16b, v29.16b
8:
        TBZ x1, 1, 9f
        STR d20, [x6], 8
        DUP d20, v20.d[1]               // move upper 2 floats into the low half
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26, [x13], 8
        DUP d26, v26.d[1]
        STR d28, [x7], 8
        DUP d28, v28.d[1]

9:
        TBZ x1, 0, 10f
        STR s20, [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x13]
        STR s28, [x7]
10:
        # Restore d8-d9, d12-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

END_FUNCTION xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75

// On ELF targets, emit an empty .note.GNU-stack section (no flags), the GNU
// toolchain convention for declaring that this object does not need an
// executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif