// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 8] -> x8

# unused compared to 6x8
#  x4 a5
#  x7 c5
# A5  v10 v11
# C   v30 v31

# d8-d15 need to be preserved if used.
# x19-x30 need to be preserved if used.  x18 is reserved for OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x13 c3
#  x7 c4

# Vector register usage
# A0   v0  v1
# A1   v2  v3
# A2   v4  v5
# A3   v6  v7
# A4   v8  v9
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# Clamp v30 v31

# f32 GEMM micro-kernel producing up to 5 rows x 8 columns of C per pass.
#
# For each 8-column block the accumulators v20-v29 are seeded from the first
# 8 floats at w (the bias), then updated with FMLA over kc bytes of every A
# row against the B data that follows in w.  Results are clamped with
# FMAX/FMIN against the two values broadcast from params and stored to the
# C rows.  kc is a byte count (the A pointers are rewound with "a -= kc").
#
# Rows beyond mr alias the previous row's A and C pointers, so they recompute
# and rewrite the same data instead of touching out-of-range rows.
BEGIN_FUNCTION xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75

        # Clamp A and C pointers / Save d8-d15 on stack
        # (only d8, d9 and d12-d15 are stored: v10/v11 are never written here)
        STP d8, d9, [sp, -48]!
        CMP x0, 2                       // if mr < 2
        ADD x9, x3, x4                  // a1 = a0 + a_stride
        ADD x16, x6, x7                 // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO             //   a1 = a0
        CSEL x16, x6, x16, LO           //   c1 = c0

        STP d12, d13, [sp, 16]
        ADD x10, x9, x4                 // a2 = a1 + a_stride
        ADD x17, x16, x7                // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS           //   a2 = a1
        CSEL x17, x16, x17, LS          //   c2 = c1

        STP d14, d15, [sp, 32]
        CMP x0, 4                       // if mr < 4
        ADD x11, x10, x4                // a3 = a2 + a_stride
        ADD x13, x17, x7                // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO          //   a3 = a2
        CSEL x13, x17, x13, LO          //   c3 = c2

        # Load params pointer
        # (second stack argument; offsets are +48 because of the save area)
        LDR x8, [sp, 56]

        ADD x12, x11, x4                // a4 = a3 + a_stride
        ADD x7, x13, x7                 // c4 = c3 + cm_stride
                                        // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS          //   a4 = a3
        CSEL x7, x13, x7, LS            //   c4 = c3

        # Load clamp values
        # v30 = first float at params (used with FMAX, i.e. lower bound)
        # v31 = second float at params (used with FMIN, i.e. upper bound)
        LD2R {v30.4s, v31.4s}, [x8]

        # Load cn_stride
        LDR x14, [sp, 48]

0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 0]         // Prefetch B
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x3]            // Prefetch A
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x9]
        MOV v28.16b, v20.16b
        PRFM PLDL1KEEP, [x10]
        MOV v29.16b, v21.16b
        PRFM PLDL1KEEP, [x11]
        PRFM PLDL1KEEP, [x12]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32                 // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 80 FMA
        LDR q0, [x3], 16
        LDR q2, [x9], 16
        LDR q4, [x10], 16
        LDR q6, [x11], 16
        LDR q8, [x12], 16
        LDP q12, q13, [x5], 32          // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 80 FMA + 10 LDR A + 8 LDP B
1:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32          // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        PRFM PLDL1KEEP, [x5, 128]       // Prefetch B
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x3], 16                // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x9], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x10], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x11], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x12], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32          // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        LDP q14, q15, [x5], 32
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        LDP q16, q17, [x5], 32
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        LDR q0, [x3], 16                // Load next 5 A
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        LDR q2, [x9], 16
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        LDR q4, [x10], 16
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        LDR q6, [x11], 16
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        LDR q8, [x12], 16
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        LDP q12, q13, [x5], 32          // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        LDP q14, q15, [x5], 32
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 32                 // flags consumed by B.HS below
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 80 FMA + 5 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32          // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        PRFM PLDL1KEEP, [x5, 128]       // Prefetch B
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x3], 16                // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x9], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x10], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x11], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x12], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32          // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        LDP q14, q15, [x5], 32
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        LDP q16, q17, [x5], 32
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]
        TST x0, 31                      // any k remainder (< 32 bytes)? used by B.NE below

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        B.NE 4f

        # Clamp
3:
        FMAX v20.4s, v20.4s, v30.4s
        # nc -= 8.  The flags set here are consumed by B.LO 7f and B.HI 0b
        # below; no instruction in between writes the condition flags.
        SUBS x1, x1, 8
        FMAX v21.4s, v21.4s, v30.4s
        FMAX v22.4s, v22.4s, v30.4s
        FMAX v23.4s, v23.4s, v30.4s
        FMAX v24.4s, v24.4s, v30.4s
        FMAX v25.4s, v25.4s, v30.4s
        FMAX v26.4s, v26.4s, v30.4s
        FMAX v27.4s, v27.4s, v30.4s
        FMAX v28.4s, v28.4s, v30.4s
        FMAX v29.4s, v29.4s, v30.4s
        FMIN v20.4s, v20.4s, v31.4s
        FMIN v21.4s, v21.4s, v31.4s
        FMIN v22.4s, v22.4s, v31.4s
        FMIN v23.4s, v23.4s, v31.4s
        FMIN v24.4s, v24.4s, v31.4s
        FMIN v25.4s, v25.4s, v31.4s
        FMIN v26.4s, v26.4s, v31.4s
        FMIN v27.4s, v27.4s, v31.4s
        FMIN v28.4s, v28.4s, v31.4s
        FMIN v29.4s, v29.4s, v31.4s

        # Store full 5 x 8 (fewer than 8 columns left -> partial store at 7)
        B.LO 7f

        STP q20, q21, [x6]
        ADD x6, x6, x14
        SUB x3, x3, x2                  // a0 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x14
        SUB x9, x9, x2                  // a1 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x14
        SUB x10, x10, x2                // a2 -= kc
        STP q26, q27, [x13]
        ADD x13, x13, x14
        SUB x11, x11, x2                // a3 -= kc
        STP q28, q29, [x7]
        ADD x7, x7, x14
        SUB x12, x12, x2                // a4 -= kc

        B.HI 0b                         // more columns remain: next 8-column block

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

        # Remainder of k (fewer than 8 floats left).
        # x0 has only ever had multiples of 32 subtracted from kc, so its
        # bits 4, 3 and 2 still equal those of kc.
4:
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q2, [x9], 16
        LDR q4, [x10], 16
        LDR q6, [x11], 16
        LDR q8, [x12], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d2, [x9], 8
        LDR d4, [x10], 8
        LDR d6, [x11], 8
        LDR d8, [x12], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s2, [x9], 4
        LDR s4, [x10], 4
        LDR s6, [x11], 4
        LDR s8, [x12], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        B 3b

        # Store odd width
        # x1 is (remaining nc - 8) here; subtracting 8 leaves bits 2..0
        # unchanged, so they still give the column count: 4, then 2, then 1.
7:
        TBZ x1, 2, 8f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b            // shift the upper 4 columns down
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x13], 16
        MOV v26.16b, v27.16b
        STR q28, [x7], 16
        MOV v28.16b, v29.16b
8:
        TBZ x1, 1, 9f
        STR d20, [x6], 8
        DUP d20, v20.d[1]               // shift the upper 2 columns down
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26, [x13], 8
        DUP d26, v26.d[1]
        STR d28, [x7], 8
        DUP d28, v28.d[1]

9:
        TBZ x1, 0, 10f
        STR s20, [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x13]
        STR s28, [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

END_FUNCTION xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75
// On ELF targets, emit an empty .note.GNU-stack section (no flags, so not
// executable); GNU toolchains read this as "no executable stack required".
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif