// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>
11
# void xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t*restrict a,  x3
#     size_t a_stride,           x4
#     const void*restrict w,     x5
#     uint8_t*restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 8] -> x8

# Computes a 6 row by 8 column tile of C = clamp(bias + A * B) per pass of the
# outer loop (label 0), looping while nc columns remain.
# kc and the k counter in x0 are byte counts (8 floats of A = 32 bytes).
# mr in x0 is only needed while the row pointers are set up; x0 is then
# reused as the k counter.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is the platform register (reserved by several platform ABIs) and is
# deliberately not used; c3 lives in the caller-saved temporary x15 instead.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x15 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75

        # Clamp A and C pointers / Save d8-d15 on stack
        # Rows at index >= mr alias the previous row, so every load and store
        # below stays inside the rows that are actually valid.
        STP d8, d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x15, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x15, x17, x15, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x15, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x15, x13, LS   //   c4 = c3

        # Load params pointer (stack args sit above the 64 byte save area)
        LDR x8, [sp, 72]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load cn_stride
        LDR x14, [sp, 64]

0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 0]      // Prefetch B
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x3]         // Prefetch A
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x9]
        MOV v28.16b, v20.16b
        PRFM PLDL1KEEP, [x10]
        MOV v29.16b, v21.16b
        PRFM PLDL1KEEP, [x11]
        MOV v30.16b, v20.16b
        PRFM PLDL1KEEP, [x12]
        MOV v31.16b, v21.16b
        PRFM PLDL1KEEP, [x4]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32              // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 96 FMA
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        LDP q12, q13, [x5], 32       // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 12 LDR A + 8 LDP B
1:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32       // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 128]    // Prefetch B
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16             // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32       // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        LDR q0, [x3], 16             // Load next 6 A
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR q1, [x9], 16
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        LDR q2, [x10], 16
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR q3, [x11], 16

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        LDR q4, [x12], 16
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR q5, [x4], 16
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        LDP q12, q13, [x5], 32       // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        SUBS x0, x0, 32              // k -= 32; flags consumed by B.HS below
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]
        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32       // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 128]    // Prefetch B
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16             // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32       // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]

        # Load min/max values
        # v6/v7 held A0/A1 above; this is placed after their last use as A.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        # Is there a remainder?- fewer than 8 floats of A (kc & 31 bytes)
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        # Clamp
3:
        FMAX v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8               // nc -= 8; flags consumed by B.LO / B.HI below
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 7f

        STP q20, q21, [x6]
        ADD x6, x6, x14
        SUB x3, x3, x2               // a0 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x14
        SUB x9, x9, x2               // a1 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x14
        SUB x10, x10, x2             // a2 -= kc
        STP q26, q27, [x15]
        ADD x15, x15, x14
        SUB x11, x11, x2             // a3 -= kc
        STP q28, q29, [x13]
        ADD x13, x13, x14
        SUB x12, x12, x2             // a4 -= kc
        STP q30, q31, [x7]
        ADD x7, x7, x14
        SUB x4, x4, x2               // a5 -= kc

        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

4:
        # Load min/max values
        # Needed here because the kc < 32 path skips the epilogue load.
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B 3b

        # Store odd width
        # x1 already had 8 subtracted, which leaves its low 3 bits unchanged,
        # so bits 2/1/0 select the 4/2/1 column stores.
7:
        TBZ x1, 2, 8f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x15], 16
        MOV v26.16b, v27.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
8:
        TBZ x1, 1, 9f
        STR d20, [x6], 8
        DUP d20, v20.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26, [x15], 8
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d30, [x7], 8
        DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        STR s20, [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x15]
        STR s28, [x13]
        STR s30, [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75
XNNPACK Teamb455b122019-09-27 18:10:33 -0700659
// On ELF targets, emit an empty .note.GNU-stack section (no flags), the
// conventional marker that this object does not need an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif