// blob: 92c6a9e56c30473f5c3ba40ec9a5f826865ef9d4 [file] [log] [blame]
// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a73.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// F32 GEMM micro-kernel with caller-supplied initial accumulators (gemminc).
// For every tile of up to 8 output columns it loads 6 rows x 8 floats from acc,
// accumulates products of A elements and packed B vectors read from w with
// FMLA, clamps the result to the two bounds read from params, and stores
// up to 6 rows of C.
//
// Units, as used by the code below:
//   kc        - bytes per row of A (the main loop consumes 32 bytes = 8 floats)
//   nc        - output columns in floats (8 per full tile; bits 2/1/0 select
//               the 4/2/1 float tail stores)
//   a_stride, cm_stride, cn_stride - byte offsets added to pointers
//   params    - two consecutive floats; the first is applied with FMAX (lower
//               bound), the second with FMIN (upper bound)
//
// NOTE(review): the file header says this file is generated from
// src/f32-gemm/6x8-aarch64-neonfma-cortex-a73.S.in; the register change below
// presumably has to be mirrored in that template - confirm before merging.

# void xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x0
#     const float*restrict acc, [sp + 8] -> x15
#     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 16] -> x8

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is the platform register and is intentionally left untouched.

# Scratch use of x0
# x0 holds mr during pointer setup, then k (remaining bytes of kc) inside the
# K loop, then cn_stride while a full 6 x 8 tile is stored.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3
# x12 a4
# x4  a5

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
# x7  c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73

        # Clamp A and C pointers / Save d8-d15 on stack
        # Rows at index >= mr alias the previous row, so they are computed and
        # stored redundantly instead of being branched around.
        STP d8, d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        # Load acc, params pointer
        # The 64-byte save area sits below the stack arguments, so the
        # incoming [sp + 8] and [sp + 16] are now at [sp, 72] and [sp, 80].
        LDP x15, x8, [sp, 72]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        .p2align 3
0:
        # Load initial accumulators (6 rows x 8 floats = 192 bytes of acc).
        # x15 is post-incremented and never rewound, so each pass through
        # label 0 reads the next 192 bytes.
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        LDP q28, q29, [x15], 32
        LDP q30, q31, [x15], 32
        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]
        PRFM PLDL1KEEP, [x3]     // Prefetch A
        PRFM PLDL1KEEP, [x9]
        PRFM PLDL1KEEP, [x10]
        PRFM PLDL1KEEP, [x11]
        PRFM PLDL1KEEP, [x12]
        PRFM PLDL1KEEP, [x4]

        # Are there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32          // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 96 FMA
        # load A0 to A4 but not A5
        LDP q0, q6, [x3], 32
        LDP q1, q7, [x9], 32
        LDP q2, q8, [x10], 32
        LDP q3, q9, [x11], 32
        LDP q4, q10, [x12], 32
        # load first set of B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        # Are there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDP A + 8 LDP B
        .p2align 3
1:
        # First group of 4 A. 48 FMA. Loads A5

        LDP q5, q11, [x4], 32
        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        LDP q16, q17, [x5], 32
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        LDP q12, q13, [x5], 32
        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        LDP q14, q15, [x5], 32
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        PRFM PLDL1KEEP, [x5, 128]  // Prefetch B
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Second group of 4 A. 48 FMA. Loads A0 - A4

        LDP q16, q17, [x5], 32
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        LDP q18, q19, [x5], 32
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        # Each row finishes with its second A register (v6-v10) before the
        # LDP that reloads that row for the next iteration.
        LDP q12, q13, [x5], 32
        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v20.4s, v18.4s, v6.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v21.4s, v19.4s, v6.s[3]
        LDP q0, q6, [x3], 32
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v23.4s, v19.4s, v7.s[3]
        LDP q1, q7, [x9], 32
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v25.4s, v19.4s, v8.s[3]
        LDP q2, q8, [x10], 32
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v27.4s, v19.4s, v9.s[3]
        LDP q3, q9, [x11], 32
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v29.4s, v19.4s, v10.s[3]
        LDP q4, q10, [x12], 32
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v30.4s, v18.4s, v11.s[3]
        SUBS x0, x0, 32
        FMLA v31.4s, v17.4s, v11.s[2]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 1 LDP A + 6 LDP B
        # First block same as main loop. Second block has no preloads.
2:
        # First group of 4 A. 48 FMA. Loads A5

        LDP q5, q11, [x4], 32
        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        LDP q16, q17, [x5], 32
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        LDP q12, q13, [x5], 32
        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        LDP q14, q15, [x5], 32
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Second group of 4 A. 48 FMA. No A Loads, No last B load

        LDP q16, q17, [x5], 32
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        LDP q18, q19, [x5], 32
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        # Last part of epilogue has loads removed.

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]

        # Load min/max values
        # v6 and v7 were last read as A operands just above, so they can be
        # overwritten with the clamp bounds while the other rows finish.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        # The FMLAs between TST and B.NE do not write the condition flags.
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        .p2align 3

        # Clamp
3:
        # The flags set here are consumed by B.LO 7f and B.HI 0b below; no
        # instruction in between writes NZCV.
        SUBS x1, x1, 8
        FMAX v20.4s, v20.4s, v6.4s
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        # k in x0 is dead here (it is recomputed from kc at label 0), so x0
        # carries cn_stride for the stores. This keeps c3 out of x18.
        LDR x0, [sp, 64]         // Load cn_stride
        B.LO 7f

        STP q30, q31, [x7]
        ADD x7, x7, x0
        SUB x3, x3, x2           // a0 -= kc
        STP q28, q29, [x13]
        ADD x13, x13, x0
        SUB x9, x9, x2           // a1 -= kc
        STP q26, q27, [x14]
        ADD x14, x14, x0
        SUB x10, x10, x2         // a2 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x0
        SUB x11, x11, x2         // a3 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x0
        SUB x12, x12, x2         // a4 -= kc
        STP q20, q21, [x6]
        ADD x6, x6, x0
        SUB x4, x4, x2           // a5 -= kc

        NOP
        B.HI 0b                  // more columns remain (nc was > 8)

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

        .p2align 3
4:
        # Load min/max values
        # Needed on the kc < 32 path, which skips the load in the epilogue.
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        # Only multiples of 32 were subtracted from x0, so bits 4, 3 and 2
        # of x0 still equal the corresponding bits of kc.
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B 3b

        .p2align 3

        # Store odd width
        # x1 holds nc - 8 here; its low three bits equal those of nc and
        # select the 4, 2 and 1 float partial stores. After each partial
        # store the not yet written columns are moved down to lane 0.
7:
        TBZ x1, 2, 8f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
8:
        TBZ x1, 1, 9f
        STR d30, [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26, [x14], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

9:
        TBZ x1, 0, 10f
        STR s30, [x7]
        STR s28, [x13]
        STR s26, [x14]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73

// On ELF targets, emit an empty .note.GNU-stack section so the GNU toolchain
// does not mark the stack of the final binary as executable because of this
// object file.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif