// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>
11
// void xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75(
//     size_t mr,                x0
//     size_t nc,                x1
//     size_t kc,                x2 / x0
//     const uint8_t*restrict a, x3
//     size_t a_stride,          x4
//     const void*restrict w,    x5
//     uint8_t*restrict c,       x6
//     size_t cm_stride,         x7
//     size_t cn_stride,         [sp] -> x0 (reloaded from the stack for every tile)
//     const float*restrict acc, [sp + 8] -> x15
//     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 16] -> x8
//
// For every 8-column tile of the output this kernel:
//   1. loads 6 x 8 initial accumulators from acc (12 vectors, acc advances),
//   2. adds A[row][k] * B[k][0..7] for kc bytes of K taken from a0..a5 and w,
//   3. clamps the result with the two scalars read from params
//      (first scalar is the lower bound, second scalar is the upper bound),
//   4. stores the 6 rows to c0..c5 and advances each C pointer by cn_stride.
// A final tile narrower than 8 columns is stored as 4 + 2 + 1 floats.
// Rows at index >= mr alias the previous row's A and C pointers, so the
// kernel always computes 6 rows and never reads or writes past row mr - 1.
//
// kc, a_stride, cm_stride and cn_stride are byte counts.

// d8-d15 need to be preserved if used.
// x19-30 need to be preserved if used.
// x18 is deliberately not used: it is the AAPCS64 platform register and is
// reserved on some operating systems.
// NOTE(review): the file banner says this file is generated from a template;
// any change made here should be mirrored in that template.

// A pointers
//  x3 a0
//  x9 a1
// x10 a2
// x11 a3
// x12 a4
//  x4 a5

// C pointers
//  x6 c0
// x16 c1
// x17 c2
// x14 c3
// x13 c4
//  x7 c5

// Vector register usage
// A0   v0  v6
// A1   v1  v7
// A2   v2  v8
// A3   v3  v9
// A4   v4 v10
// A5   v5 v11
// B   v12 v13 v14 v15
// B   v16 v17 v18 v19
// C   v20 v21
// C   v22 v23
// C   v24 v25
// C   v26 v27
// C   v28 v29
// C   v30 v31
// Clamp v6 (lower bound) v7 (upper bound)

BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75

        // Clamp A and C pointers / Save d8-d15 on stack
        // The register saves are interleaved with the pointer setup.
        STP d8, d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        // Load acc, params pointer
        // (the stack arguments sit 64 bytes higher after the push above)
        LDP x15, x8, [sp, 72]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        // mr is dead from here on; x0 becomes the K counter and, at the
        // clamp/store stage, temporarily holds cn_stride.

0:
        // Load initial accumulators
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        LDP q28, q29, [x15], 32
        LDP q30, q31, [x15], 32
        PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]
        PRFM PLDL1KEEP, [x3]       // Prefetch A
        PRFM PLDL1KEEP, [x9]
        PRFM PLDL1KEEP, [x10]
        PRFM PLDL1KEEP, [x11]
        PRFM PLDL1KEEP, [x12]
        PRFM PLDL1KEEP, [x4]

        // Are there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32          // k = kc - 32
        B.LO 4f

        // Prologue - loads for main loop of 96 FMA
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        LDP q12, q13, [x5], 32   // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        // Are there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        // Main loop - 8 floats of A (32 bytes)
        // 96 FMA + 12 LDR A + 8 LDP B
1:
        // First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32        // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 128]     // Prefetch B
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16              // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32        // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        // Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        LDR q0, [x3], 16              // Load next 6 A
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR q1, [x9], 16
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        LDR q2, [x10], 16
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR q3, [x11], 16

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        LDR q4, [x12], 16
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR q5, [x4], 16
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        LDP q12, q13, [x5], 32        // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        SUBS x0, x0, 32               // k -= 32; flags consumed by B.HS below
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]
        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        // Epilogue - 8 floats of A (32 bytes)
        // 96 FMA + 6 LDR A + 5 LDP B
        // First block same as main loop.  Second block has no preloads.
2:
        // First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32        // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 128]     // Prefetch B
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16              // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32        // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        // Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]

        // Load min/max values
        // (v6 and v7 are free: their last use as A data is just above)
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        // Is there a remainder?- 4 floats of A (16 bytes) or less
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        // Clamp
3:
        FMAX v20.4s, v20.4s, v6.4s
        // Load cn_stride.  x0 (K counter) is dead here and is re-initialised
        // from kc at the top of the next tile.  LDR leaves the flags alone.
        LDR x0, [sp, 64]
        SUBS x1, x1, 8           // nc -= 8; flags consumed by B.LO / B.HI below
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        // Store full 6 x 8
        B.LO 7f                  // fewer than 8 columns left -> partial store

        STP q30, q31, [x7]
        ADD x7, x7, x0           // c5 += cn_stride
        SUB x3, x3, x2           // a0 -= kc
        STP q28, q29, [x13]
        ADD x13, x13, x0         // c4 += cn_stride
        SUB x9, x9, x2           // a1 -= kc
        STP q26, q27, [x14]
        ADD x14, x14, x0         // c3 += cn_stride
        SUB x10, x10, x2         // a2 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x0         // c2 += cn_stride
        SUB x11, x11, x2         // a3 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x0         // c1 += cn_stride
        SUB x12, x12, x2         // a4 -= kc
        STP q20, q21, [x6]
        ADD x6, x6, x0           // c0 += cn_stride
        SUB x4, x4, x2           // a5 -= kc

        B.HI 0b                  // more columns remain (flags from SUBS x1)

        // Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

4:
        // Load min/max values
        LD2R {v6.4s, v7.4s}, [x8]

        // Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        // Remainder- 4 floats of A (16 bytes)
        // Load A
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        // Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        // Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        // Remainder- 2 floats of A (8 bytes)
        // Load A
        LDR d0, [x3], 8
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        // Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        // Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        // Remainder- 1 float of A (4 bytes)
        // Load A
        LDR s0, [x3], 4
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        // Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B 3b

        // Store odd width
        // x1 holds nc - 8 here; its low three bits equal those of the
        // remaining column count, so bits 2, 1 and 0 select 4, 2 and 1 floats.
7:
        TBZ x1, 2, 8f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
8:
        TBZ x1, 1, 9f
        STR d30, [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26, [x14], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

9:
        TBZ x1, 0, 10f
        STR s30, [x7]
        STR s28, [x13]
        STR s26, [x14]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
10:
        // Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75

// On ELF targets, emit an empty .note.GNU-stack section so that this object
// does not request an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif