blob: 2606f7d8cfdebb6d8e1b7661d834d44db140c4e1 [file] [log] [blame]
// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
// void xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55(
//     size_t mr,                 x0
//     size_t nc,                 x1
//     size_t kc,                 x2 / x0
//     const uint8_t*restrict a,  x3
//     size_t a_stride,           x4
//     const void*restrict w,     x5
//     uint8_t*restrict c,        x6
//     size_t cm_stride,          x7
//     size_t cn_stride,          [sp] -> x14
//     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 8] -> x8
//
// Computes up to 4 rows x 8 columns of C per pass over kc, starting each
// accumulator from the 8 bias floats stored at the head of every packed
// weight group in w, then clamps and stores. kc is counted in bytes
// (4 floats == 16 bytes). The outer loop repeats while columns (nc) remain.

// d8-d15 need to be preserved if used (this kernel saves/restores d12-d15).
// x19-x30 need to be preserved if used (none are used here).
// x18 is deliberately NOT used: AAPCS64 reserves it as the platform register.

// A pointers
//  x3 a0
//  x9 a1
// x10 a2
// x11 a3

// C pointers
//  x6 c0
// x16 c1
// x17 c2
//  x7 c3  (takes over x7 once cm_stride has been consumed)

// x0  mr on entry; reused as the remaining-k byte counter inside the loop
// x4  a_stride on entry; afterwards a temporary 64-bit "shadow" register used
//     to stage the upper half of a vector (LDR x4 + INS vN.d[1], x4)

// Vector register usage
// A0   v0     v3       lanes s[0], s[1]
// A1   v0[1]  v3[1]    lanes s[2], s[3] (upper 64 bits)
// A2   v1     v4       lanes s[0], s[1]
// A3   v1[1]  v4[1]    lanes s[2], s[3] (upper 64 bits)

// B   v12 v13 v14 v15  second set of B
// B   v16 v17 v18 v19  first set
// C   v20 v21          row 0
// C   v22 v23          row 1
// C   v24 v25          row 2
// C   v26 v27          row 3
// Clamp v6 (lower bound, applied with FMAX)  v7 (upper bound, applied with FMIN)

// Unused by this kernel: v2 v5 v8 v9 v10 v11 v28 v29 v30 v31, x12 x13

BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55

        // Clamp A and C pointers: a row at index >= mr aliases the previous row,
        // so out-of-range rows recompute and re-store an in-range row.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x7, x17, x7          // c3 = c2 + cm_stride (last read of cm_stride)
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x7, x17, x7, LO     //   c3 = c2

        // Load params pointer (second stack argument)
        LDR x8, [sp, 8]

        // Load min/max values: first float replicated into v6, second into v7
        LD2R {v6.4s, v7.4s}, [x8]

        // Load cn_stride (first stack argument); must happen before sp moves
        LDR x14, [sp]

        // Save d12-d15 on stack (callee-saved low halves of v12-v15)
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

0:
        // Load initial bias from w into accumulators (all 4 rows start equal)
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x3, 0]     // Prefetch A
        PRFM PLDL1KEEP, [x3, 64]
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x9, 0]
        PRFM PLDL1KEEP, [x9, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x10, 0]
        PRFM PLDL1KEEP, [x10, 64]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x11, 0]
        PRFM PLDL1KEEP, [x11, 64]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 0]     // Prefetch B
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        // Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16             // k = kc - 16
        B.LO 5f

        // Prologue - First group loads, no FMA
        LDR d0, [x3], 8             // a0
        LDP q16, q17, [x5], 32      // b
        LDR d1, [x10], 8            // a2
        LD1 {v0.d}[1], [x9], 8      // a1
        LD1 {v1.d}[1], [x11], 8     // a3
        SUBS x0, x0, 16
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x4, [x5], 8             // ins is in BLOCK 0

        // Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        // Main loop - 4 floats of A (16 bytes)
        // 32 FMA + 8 LD64 A + 8 LDR B
        // Software pipelined: each half issues FMAs on one register set while
        // loading the other set. 128-bit values are assembled from a 64-bit
        // LDR dN plus LDR x4 / INS vN.d[1], x4.
1:
        // First group of 16 FMA, Second group loads
        // BLOCK 0
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR d3, [x3], 8             // a0
        FMLA v22.4s, v16.4s, v0.s[2]
        INS v19.d[1], x4            // b from second group
        FMLA v24.4s, v16.4s, v1.s[0]
        LDR x4, [x9], 8             // a1

        // BLOCK 1
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR d12, [x5]
        FMLA v21.4s, v17.4s, v0.s[0]
        INS v3.d[1], x4             // a1 ins
        FMLA v23.4s, v17.4s, v0.s[2]
        LDR x4, [x5, 8]             // b

        // BLOCK 2
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR d4, [x10], 8            // a2
        FMLA v27.4s, v17.4s, v1.s[2]
        INS v12.d[1], x4            // b ins
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR x4, [x11], 8            // a3

        // BLOCK 3
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR d13, [x5, 16]
        FMLA v24.4s, v18.4s, v1.s[1]
        INS v4.d[1], x4             // a3 ins
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR x4, [x5, 24]

        // BLOCK 4
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR d14, [x5, 32]
        FMLA v23.4s, v19.4s, v0.s[3]
        INS v13.d[1], x4            // b
        FMLA v25.4s, v19.4s, v1.s[1]
        LDR x4, [x5, 40]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR d15, [x5, 48]
        NOP
        INS v14.d[1], x4            // b from previous
        SUBS x0, x0, 16             // flags consumed by B.HS at the loop end
        LDR x4, [x5, 56]

        // Second group of 16 FMA, First group of loads
        // BLOCK 0
        FMLA v20.4s, v12.4s, v3.s[0]
        LDR d0, [x3], 8             // a0
        FMLA v22.4s, v12.4s, v3.s[2]
        INS v15.d[1], x4            // b from previous
        FMLA v24.4s, v12.4s, v4.s[0]
        LDR x4, [x9], 8             // a1

        // BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        LDR d16, [x5, 64]
        FMLA v21.4s, v13.4s, v3.s[0]
        INS v0.d[1], x4             // a1 ins
        FMLA v23.4s, v13.4s, v3.s[2]
        LDR x4, [x5, 72]            // b

        // BLOCK 2
        FMLA v25.4s, v13.4s, v4.s[0]
        LDR d1, [x10], 8            // a2
        FMLA v27.4s, v13.4s, v4.s[2]
        INS v16.d[1], x4            // b
        FMLA v20.4s, v14.4s, v3.s[1]
        LDR x4, [x11], 8            // a3

        // BLOCK 3
        FMLA v22.4s, v14.4s, v3.s[3]
        LDR d17, [x5, 80]
        FMLA v24.4s, v14.4s, v4.s[1]
        INS v1.d[1], x4             // a3 ins
        FMLA v26.4s, v14.4s, v4.s[3]
        LDR x4, [x5, 88]

        // BLOCK 4
        FMLA v21.4s, v15.4s, v3.s[1]
        LDR d18, [x5, 96]
        FMLA v23.4s, v15.4s, v3.s[3]
        INS v17.d[1], x4            // b
        FMLA v25.4s, v15.4s, v4.s[1]
        LDR x4, [x5, 104]

        // BLOCK 5
        // NOTE that block needs to be 4 cycles for LDR not to stall
        FMLA v27.4s, v15.4s, v4.s[3]
        LDR d19, [x5, 112]
        INS v18.d[1], x4
        LDR x4, [x5, 120]           // upper half of v19; inserted in next BLOCK 0
        ADD x5, x5, 128
        B.HS 1b

        // Epilogue - 4 floats of A (16 bytes)
        // 32 FMA + 8 LD64 A + 8 LDR B
2:
        // First group of 16 FMA, Second group loads
        // BLOCK 0
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR d3, [x3], 8             // a0
        FMLA v22.4s, v16.4s, v0.s[2]
        INS v19.d[1], x4            // b from second group
        FMLA v24.4s, v16.4s, v1.s[0]
        LDR x4, [x9], 8             // a1

        // BLOCK 1
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR d12, [x5]
        FMLA v21.4s, v17.4s, v0.s[0]
        INS v3.d[1], x4             // a1 ins
        FMLA v23.4s, v17.4s, v0.s[2]
        LDR x4, [x5, 8]             // b

        // BLOCK 2
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR d4, [x10], 8            // a2
        FMLA v27.4s, v17.4s, v1.s[2]
        INS v12.d[1], x4            // b ins
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR x4, [x11], 8            // a3

        // BLOCK 3
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR d13, [x5, 16]
        FMLA v24.4s, v18.4s, v1.s[1]
        INS v4.d[1], x4             // a3 ins
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR x4, [x5, 24]

        // BLOCK 4
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR d14, [x5, 32]
        FMLA v23.4s, v19.4s, v0.s[3]
        INS v13.d[1], x4            // b
        FMLA v25.4s, v19.4s, v1.s[1]
        LDR x4, [x5, 40]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR d15, [x5, 48]
        NOP                         // fma
        INS v14.d[1], x4
        NOP
        LDR x4, [x5, 56]

        // Second group of 16 FMA, no loads
        // BLOCK 0
        FMLA v20.4s, v12.4s, v3.s[0]
        FMLA v22.4s, v12.4s, v3.s[2]
        INS v15.d[1], x4            // b from previous
        FMLA v24.4s, v12.4s, v4.s[0]

        // BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]

        // BLOCK 2
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v20.4s, v14.4s, v3.s[1]

        // BLOCK 3
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v4.s[3]
        TST x0, 15                  // any k remainder (low 4 bits of byte count)?

        // BLOCK 4
        FMLA v21.4s, v15.4s, v3.s[1]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        ADD x5, x5, 64              // skip the 64 bytes of B read via offsets above

        // BLOCK 5
        FMLA v27.4s, v15.4s, v4.s[3]

        // Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

4:
        // Clamp: FMAX against v6 (lower bound), then FMIN against v7 (upper bound)
        FMAX v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8              // nc -= 8; flags drive B.LO / B.HI below
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s

        // Store full 4 x 8 (fewer than 8 columns left -> partial store at 8)
        B.LO 8f

        ST1 {v20.16b, v21.16b}, [x6], x14
        SUB x3, x3, x2              // a0 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x14
        SUB x9, x9, x2              // a1 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x14
        SUB x10, x10, x2            // a2 -= kc
        ST1 {v26.16b, v27.16b}, [x7], x14
        SUB x11, x11, x2            // a3 -= kc

        B.HI 0b                     // more columns remain

        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

5:
        // Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        // Remainder- 2 floats of A (8 bytes)
        LDR d0, [x3], 8
        LDR q16, [x5], 16
        LD1 {v0.d}[1], [x9], 8
        LDR d1, [x10], 8
        LD1 {v1.d}[1], [x11], 8
        LDR q17, [x5], 16
        LDR q18, [x5], 16
        LDR q19, [x5], 16
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[3]

        // Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 4b

6:
        // Remainder- 1 floats of A (4 bytes)
        // Rows 1 and 3 are placed in lane s[2] to match the FMLA lane pattern
        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LD1 {v0.s}[2], [x9], 4
        LDR s1, [x10], 4
        LD1 {v1.s}[2], [x11], 4
        LDR q17, [x5], 16

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        B 4b

        // Store odd width: bits 2, 1, 0 of x1 select 4, 2 and 1 trailing columns.
        // After each partial store the not-yet-stored lanes are shifted down
        // into the low part of v20/v22/v24/v26.
8:
        TBZ x1, 2, 9f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x7], 16
        MOV v26.16b, v27.16b

9:
        TBZ x1, 1, 10f
        STR d20, [x6], 8
        DUP d20, v20.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26, [x7], 8
        DUP d26, v26.d[1]

10:
        TBZ x1, 0, 11f
        STR s20, [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x7]
11:
        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55
457
// On ELF targets, emit an empty .note.GNU-stack section so the object is
// marked as not requiring an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif