// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8

# d8-d15 need to be preserved if used.
# x19-x30 need to be preserved if used.

# A pointers
#  x13 a0
#  x14 a1
#  x15 a2
#   x8 a3

# C pointers
#   x6 c0
#  x16 c1
#  x17 c2
#   x7 c3

# x19 temporary vector shadow register

# Vector register usage
# A0  v0  v3
# A1  v0[1]  v3[1]
# A2  v1  v4
# A3  v1[1]  v4[1]

# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# Clamp v6 v7

# unused A  v8 v9 v10 v11
# x12 a4
#  x4 a5
# x13 c4
#  x7 c5
# A4  v2  v5
# A5  v2[1]  v5[1]
# C   v28 v29
# C   v30 v31

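# Rough sketch of the computation (informal pseudocode; details approximate):
#
#   for each tile of 8 columns (nc loop, label 0):
#     acc[4][8] = bias loaded from w
#     for (p = ks; p != 0; p -= 4 * sizeof(void*)):    // label 1
#       load a0..a3 from the indirection buffer a
#       ai = (ai == zero) ? zero : ai + a_offset
#       for kc bytes of A:                             // main loop / remainders
#         acc[i][0..7] += ai[k] * w[k][0..7]
#     clamp acc to [min, max] from params              // FMAX / FMIN
#     store acc rows to c0..c3, advance c by cn_stride
#     a -= ks                                          // reuse A pointers for the next tile
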
BEGIN_FUNCTION xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55

        # Clamp C pointers
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x7, x17, x7          // c3 = c2 + cm_stride
        CSEL x7, x17, x7, LO     //   c3 = c2

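        # Presumably this lets the store code below write all four rows
        # unconditionally: when mr < 4, the extra c pointers alias a lower,
        # valid row, and since stores run from c3 down to c0 the aliased row
        # is overwritten last with its correct data.
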
        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, clamping params pointer
        LDP x12, x8, [sp, 16]

        # Load min/max values
        LD2R {v6.4s, v7.4s}, [x8]
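        # v6 = min, v7 = max (assumed params layout, consistent with the
        # FMAX/FMIN clamp below); LD2R broadcasts the two adjacent scalars.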

        // Save x19, d12-d15 on stack
        STP d12, d13, [sp, -48]!
        STP d14, d15, [sp, 16]
        STP x19, x19, [sp, 32]

0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x13, 0]   // Prefetch A
        PRFM PLDL1KEEP, [x13, 64]
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x14, 0]
        PRFM PLDL1KEEP, [x14, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x15, 0]
        PRFM PLDL1KEEP, [x15, 64]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x8, 0]
        PRFM PLDL1KEEP, [x8, 64]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
        PRFM PLDL1KEEP, [x5, 64]
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        MOV x9, x3  // p = ks

1:
        # Load next 4 A pointers
        LDP x13, x14, [x4], 16
        LDP x15, x8, [x4], 16


        CMP x13, x12              // if a0 == zero
        ADD x13, x13, x11         // a0 += a_offset
        CSEL x13, x12, x13, EQ    //   a0 = zero, else a0 += a_offset
        CMP x14, x12              // if a1 == zero
        ADD x14, x14, x11         // a1 += a_offset
        CSEL x14, x12, x14, EQ    //   a1 = zero, else a1 += a_offset
        CMP x15, x12              // if a2 == zero
        ADD x15, x15, x11         // a2 += a_offset
        CSEL x15, x12, x15, EQ    //   a2 = zero, else a2 += a_offset
        CMP x8, x12               // if a3 == zero
        ADD x8, x8, x11           // a3 += a_offset
        CSEL x8, x12, x8, EQ      //   a3 = zero, else a3 += a_offset

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 4f

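        # The kc loop below is software-pipelined (a sketch of the intent,
        # inferred from the block comments): the prologue issues the first
        # group of A/B loads with no FMAs, each main-loop iteration overlaps
        # one group's FMAs with the other group's loads, and the epilogue
        # drains the pipeline - its second FMA group issues no loads.
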
        # Prologue - First group loads, no FMA
        LDR d0, [x13], 8          // a0
        LDP q16, q17, [x5], 32    // b
        LDR d1, [x15], 8          // a2
        LD1 {v0.d}[1], [x14], 8   // a1
        LD1 {v1.d}[1], [x8], 8    // a3
        SUBS x0, x0, 16
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x19, [x5], 8          // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 3f

        # Main loop - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
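        # Scheduling note (an interpretation of the code structure): B and the
        # high A halves are fetched as 64-bit pieces - LDR d for the low half,
        # LDR x into the x19 shadow register plus INS for the high half -
        # rather than as 128-bit loads, presumably so each BLOCK dual-issues
        # cheap 64-bit loads next to its FMLAs on Cortex-A55.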
2:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR d3, [x13], 8              // a0
        FMLA v22.4s, v16.4s, v0.s[2]
        INS v19.d[1], x19             // b from second group
        FMLA v24.4s, v16.4s, v1.s[0]
        LDR x19, [x14], 8             // a1

        // BLOCK 1
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR d12, [x5]
        FMLA v21.4s, v17.4s, v0.s[0]
        INS v3.d[1], x19              // a1 ins
        FMLA v23.4s, v17.4s, v0.s[2]
        LDR x19, [x5, 8]              // b

        // BLOCK 2
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR d4, [x15], 8              // a2
        FMLA v27.4s, v17.4s, v1.s[2]
        INS v12.d[1], x19             // b ins
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR x19, [x8], 8              // a3

        // BLOCK 3
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR d13, [x5, 16]
        FMLA v24.4s, v18.4s, v1.s[1]
        INS v4.d[1], x19              // a3 ins
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR x19, [x5, 24]

        // BLOCK 4
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR d14, [x5, 32]
        FMLA v23.4s, v19.4s, v0.s[3]
        INS v13.d[1], x19             // b
        FMLA v25.4s, v19.4s, v1.s[1]
        LDR x19, [x5, 40]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR d15, [x5, 48]
        NOP
        INS v14.d[1], x19             // b from previous
        SUBS x0, x0, 16
        LDR x19, [x5, 56]

        # Second group of 16 FMA, First group of loads
        // BLOCK 0
        FMLA v20.4s, v12.4s, v3.s[0]
        LDR d0, [x13], 8              // a0
        FMLA v22.4s, v12.4s, v3.s[2]
        INS v15.d[1], x19             // b from previous
        FMLA v24.4s, v12.4s, v4.s[0]
        LDR x19, [x14], 8             // a1

        // BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        LDR d16, [x5, 64]
        FMLA v21.4s, v13.4s, v3.s[0]
        INS v0.d[1], x19              // a1 ins
        FMLA v23.4s, v13.4s, v3.s[2]
        LDR x19, [x5, 72]             // b

        // BLOCK 2
        FMLA v25.4s, v13.4s, v4.s[0]
        LDR d1, [x15], 8              // a2
        FMLA v27.4s, v13.4s, v4.s[2]
        INS v16.d[1], x19             // b
        FMLA v20.4s, v14.4s, v3.s[1]
        LDR x19, [x8], 8              // a3

        // BLOCK 3
        FMLA v22.4s, v14.4s, v3.s[3]
        LDR d17, [x5, 80]
        FMLA v24.4s, v14.4s, v4.s[1]
        INS v1.d[1], x19              // a3 ins
        FMLA v26.4s, v14.4s, v4.s[3]
        LDR x19, [x5, 88]

        // BLOCK 4
        FMLA v21.4s, v15.4s, v3.s[1]
        LDR d18, [x5, 96]
        FMLA v23.4s, v15.4s, v3.s[3]
        INS v17.d[1], x19             // b
        FMLA v25.4s, v15.4s, v4.s[1]
        LDR x19, [x5, 104]

        // BLOCK 5
        // NOTE that block needs to be 4 cycles for LDR not to stall
        FMLA v27.4s, v15.4s, v4.s[3]
        LDR d19, [x5, 112]
        INS v18.d[1], x19
        LDR x19, [x5, 120]
        ADD x5, x5, 128
        B.HS 2b

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
3:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR d3, [x13], 8              // a0
        INS v19.d[1], x19             // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x19, [x14], 8             // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        // BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x19              // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x19, [x5, 8]              // b
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]

        // BLOCK 2
        LDR d4, [x15], 8              // a2
        INS v12.d[1], x19             // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x19, [x8], 8              // a3
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v20.4s, v18.4s, v0.s[1]

        // BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x19              // a3 ins
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR x19, [x5, 24]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]

        // BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x19             // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x19, [x5, 40]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x19
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR x19, [x5, 56]
        NOP  // fma
        NOP
        NOP  // fma
        NOP

        # Second group of 16 FMA, no loads
        // BLOCK 0
        INS v15.d[1], x19             // b from previous
        FMLA v20.4s, v12.4s, v3.s[0]
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]

        // BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]

        // BLOCK 2
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v20.4s, v14.4s, v3.s[1]

        // BLOCK 3
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v4.s[3]

        // BLOCK 4
        FMLA v21.4s, v15.4s, v3.s[1]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        ADD x5, x5, 64

        // BLOCK 5
        FMLA v27.4s, v15.4s, v4.s[3]

4:
        # Is there a remainder? - 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder? - 1 float of A (4 bytes)
        TBNZ x0, 2, 7f
5:
        # ks loop
        SUBS x9, x9, 32  // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s

        # Store full 4 x 8
        SUBS x1, x1, 8
        B.LO 8f

        STP q26, q27, [x7]
        ADD x7, x7, x10
        STP q24, q25, [x17]
        ADD x17, x17, x10
        STP q22, q23, [x16]
        ADD x16, x16, x10
        STP q20, q21, [x6]
        ADD x6, x6, x10

        SUB x4, x4, x3  // a -= ks

        # nc loop
        B.HI 0b

        // Restore x19, d12-d15 from stack
        LDR x19, [sp, 32]
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 48
        RET

        # Remainder - 2 floats of A (8 bytes)
        # 16 FMA + 4 LD64 A + 2 LDP B
6:
        LDR d0, [x13], 8
        LDP q16, q17, [x5], 32
        LD1 {v0.d}[1], [x14], 8
        LDR d1, [x15], 8
        LD1 {v1.d}[1], [x8], 8
        LDP q18, q19, [x5], 32
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[3]

        # Is there a remainder? - 1 float of A (4 bytes)
        TBZ x0, 2, 5b

7:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x13], 4
        LDP q16, q17, [x5], 32
        LD1 {v0.s}[2], [x14], 4
        LDR s1, [x15], 4
        LD1 {v1.s}[2], [x8], 4

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        B 5b

        # Store odd width
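        # Reader's note (inferred from the TBZ tests below): x1 holds the
        # remaining nc minus 8, but its low three bits still equal the 1..7
        # leftover columns, so bit 2 selects a 4-float store, bit 1 a 2-float
        # store, and bit 0 a final scalar store; after each partial store the
        # surviving lanes are shifted down (MOV / DUP).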
8:
        TBZ x1, 2, 9f
        STR q26, [x7], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
9:
        TBZ x1, 1, 10f
        STR d26, [x7], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s26, [x7]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
11:
        // Restore x19, d12-d15 from stack
        LDR x19, [sp, 32]
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 48
        RET

END_FUNCTION xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif