blob: 2c1e26cf96893b405273377ca6b19928504a367d [file] [log] [blame]
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x18 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0  v0
# A1  v1
# A2  v2
# A3  v3
# A4  v4
# A5  v5
# B   v12 v13 v14 v15
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A v8 v9 v10 v11
# unused B v16 v17 v18 v19
BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ld64

        # Clamp A and C pointers / Save d8-d15 on stack.
        # Rows beyond mr alias the previous row's pointers so out-of-range
        # rows compute (and store) duplicates of valid rows instead of
        # reading/writing out of bounds.
        STP d8, d9, [sp, -64]!
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CMP x0, 2                // if mr < 2
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2  (reuses flags from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x18, x17, x7         // c3 = c2 + cm_stride
        CMP x0, 4                // if mr < 4
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x18, x17, x18, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x18, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4  (reuses flags from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x18, x13, LS   //   c4 = c3

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 72]
        $else:
          # Load params pointer
          LDR x8, [sp, 72]

        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CMP x0, 6                // if mr < 6
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load clamping_params values (max -> v6, min -> v7)
        LD2R {v6.4s, v7.4s}, [x8]

        # Load cn_stride (stack args are 8 above their pre-prologue offsets
        # because of the 64-byte STP push above)
        LDR x14, [sp, 64]

0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
          PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          PRFM PLDL1KEEP, [x3]       // Prefetch A
          PRFM PLDL1KEEP, [x9]
          PRFM PLDL1KEEP, [x10]
          PRFM PLDL1KEEP, [x11]
          PRFM PLDL1KEEP, [x12]
          PRFM PLDL1KEEP, [x4]
        $else:
          # Load initial bias from w into accumulators
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
          MOV v23.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 64]
          MOV v24.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 128]
          MOV v25.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 192]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP, [x3]       // Prefetch A
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP, [x9]
          MOV v28.16b, v20.16b
          PRFM PLDL1KEEP, [x10]
          MOV v29.16b, v21.16b
          PRFM PLDL1KEEP, [x11]
          MOV v30.16b, v20.16b
          PRFM PLDL1KEEP, [x12]
          MOV v31.16b, v21.16b
          PRFM PLDL1KEEP, [x4]

        # Is there at least 2 floats (8 bytes) for main loop?
        SUBS x0, x2, 8           // k = kc - 8
        B.LO 2f

        # Main loop - 2 floats of A (8 bytes) per iteration
        # 24 FMA + 6 LD64 A + 2 LDP B
1:
        LDR d0, [x3], 8
        LDP q12, q13, [x5], 32
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        SUBS x0, x0, 8
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        B.HS 1b

2:
        # Is there a remainder? - 1 float of A (4 bytes)
        # (bit 2 of the residual k counter flags a 4-byte tail)
        TBNZ x0, 2, 4f
3:
        # Clamp
        FMIN v20.4s, v20.4s, v6.4s
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        CMP x1, 8
        B.LO 5f

        # A pointers are rewound by kc so the next nc tile re-reads A.
        $if INC:
          STP q30, q31, [x7]
          ADD x7, x7, x14
          SUB x3, x3, x2           // a0 -= kc
          STP q28, q29, [x13]
          ADD x13, x13, x14
          SUB x9, x9, x2           // a1 -= kc
          STP q26, q27, [x18]
          ADD x18, x18, x14
          SUB x10, x10, x2         // a2 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x14
          SUB x11, x11, x2         // a3 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x14
          SUB x12, x12, x2         // a4 -= kc
          STP q20, q21, [x6]
          ADD x6, x6, x14
          SUB x4, x4, x2           // a5 -= kc
        $else:
          STP q20, q21, [x6]
          ADD x6, x6, x14
          SUB x3, x3, x2           // a0 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x14
          SUB x9, x9, x2           // a1 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x14
          SUB x10, x10, x2         // a2 -= kc
          STP q26, q27, [x18]
          ADD x18, x18, x14
          SUB x11, x11, x2         // a3 -= kc
          STP q28, q29, [x13]
          ADD x13, x13, x14
          SUB x12, x12, x2         // a4 -= kc
          STP q30, q31, [x7]
          ADD x7, x7, x14
          SUB x4, x4, x2           // a5 -= kc

        SUBS x1, x1, 8
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

4:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x3], 4
        LDP q12, q13, [x5], 32
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B 3b

        # Store odd width (nc < 8): write 4, then 2, then 1 column(s),
        # shifting the surviving lanes down after each partial store.
5:
        TBZ x1, 2, 6f
        $if INC:
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26, [x18], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x18], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30, [x7], 16
          MOV v30.16b, v31.16b

6:
        TBZ x1, 1, 7f
        $if INC:
          STR d30, [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x18], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20, [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20, [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x18], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30, [x7], 8
          DUP d30, v30.d[1]

7:
        TBZ x1, 0, 8f
        $if INC:
          STR s30, [x7]
          STR s28, [x13]
          STR s26, [x18]
          STR s24, [x17]
          STR s22, [x16]
          STR s20, [x6]
        $else:
          STR s20, [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x18]
          STR s28, [x13]
          STR s30, [x7]
8:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ld64
387
388#ifdef __ELF__
389.section ".note.GNU-stack","",%progbits
390#endif