blob: 3a758264a018a4fc754b434e45359efb216bca59 [file] [log] [blame]
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
XNNPACK Teamb455b122019-09-27 18:10:33 -07005
6#include <xnnpack/assembly.h>
7
# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# Summary
#   Each pass produces a tile of 6 rows by 8 columns of 32-bit floats. A row
#   of the tile lives in two q registers (v20-v31). The accumulators are
#   seeded (from acc in the INC variant, otherwise from the first 8 floats at
#   w, copied to all 6 rows), then for every float of A in a row 8 floats of B
#   are read from w and multiply-accumulated. The result is limited from above
#   by v6 (FMIN) and from below by v7 (FMAX), both broadcast from the two
#   floats at params, and stored through c0-c5.
#
#   Rows at index mr and above reuse the A and C pointers of the previous row,
#   so the kernel always processes 6 rows without touching memory beyond the
#   first mr rows.
#
#   kc is consumed in bytes: 8 bytes (2 floats) per main loop iteration plus an
#   optional 4 byte (1 float) remainder. This assumes kc is a multiple of 4
#   - TODO confirm against callers.
#
#   x0 holds mr only during pointer setup and is then reused as the k counter.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# Neither group is touched by this kernel, so nothing is saved on the stack.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x18 c3
# x13 c4
#  x7 c5
# NOTE(review): x18 is the AAPCS64 platform register and is reserved on some
# platforms - confirm that using it as c3 is safe on every supported target.

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ld64

        # Clamp A and C pointers
        # Each CMP feeds two pairs of CSEL: LO for the odd row that follows it
        # and LS for the even row after that, so only three compares are needed.
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CMP x0, 2                // if mr < 2
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x18, x17, x7         // c3 = c2 + cm_stride
        CMP x0, 4                // if mr < 4
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x18, x17, x18, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x18, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x18, x13, LS   //   c4 = c3

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CMP x0, 6                // if mr < 6
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load clamping_params values
        # The two consecutive floats at params are broadcast to all lanes:
        # v6 is the upper bound (FMIN operand), v7 the lower bound (FMAX operand).
        LD2R {v6.4s, v7.4s}, [x8]

        # Load cn_stride
        LDR x14, [sp]

        # Start of one 6 x 8 output tile; re-entered while columns remain.
0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          PRFM PLDL1KEEP,  [x3]    // Prefetch A
          PRFM PLDL1KEEP,  [x9]
          PRFM PLDL1KEEP, [x10]
          PRFM PLDL1KEEP, [x11]
          PRFM PLDL1KEEP, [x12]
          PRFM PLDL1KEEP,  [x4]
        $else:
          # Load initial bias from w into accumulators
          # Row 0 is loaded, the other 5 rows are register copies of it.
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          MOV v23.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 64]
          MOV v24.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 128]
          MOV v25.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 192]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP,  [x3]    // Prefetch A
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP,  [x9]
          MOV v28.16b, v20.16b
          PRFM PLDL1KEEP, [x10]
          MOV v29.16b, v21.16b
          PRFM PLDL1KEEP, [x11]
          MOV v30.16b, v20.16b
          PRFM PLDL1KEEP, [x12]
          MOV v31.16b, v21.16b
          PRFM PLDL1KEEP,  [x4]

        # Are there at least 2 floats (8 bytes) for main loop?
        SUBS x0, x2, 8  // k = kc - 8
        B.LO 2f

        # Main loop - 2 floats of A (8 bytes)
        # 24 FMA + 6 LD64 A + 2 LDP B
        # x0 is kept 8 below the number of bytes still to process, so the
        # SUBS at the bottom doubles as the loop test.
1:
        LDR   d0,  [x3], 8
        LDP  q16,  q17, [x5], 32
        LDR   d1,  [x9], 8
        LDR   d2, [x10], 8
        LDR   d3, [x11], 8
        LDR   d4, [x12], 8
        LDR   d5,  [x4], 8
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v1.s[1]
        FMLA v24.4s, v18.4s,  v2.s[1]
        FMLA v26.4s, v18.4s,  v3.s[1]
        FMLA v28.4s, v18.4s,  v4.s[1]
        FMLA v30.4s, v18.4s,  v5.s[1]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v1.s[1]
        FMLA v25.4s, v19.4s,  v2.s[1]
        FMLA v27.4s, v19.4s,  v3.s[1]
        SUBS x0, x0, 8
        FMLA v29.4s, v19.4s,  v4.s[1]
        FMLA v31.4s, v19.4s,  v5.s[1]
        B.HS 1b

2:
        # Is there a remainder? - 1 float of A (4 bytes)
        # x0 is negative here; bit 2 is set exactly when 4 bytes are left over.
        TBNZ x0, 2, 4f
3:
        # Clamp
        FMIN v20.4s, v20.4s, v6.4s
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        CMP x1, 8
        B.LO 5f

        # Every store is followed by advancing that C pointer by cn_stride and
        # rewinding one A pointer by kc so the next tile rereads the same rows.
        $if INC:
          # Rows are written from c5 down to c0. Presumably this is so that,
          # when rows share a pointer (mr < 6), the lowest row is written last
          # and its value is the one left in memory.
          STP q30, q31,  [x7]
          ADD x7, x7, x14
          SUB  x3,  x3, x2 // a0 -= kc
          STP q28, q29, [x13]
          ADD x13, x13, x14
          SUB  x9,  x9, x2 // a1 -= kc
          STP q26, q27, [x18]
          ADD x18, x18, x14
          SUB x10, x10, x2 // a2 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x14
          SUB x11, x11, x2 // a3 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x14
          SUB x12, x12, x2 // a4 -= kc
          STP q20, q21,  [x6]
          ADD  x6,  x6, x14
          SUB  x4,  x4, x2 // a5 -= kc
        $else:
          STP q20, q21,  [x6]
          ADD  x6,  x6, x14
          SUB  x3,  x3, x2 // a0 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x14
          SUB  x9,  x9, x2 // a1 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x14
          SUB x10, x10, x2 // a2 -= kc
          STP q26, q27, [x18]
          ADD x18, x18, x14
          SUB x11, x11, x2 // a3 -= kc
          STP q28, q29, [x13]
          ADD x13, x13, x14
          SUB x12, x12, x2 // a4 -= kc
          STP q30, q31,  [x7]
          ADD x7, x7, x14
          SUB  x4,  x4, x2 // a5 -= kc

        SUBS x1, x1, 8           // nc -= 8
        B.HI 0b                  // another tile while columns remain

        RET

4:
        # Remainder - 1 float of A (4 bytes)
        LDR   s0,  [x3], 4
        LDP  q16,  q17, [x5], 32
        LDR   s1,  [x9], 4
        LDR   s2, [x10], 4
        LDR   s3, [x11], 4
        LDR   s4, [x12], 4
        LDR   s5,  [x4], 4
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]
        B 3b

        # Store odd width
        # Fewer than 8 columns remain. Bits 2, 1 and 0 of nc select a store of
        # 4, 2 and 1 floats per row; after each partial store the not yet
        # written lanes are moved down to the low end of the register.
5:
        TBZ x1, 2, 6f
        $if INC:
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26, [x18], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x18], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b

6:
        TBZ x1, 1, 7f
        $if INC:
          STR d30,  [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x18], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x18], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30,  [x7], 8
          DUP d30, v30.d[1]

7:
        TBZ x1, 0, 8f
        $if INC:
          STR s30,  [x7]
          STR s28, [x13]
          STR s26, [x18]
          STR s24, [x17]
          STR s22, [x16]
          STR s20,  [x6]
        $else:
          STR s20,  [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x18]
          STR s28, [x13]
          STR s30,  [x7]
8:
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ld64
373
// On ELF targets emit an empty .note.GNU-stack section, which tells the
// linker that this object does not need an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif