blob: eb0ecc28b7ccc409cdf2d6151bfd9f9823104471 [file] [log] [blame]
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

6#include <xnnpack/assembly.h>
7
# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# Computes a tile of up to 6 rows x 8 columns of C per pass over kc and walks
# nc in steps of 8 columns.  Rows beyond mr alias the previous row's A and C
# pointers, so all six rows are always computed and stored.
#
# x0 holds mr only during pointer setup; afterwards it is the running K
# counter in bytes.  x8 holds the params pointer only until the clamp bounds
# have been loaded into v6/v7; it is then reused as the c3 pointer.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is the platform register under AAPCS64 and is deliberately not used.
# This kernel touches none of the above, so nothing is spilled to the stack.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3
# x12 a4
# x4  a5

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x8  c3
# x13 c4
# x7  c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7   (v6 is applied with FMIN, v7 with FMAX)
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ld64

        # The params pointer is fetched first so that x8 can be recycled as
        # the c3 pointer below.  LDR/LDP/LD2R do not modify the flags.
        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        # Clamp A and C pointers
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        # Load clamping_params values: the two consecutive 32-bit words at
        # [x8] are broadcast to every lane of v6 and v7.  x8 is dead after
        # this instruction.
        LD2R {v6.4s, v7.4s}, [x8]

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x8, x17, x7          // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x8, x17, x8, LO     //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x8, x7          // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x8, x13, LS    //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load cn_stride
        LDR x14, [sp]

        # Outer loop: one iteration per group of up to 8 output columns.
0:
        $if INC:
          # Load initial accumulators (6 rows x 8 floats, consumed in order)
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          PRFM PLDL1KEEP, [x3]     // Prefetch A
          PRFM PLDL1KEEP, [x9]
          PRFM PLDL1KEEP, [x10]
          PRFM PLDL1KEEP, [x11]
          PRFM PLDL1KEEP, [x12]
          PRFM PLDL1KEEP, [x4]
        $else:
          # Load initial bias from w into accumulators; the same 8 bias
          # values seed every row.
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          MOV v23.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 64]
          MOV v24.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 128]
          MOV v25.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 192]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP, [x3]     // Prefetch A
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP, [x9]
          MOV v28.16b, v20.16b
          PRFM PLDL1KEEP, [x10]
          MOV v29.16b, v21.16b
          PRFM PLDL1KEEP, [x11]
          MOV v30.16b, v20.16b
          PRFM PLDL1KEEP, [x12]
          MOV v31.16b, v21.16b
          PRFM PLDL1KEEP, [x4]

        # Is there at least 2 floats (8 bytes) for main loop?
        SUBS x0, x2, 8           // k = kc - 8
        B.LO 2f

        # Main loop - 2 floats of A (8 bytes)
        # 24 FMA + 6 LD64 A + 2 LDP B
1:
        LDR d0, [x3], 8
        LDP q16, q17, [x5], 32
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v1.s[0]
        FMLA v24.4s, v16.4s, v2.s[0]
        FMLA v26.4s, v16.4s, v3.s[0]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v16.4s, v4.s[0]
        FMLA v30.4s, v16.4s, v5.s[0]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v1.s[0]
        FMLA v25.4s, v17.4s, v2.s[0]
        FMLA v27.4s, v17.4s, v3.s[0]
        FMLA v29.4s, v17.4s, v4.s[0]
        FMLA v31.4s, v17.4s, v5.s[0]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v1.s[1]
        FMLA v24.4s, v18.4s, v2.s[1]
        FMLA v26.4s, v18.4s, v3.s[1]
        FMLA v28.4s, v18.4s, v4.s[1]
        FMLA v30.4s, v18.4s, v5.s[1]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v1.s[1]
        FMLA v25.4s, v19.4s, v2.s[1]
        FMLA v27.4s, v19.4s, v3.s[1]
        SUBS x0, x0, 8           // k -= 8; placed early so the flags are ready for B.HS
        FMLA v29.4s, v19.4s, v4.s[1]
        FMLA v31.4s, v19.4s, v5.s[1]
        B.HS 1b

2:
        # Is there a remainder?- 1 floats of A (4 bytes)
        # x0 is (bytes left - 8) here; bit 2 is set when 4 bytes remain.
        TBNZ x0, 2, 4f
3:
        # Clamp
        FMIN v20.4s, v20.4s, v6.4s
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        CMP x1, 8
        B.LO 5f

        # Each store advances its C pointer by cn_stride; each A pointer is
        # rewound by kc so the next column group re-reads the same rows.
        $if INC:
          ST1 {v30.16b, v31.16b}, [x7], x14
          SUB x3, x3, x2         // a0 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x14
          SUB x9, x9, x2         // a1 -= kc
          ST1 {v26.16b, v27.16b}, [x8], x14
          SUB x10, x10, x2       // a2 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x14
          SUB x11, x11, x2       // a3 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x14
          SUB x12, x12, x2       // a4 -= kc
          ST1 {v20.16b, v21.16b}, [x6], x14
          SUB x4, x4, x2         // a5 -= kc
        $else:
          ST1 {v20.16b, v21.16b}, [x6], x14
          SUB x3, x3, x2         // a0 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x14
          SUB x9, x9, x2         // a1 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x14
          SUB x10, x10, x2       // a2 -= kc
          ST1 {v26.16b, v27.16b}, [x8], x14
          SUB x11, x11, x2       // a3 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x14
          SUB x12, x12, x2       // a4 -= kc
          ST1 {v30.16b, v31.16b}, [x7], x14
          SUB x4, x4, x2         // a5 -= kc

        SUBS x1, x1, 8           // nc -= 8
        B.HI 0b                  // more columns remain

        RET

4:
        # Remainder- 1 floats of A (4 bytes)
        LDR s0, [x3], 4
        LDP q16, q17, [x5], 32
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v1.s[0]
        FMLA v24.4s, v16.4s, v2.s[0]
        FMLA v26.4s, v16.4s, v3.s[0]
        FMLA v28.4s, v16.4s, v4.s[0]
        FMLA v30.4s, v16.4s, v5.s[0]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v1.s[0]
        FMLA v25.4s, v17.4s, v2.s[0]
        FMLA v27.4s, v17.4s, v3.s[0]
        FMLA v29.4s, v17.4s, v4.s[0]
        FMLA v31.4s, v17.4s, v5.s[0]
        B 3b

        # Store odd width: fewer than 8 columns remain.  Bits 2, 1 and 0 of
        # nc select a 4-, 2- and 1-float store; after each partial store the
        # not-yet-written lanes are shifted down into the low part of the
        # even-numbered accumulator.
5:
        TBZ x1, 2, 6f
        $if INC:
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26, [x8], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x8], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30, [x7], 16
          MOV v30.16b, v31.16b

6:
        TBZ x1, 1, 7f
        $if INC:
          STR d30, [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x8], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20, [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20, [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x8], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30, [x7], 8
          DUP d30, v30.d[1]

7:
        TBZ x1, 0, 8f
        $if INC:
          STR s30, [x7]
          STR s28, [x13]
          STR s26, [x8]
          STR s24, [x17]
          STR s22, [x16]
          STR s20, [x6]
        $else:
          STR s20, [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x8]
          STR s28, [x13]
          STR s30, [x7]
8:
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ld64
361
// On ELF targets, emit an empty .note.GNU-stack section so the linker does
// not mark the stack executable on account of this object file.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif