// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #   const float*restrict acc,  [sp + 8] -> x15
  #   const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #   const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

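# A rough C model of what this kernel computes (sketch only; the names are
# illustrative, and w is shown split into bias[] and b[][] although the
# kernel reads both from one packed w stream):
#
#   for (size_t m = 0; m < mr; m++) {        // up to 6 rows
#     for (size_t n = 0; n < nc; n++) {      // handled 8 columns at a time
#       float16_t acc = INC ? acc_in[m][n] : bias[n];
#       for (size_t k = 0; k < kc / sizeof(float16_t); k++) {
#         acc += a[m][k] * b[k][n];
#       }
#       c[m][n] = min(max(acc * scale, min_val), max_val);
#     }
#   }
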
# d8-d15 need to be preserved if used.
# x19-x30 need to be preserved if used.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x18 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0  v0
# A1  v1
# A2  v2
# A3  v3
# A4  v4
# A5  v5
# B   v16 v17 v18 v19
# C   v20
# C   v22
# C   v24
# C   v26
# C   v28
# C   v30
# Clamp v6, (v4), (v5)
#   (v4/v5 carry A4/A5 in the main loop and are reloaded with the min/max
#   clamp values once the last A data has been consumed)
# unused A  v8 v9 v10 v11
# unused B  v12 v13 v14 v15

BEGIN_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64

        # Clamp A and C pointers
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x18, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x18, x17, x18, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x18, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x18, x13, LS   //   c4 = c3

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4
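        # Note: when mr < 6, the CSELs above alias the out-of-range row
        # pointers to the last valid row, so the extra rows are computed
        # redundantly and their loads and stores reuse the same memory.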

        # Load params scale value
        LD1R {v6.8h}, [x8]
        ADD x8, x8, 2
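        # x8 now points at the min/max halffloat pair that the clamp step
        # loads with LD2R below (assuming the scaleminmax params are packed
        # as consecutive fp16 values {scale, min, max}).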

        # Load cn_stride
        LDR x14, [sp]

0:
        $if INC:
          # Load initial accumulators
          LDP q20, q22, [x15], 32
          LDP q24, q26, [x15], 32
          LDP q28, q30, [x15], 32
          #PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
          #PRFM PLDL1KEEP, [x5, 64]
          #PRFM PLDL1KEEP, [x5, 128]
          #PRFM PLDL1KEEP, [x5, 192]
          #PRFM PLDL1KEEP, [x3]       // Prefetch A
          #PRFM PLDL1KEEP, [x9]
          #PRFM PLDL1KEEP, [x10]
          #PRFM PLDL1KEEP, [x11]
          #PRFM PLDL1KEEP, [x12]
          #PRFM PLDL1KEEP, [x4]
        $else:
          # Load initial bias from w into all 6 accumulators
          LDR q20, [x5], 16
          MOV v22.16b, v20.16b
          #PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
          #PRFM PLDL1KEEP, [x5, 64]
          MOV v24.16b, v20.16b
          #PRFM PLDL1KEEP, [x5, 128]
          #PRFM PLDL1KEEP, [x5, 192]
          MOV v26.16b, v20.16b
          #PRFM PLDL1KEEP, [x3]       // Prefetch A
          #PRFM PLDL1KEEP, [x9]
          MOV v28.16b, v20.16b
          #PRFM PLDL1KEEP, [x10]
          #PRFM PLDL1KEEP, [x11]
          MOV v30.16b, v20.16b
          #PRFM PLDL1KEEP, [x12]
          #PRFM PLDL1KEEP, [x4]

        # Are there at least 4 halffloats (8 bytes)?
        SUBS x0, x2, 8           // k = kc - 8
        B.LO 5f

        # Main loop - 4 halffloats of A (8 bytes)
        # 24 FMA + 6 ld64 A + 2 LDP B
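        # Each pass loads 4 halffloats per A row (LDR d) and four 8-wide
        # B panels (two LDP q); FMLA by element then broadcasts the k-th A
        # value of each row (v0.h[k]..v5.h[k]) against B panel k.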
1:
        LDR d0, [x3], 8
        LDP q16, q17, [x5], 32
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8

        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        LDP q18, q19, [x5], 32

        FMLA v20.8h, v17.8h, v0.h[1]
        FMLA v22.8h, v17.8h, v1.h[1]
        FMLA v24.8h, v17.8h, v2.h[1]
        FMLA v26.8h, v17.8h, v3.h[1]
        FMLA v28.8h, v17.8h, v4.h[1]
        FMLA v30.8h, v17.8h, v5.h[1]

        FMLA v20.8h, v18.8h, v0.h[2]
        FMLA v22.8h, v18.8h, v1.h[2]
        FMLA v24.8h, v18.8h, v2.h[2]
        FMLA v26.8h, v18.8h, v3.h[2]
        FMLA v28.8h, v18.8h, v4.h[2]
        FMLA v30.8h, v18.8h, v5.h[2]
        SUBS x0, x0, 8           // k -= 8

        FMLA v20.8h, v19.8h, v0.h[3]
        FMLA v22.8h, v19.8h, v1.h[3]
        FMLA v24.8h, v19.8h, v2.h[3]
        FMLA v26.8h, v19.8h, v3.h[3]
        FMLA v28.8h, v19.8h, v4.h[3]
        FMLA v30.8h, v19.8h, v5.h[3]
        B.HS 1b                  // continue while k >= 0

        # Is there a remainder? - 2 halffloats of A (4 bytes)
        TBNZ x0, 2, 6f
        # Is there a remainder? - 1 halffloat of A (2 bytes)
        TBNZ x0, 1, 7f
4:
        # Scale and Clamp
        FMUL v20.8h, v20.8h, v6.8h
        # Load params min/max values into v4/v5 (A4/A5 registers, now free)
        LD2R {v4.8h, v5.8h}, [x8]
        FMUL v22.8h, v22.8h, v6.8h
        FMUL v24.8h, v24.8h, v6.8h
        FMUL v26.8h, v26.8h, v6.8h
        FMUL v28.8h, v28.8h, v6.8h
        FMUL v30.8h, v30.8h, v6.8h
        SUBS x1, x1, 8           // nc -= 8
        FMAX v20.8h, v20.8h, v4.8h
        FMAX v22.8h, v22.8h, v4.8h
        FMAX v24.8h, v24.8h, v4.8h
        FMAX v26.8h, v26.8h, v4.8h
        FMAX v28.8h, v28.8h, v4.8h
        FMAX v30.8h, v30.8h, v4.8h
        FMIN v20.8h, v20.8h, v5.8h
        FMIN v22.8h, v22.8h, v5.8h
        FMIN v24.8h, v24.8h, v5.8h
        FMIN v26.8h, v26.8h, v5.8h
        FMIN v28.8h, v28.8h, v5.8h
        FMIN v30.8h, v30.8h, v5.8h

        # Store full 6 x 8
        B.LO 8f

        $if INC:
          ST1 {v30.16b}, [x7], x14
          SUB x3, x3, x2         // a0 -= kc
          ST1 {v28.16b}, [x13], x14
          SUB x9, x9, x2         // a1 -= kc
          ST1 {v26.16b}, [x18], x14
          SUB x10, x10, x2       // a2 -= kc
          ST1 {v24.16b}, [x17], x14
          SUB x11, x11, x2       // a3 -= kc
          ST1 {v22.16b}, [x16], x14
          SUB x12, x12, x2       // a4 -= kc
          ST1 {v20.16b}, [x6], x14
          SUB x4, x4, x2         // a5 -= kc
        $else:
          ST1 {v20.16b}, [x6], x14
          SUB x3, x3, x2         // a0 -= kc
          ST1 {v22.16b}, [x16], x14
          SUB x9, x9, x2         // a1 -= kc
          ST1 {v24.16b}, [x17], x14
          SUB x10, x10, x2       // a2 -= kc
          ST1 {v26.16b}, [x18], x14
          SUB x11, x11, x2       // a3 -= kc
          ST1 {v28.16b}, [x13], x14
          SUB x12, x12, x2       // a4 -= kc
          ST1 {v30.16b}, [x7], x14
          SUB x4, x4, x2         // a5 -= kc

        B.HI 0b
        RET

5:
        TBZ x0, 2, 7f
6:
        # Remainder - 2 halffloats of A (4 bytes)
        LDR s0, [x3], 4
        LDP q16, q17, [x5], 32
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4

        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]

        FMLA v20.8h, v17.8h, v0.h[1]
        FMLA v22.8h, v17.8h, v1.h[1]
        FMLA v24.8h, v17.8h, v2.h[1]
        FMLA v26.8h, v17.8h, v3.h[1]
        FMLA v28.8h, v17.8h, v4.h[1]
        FMLA v30.8h, v17.8h, v5.h[1]

        TBZ x0, 1, 4b

7:
        # Remainder - 1 halffloat of A (2 bytes)
        LDR h0, [x3], 2
        LDR q16, [x5], 16
        LDR h1, [x9], 2
        LDR h2, [x10], 2
        LDR h3, [x11], 2
        LDR h4, [x12], 2
        LDR h5, [x4], 2
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        B 4b

        # Store odd width
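        # The low bits of x1 (remaining nc) drive progressively narrower
        # stores: bit 2 -> 4 halffloats, bit 1 -> 2, bit 0 -> 1. Each DUP
        # shifts the not-yet-stored upper lanes down for the next store.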
8:
        TBZ x1, 2, 9f
        $if INC:
          STR d30, [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x18], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20, [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20, [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x18], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30, [x7], 8
          DUP d30, v30.d[1]

9:
        TBZ x1, 1, 10f
        $if INC:
          STR s30, [x7], 4
          DUP s30, v30.s[1]
          STR s28, [x13], 4
          DUP s28, v28.s[1]
          STR s26, [x18], 4
          DUP s26, v26.s[1]
          STR s24, [x17], 4
          DUP s24, v24.s[1]
          STR s22, [x16], 4
          DUP s22, v22.s[1]
          STR s20, [x6], 4
          DUP s20, v20.s[1]
        $else:
          STR s20, [x6], 4
          DUP s20, v20.s[1]
          STR s22, [x16], 4
          DUP s22, v22.s[1]
          STR s24, [x17], 4
          DUP s24, v24.s[1]
          STR s26, [x18], 4
          DUP s26, v26.s[1]
          STR s28, [x13], 4
          DUP s28, v28.s[1]
          STR s30, [x7], 4
          DUP s30, v30.s[1]

10:
        TBZ x1, 0, 11f
        $if INC:
          STR h30, [x7]
          STR h28, [x13]
          STR h26, [x18]
          STR h24, [x17]
          STR h22, [x16]
          STR h20, [x6]
        $else:
          STR h20, [x6]
          STR h22, [x16]
          STR h24, [x17]
          STR h26, [x18]
          STR h28, [x13]
          STR h30, [x7]
11:
        RET

END_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif