// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
# void xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Computes a tile of up to 6 rows by 8 half-precision columns per pass:
# accumulators start from the packed bias in w (or from acc in the inc
# variant), A * B is accumulated with FMLA, the result is multiplied by
# the params scale, clamped with FMAX/FMIN and stored to c.
# kc is consumed in bytes (8 bytes = 4 halffloats per main loop pass);
# nc is consumed in columns (8 per pass).

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is the platform register (reserved on some operating systems) and
# is deliberately left untouched by this kernel.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Other general purpose registers
#  x0 mr on entry, then k (remaining bytes of kc), then cn_stride
#     while storing a full tile (reloaded from the stack each pass)
#  x8 params pointer, advanced past the scale value
# x15 acc pointer (inc variant only)

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20
# C   v22
# C   v24
# C   v26
# C   v28
# C   v30
# Scale v6
# Clamp (v4) lower bound, (v5) upper bound - reloaded after A4/A5 are consumed
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15


BEGIN_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64

        $if INC:
          # Load acc, params pointer
          LDP     x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR     x8, [sp, 8]

        # Clamp A and C pointers
        // Rows at or beyond mr alias the previous row, so the kernel
        // always processes 6 rows without reading or writing out of bounds.
        CMP     x0, 2                   // if mr < 2
        ADD     x9, x3, x4              // a1 = a0 + a_stride
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO          //   a1 = a0
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x10, x9, x4             // a2 = a1 + a_stride
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x10, x9, x10, LS        //   a2 = a1
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x11, x10, x4            // a3 = a2 + a_stride
        ADD     x14, x17, x7            // c3 = c2 + cm_stride
        CSEL    x11, x10, x11, LO       //   a3 = a2
        CSEL    x14, x17, x14, LO       //   c3 = c2

        ADD     x12, x11, x4            // a4 = a3 + a_stride
        ADD     x13, x14, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x12, x11, x12, LS       //   a4 = a3
        CSEL    x13, x14, x13, LS       //   c4 = c3

        CMP     x0, 6                   // if mr < 6
        ADD     x4, x12, x4             // a5 = a4 + a_stride
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x4, x12, x4, LO         //   a5 = a4
        CSEL    x7, x13, x7, LO         //   c5 = c4

        # Load params scale value
        LD1R    {v6.8h}, [x8]
        ADD     x8, x8, 2               // x8 now points at the two clamp bounds

0:
        $if INC:
          # Load initial accumulators
          LDP     q20, q22, [x15], 32
          LDP     q24, q26, [x15], 32
          LDP     q28, q30, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          LDR     q20, [x5], 16
          MOV     v22.16b, v20.16b
          MOV     v24.16b, v20.16b
          MOV     v26.16b, v20.16b
          MOV     v28.16b, v20.16b
          MOV     v30.16b, v20.16b

        # Is there at least 4 halffloats (8 bytes)?
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    5f

        # Main loop - 4 halffloats of A (8 bytes)
        # 24 FMA + 6 ld64 A + 4 LDR B
1:
        LDR     d0, [x3], 8
        LDR     q16, [x5], 16
        LDR     q17, [x5], 16
        LDR     d1, [x9], 8
        LDR     d2, [x10], 8
        LDR     d3, [x11], 8
        LDR     d4, [x12], 8
        LDR     d5, [x4], 8
        SUBS    x0, x0, 8               // flags are consumed by B.HS below
        FMLA    v20.8h, v16.8h, v0.h[0]
        FMLA    v22.8h, v16.8h, v1.h[0]
        FMLA    v24.8h, v16.8h, v2.h[0]
        FMLA    v26.8h, v16.8h, v3.h[0]
        FMLA    v28.8h, v16.8h, v4.h[0]
        FMLA    v30.8h, v16.8h, v5.h[0]
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16

        FMLA    v20.8h, v17.8h, v0.h[1]
        FMLA    v22.8h, v17.8h, v1.h[1]
        FMLA    v24.8h, v17.8h, v2.h[1]
        FMLA    v26.8h, v17.8h, v3.h[1]
        FMLA    v28.8h, v17.8h, v4.h[1]
        FMLA    v30.8h, v17.8h, v5.h[1]

        FMLA    v20.8h, v18.8h, v0.h[2]
        FMLA    v22.8h, v18.8h, v1.h[2]
        FMLA    v24.8h, v18.8h, v2.h[2]
        FMLA    v26.8h, v18.8h, v3.h[2]
        FMLA    v28.8h, v18.8h, v4.h[2]
        FMLA    v30.8h, v18.8h, v5.h[2]

        FMLA    v20.8h, v19.8h, v0.h[3]
        FMLA    v22.8h, v19.8h, v1.h[3]
        FMLA    v24.8h, v19.8h, v2.h[3]
        FMLA    v26.8h, v19.8h, v3.h[3]
        FMLA    v28.8h, v19.8h, v4.h[3]
        FMLA    v30.8h, v19.8h, v5.h[3]
        B.HS    1b

        // Subtracting multiples of 8 leaves bits 1 and 2 of k equal to
        // those of kc, so they select the 4-byte and 2-byte remainders.
        # Is there a remainder?- 2 halffloats of A (4 bytes)
        TBNZ    x0, 2, 6f
        # Is there a remainder?- 1 halffloat of A (2 bytes)
        TBNZ    x0, 1, 7f
4:
        # Scale and Clamp
        FMUL    v20.8h, v20.8h, v6.8h
        # Load params values
        LD2R    {v4.8h, v5.8h}, [x8]
        FMUL    v22.8h, v22.8h, v6.8h
        // k is no longer needed for this pass, so x0 is reused for
        // cn_stride; it is recomputed from kc at label 0.
        LDR     x0, [sp]                // cn_stride
        FMUL    v24.8h, v24.8h, v6.8h
        FMUL    v26.8h, v26.8h, v6.8h
        FMUL    v28.8h, v28.8h, v6.8h
        FMUL    v30.8h, v30.8h, v6.8h
        SUBS    x1, x1, 8               // nc -= 8; flags used by B.LO and B.HI below
        FMAX    v20.8h, v20.8h, v4.8h
        FMAX    v22.8h, v22.8h, v4.8h
        FMAX    v24.8h, v24.8h, v4.8h
        FMAX    v26.8h, v26.8h, v4.8h
        FMAX    v28.8h, v28.8h, v4.8h
        FMAX    v30.8h, v30.8h, v4.8h
        FMIN    v20.8h, v20.8h, v5.8h
        FMIN    v22.8h, v22.8h, v5.8h
        FMIN    v24.8h, v24.8h, v5.8h
        FMIN    v26.8h, v26.8h, v5.8h
        FMIN    v28.8h, v28.8h, v5.8h
        FMIN    v30.8h, v30.8h, v5.8h

        # Store full 6 x 8
        B.LO    8f

        // ST1 and SUB do not modify the flags, so the result of the
        // SUBS on nc above is still valid at B.HI.
        $if INC:
          ST1     {v30.16b}, [x7], x0
          SUB     x3, x3, x2            // a0 -= kc
          ST1     {v28.16b}, [x13], x0
          SUB     x9, x9, x2            // a1 -= kc
          ST1     {v26.16b}, [x14], x0
          SUB     x10, x10, x2          // a2 -= kc
          ST1     {v24.16b}, [x17], x0
          SUB     x11, x11, x2          // a3 -= kc
          ST1     {v22.16b}, [x16], x0
          SUB     x12, x12, x2          // a4 -= kc
          ST1     {v20.16b}, [x6], x0
          SUB     x4, x4, x2            // a5 -= kc
        $else:
          ST1     {v20.16b}, [x6], x0
          SUB     x3, x3, x2            // a0 -= kc
          ST1     {v22.16b}, [x16], x0
          SUB     x9, x9, x2            // a1 -= kc
          ST1     {v24.16b}, [x17], x0
          SUB     x10, x10, x2          // a2 -= kc
          ST1     {v26.16b}, [x14], x0
          SUB     x11, x11, x2          // a3 -= kc
          ST1     {v28.16b}, [x13], x0
          SUB     x12, x12, x2          // a4 -= kc
          ST1     {v30.16b}, [x7], x0
          SUB     x4, x4, x2            // a5 -= kc

        B.HI    0b
        RET

5:
        // kc < 8: skip the 4-byte remainder when bit 2 of k is clear
        TBZ     x0, 2, 7f
6:
        # Remainder- 2 halffloats of A (4 bytes)
        LDR     s0, [x3], 4
        LDR     q16, [x5], 16
        LDR     q17, [x5], 16
        LDR     s1, [x9], 4
        LDR     s2, [x10], 4
        LDR     s3, [x11], 4
        LDR     s4, [x12], 4
        LDR     s5, [x4], 4

        FMLA    v20.8h, v16.8h, v0.h[0]
        FMLA    v22.8h, v16.8h, v1.h[0]
        FMLA    v24.8h, v16.8h, v2.h[0]
        FMLA    v26.8h, v16.8h, v3.h[0]
        FMLA    v28.8h, v16.8h, v4.h[0]
        FMLA    v30.8h, v16.8h, v5.h[0]

        FMLA    v20.8h, v17.8h, v0.h[1]
        FMLA    v22.8h, v17.8h, v1.h[1]
        FMLA    v24.8h, v17.8h, v2.h[1]
        FMLA    v26.8h, v17.8h, v3.h[1]
        FMLA    v28.8h, v17.8h, v4.h[1]
        FMLA    v30.8h, v17.8h, v5.h[1]

        TBZ     x0, 1, 4b

7:
        # Remainder- 1 halffloat of A (2 bytes)
        LDR     h0, [x3], 2
        LDR     q16, [x5], 16
        LDR     h1, [x9], 2
        LDR     h2, [x10], 2
        LDR     h3, [x11], 2
        LDR     h4, [x12], 2
        LDR     h5, [x4], 2
        FMLA    v20.8h, v16.8h, v0.h[0]
        FMLA    v22.8h, v16.8h, v1.h[0]
        FMLA    v24.8h, v16.8h, v2.h[0]
        FMLA    v26.8h, v16.8h, v3.h[0]
        FMLA    v28.8h, v16.8h, v4.h[0]
        FMLA    v30.8h, v16.8h, v5.h[0]
        B       4b

        # Store odd width
        // Fewer than 8 columns remain. Bits 2, 1 and 0 of nc select
        // stores of 4, 2 and 1 halffloats; after each partial store the
        // not yet stored upper lanes are moved down to lane 0.
8:
        TBZ     x1, 2, 9f
        $if INC:
          STR     d30, [x7], 8
          DUP     d30, v30.d[1]
          STR     d28, [x13], 8
          DUP     d28, v28.d[1]
          STR     d26, [x14], 8
          DUP     d26, v26.d[1]
          STR     d24, [x17], 8
          DUP     d24, v24.d[1]
          STR     d22, [x16], 8
          DUP     d22, v22.d[1]
          STR     d20, [x6], 8
          DUP     d20, v20.d[1]
        $else:
          STR     d20, [x6], 8
          DUP     d20, v20.d[1]
          STR     d22, [x16], 8
          DUP     d22, v22.d[1]
          STR     d24, [x17], 8
          DUP     d24, v24.d[1]
          STR     d26, [x14], 8
          DUP     d26, v26.d[1]
          STR     d28, [x13], 8
          DUP     d28, v28.d[1]
          STR     d30, [x7], 8
          DUP     d30, v30.d[1]

9:
        TBZ     x1, 1, 10f
        $if INC:
          STR     s30, [x7], 4
          DUP     s30, v30.s[1]
          STR     s28, [x13], 4
          DUP     s28, v28.s[1]
          STR     s26, [x14], 4
          DUP     s26, v26.s[1]
          STR     s24, [x17], 4
          DUP     s24, v24.s[1]
          STR     s22, [x16], 4
          DUP     s22, v22.s[1]
          STR     s20, [x6], 4
          DUP     s20, v20.s[1]
        $else:
          STR     s20, [x6], 4
          DUP     s20, v20.s[1]
          STR     s22, [x16], 4
          DUP     s22, v22.s[1]
          STR     s24, [x17], 4
          DUP     s24, v24.s[1]
          STR     s26, [x14], 4
          DUP     s26, v26.s[1]
          STR     s28, [x13], 4
          DUP     s28, v28.s[1]
          STR     s30, [x7], 4
          DUP     s30, v30.s[1]

10:
        TBZ     x1, 0, 11f
        $if INC:
          STR     h30, [x7]
          STR     h28, [x13]
          STR     h26, [x14]
          STR     h24, [x17]
          STR     h22, [x16]
          STR     h20, [x6]
        $else:
          STR     h20, [x6]
          STR     h22, [x16]
          STR     h24, [x17]
          STR     h26, [x14]
          STR     h28, [x13]
          STR     h30, [x7]
11:
        RET

END_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64
356
// On ELF targets, emit an empty .note.GNU-stack section; by convention this
// marks the object as not requiring an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif