// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8
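# params layout as accessed by this kernel: a halffloat scale at offset 0 (LD1R below),
# followed by halffloat min and max (LD2R before the clamp).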

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
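# This kernel uses no callee-saved registers (d8-d15, x19-x30), so nothing is saved on the stack.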

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3
# x12 a4
# x4  a5

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x18 c3
# x13 c4
# x7  c5

# Vector register usage
# A0  v0
# A1  v1
# A2  v2
# A3  v3
# A4  v4
# A5  v5
# B   v16 v17 v18 v19
# C   v20
# C   v22
# C   v24
# C   v26
# C   v28
# C   v30
# Scale and Clamp v6, (v4), (v5)
# unused A v8 v9 v10 v11
# unused B v12 v13 v14 v15


BEGIN_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64

        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Clamp A and C pointers
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x18, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x18, x17, x18, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x18, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x18, x13, LS   //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load params scale value
        LD1R {v6.8h}, [x8]
        ADD x8, x8, 2
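        # x8 now points at the min/max halffloats in params; they are loaded
        # with LD2R into v4/v5 just before clamping.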

0:
        $if INC:
          # Load initial accumulators
          LDP q20, q22, [x15], 32
          LDP q24, q26, [x15], 32
          LDP q28, q30, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          LDR q20, [x5], 16
          MOV v22.16b, v20.16b
          MOV v24.16b, v20.16b
          MOV v26.16b, v20.16b
          MOV v28.16b, v20.16b
          MOV v30.16b, v20.16b

        # Are there at least 4 halffloats (8 bytes)?
        SUBS x0, x2, 8           // k = kc - 8
        B.LO 5f

        # Main loop - 4 halffloats of A (8 bytes)
        # 24 FMA + 6 ld64 A + 4 LDR B
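        # Each iteration consumes 4 halffloats from each of the 6 rows of A and a
        # 4x8 block of B (v16-v19), accumulating into one 8-halffloat register per row.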
1:
        LDR d0, [x3], 8
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        SUBS x0, x0, 8           // k -= 8
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        LDR q18, [x5], 16
        LDR q19, [x5], 16

        FMLA v20.8h, v17.8h, v0.h[1]
        FMLA v22.8h, v17.8h, v1.h[1]
        FMLA v24.8h, v17.8h, v2.h[1]
        FMLA v26.8h, v17.8h, v3.h[1]
        FMLA v28.8h, v17.8h, v4.h[1]
        FMLA v30.8h, v17.8h, v5.h[1]

        FMLA v20.8h, v18.8h, v0.h[2]
        FMLA v22.8h, v18.8h, v1.h[2]
        FMLA v24.8h, v18.8h, v2.h[2]
        FMLA v26.8h, v18.8h, v3.h[2]
        FMLA v28.8h, v18.8h, v4.h[2]
        FMLA v30.8h, v18.8h, v5.h[2]

        FMLA v20.8h, v19.8h, v0.h[3]
        FMLA v22.8h, v19.8h, v1.h[3]
        FMLA v24.8h, v19.8h, v2.h[3]
        FMLA v26.8h, v19.8h, v3.h[3]
        FMLA v28.8h, v19.8h, v4.h[3]
        FMLA v30.8h, v19.8h, v5.h[3]
        B.HS 1b

        # Is there a remainder of 2 halffloats of A (4 bytes)?
        TBNZ x0, 2, 6f
        # Is there a remainder of 1 halffloat of A (2 bytes)?
        TBNZ x0, 1, 7f
4:
        # Scale and Clamp
        FMUL v20.8h, v20.8h, v6.8h
        # Load params min/max values into v4/v5
        LD2R {v4.8h, v5.8h}, [x8]
        FMUL v22.8h, v22.8h, v6.8h
        FMUL v24.8h, v24.8h, v6.8h
        FMUL v26.8h, v26.8h, v6.8h
        FMUL v28.8h, v28.8h, v6.8h
        FMUL v30.8h, v30.8h, v6.8h
        SUBS x1, x1, 8           // nc -= 8
        FMAX v20.8h, v20.8h, v4.8h
        FMAX v22.8h, v22.8h, v4.8h
        FMAX v24.8h, v24.8h, v4.8h
        FMAX v26.8h, v26.8h, v4.8h
        FMAX v28.8h, v28.8h, v4.8h
        FMAX v30.8h, v30.8h, v4.8h
        FMIN v20.8h, v20.8h, v5.8h
        FMIN v22.8h, v22.8h, v5.8h
        FMIN v24.8h, v24.8h, v5.8h
        FMIN v26.8h, v26.8h, v5.8h
        FMIN v28.8h, v28.8h, v5.8h
        FMIN v30.8h, v30.8h, v5.8h

        # Store full 6 x 8
        B.LO 8f                  // partial store if nc < 8

        $if INC:
          ST1 {v30.16b}, [x7], x14
          SUB x3, x3, x2           // a0 -= kc
          ST1 {v28.16b}, [x13], x14
          SUB x9, x9, x2           // a1 -= kc
          ST1 {v26.16b}, [x18], x14
          SUB x10, x10, x2         // a2 -= kc
          ST1 {v24.16b}, [x17], x14
          SUB x11, x11, x2         // a3 -= kc
          ST1 {v22.16b}, [x16], x14
          SUB x12, x12, x2         // a4 -= kc
          ST1 {v20.16b}, [x6], x14
          SUB x4, x4, x2           // a5 -= kc
        $else:
          ST1 {v20.16b}, [x6], x14
          SUB x3, x3, x2           // a0 -= kc
          ST1 {v22.16b}, [x16], x14
          SUB x9, x9, x2           // a1 -= kc
          ST1 {v24.16b}, [x17], x14
          SUB x10, x10, x2         // a2 -= kc
          ST1 {v26.16b}, [x18], x14
          SUB x11, x11, x2         // a3 -= kc
          ST1 {v28.16b}, [x13], x14
          SUB x12, x12, x2         // a4 -= kc
          ST1 {v30.16b}, [x7], x14
          SUB x4, x4, x2           // a5 -= kc

        B.HI 0b
        RET

5:
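        # kc remainder: bits 1-2 of x0 still hold the low bits of kc, selecting
        # a 2-halffloat step and/or a final halffloat below.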
        TBZ x0, 2, 7f
6:
        # Remainder: 2 halffloats of A (4 bytes)
        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4

        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]

        FMLA v20.8h, v17.8h, v0.h[1]
        FMLA v22.8h, v17.8h, v1.h[1]
        FMLA v24.8h, v17.8h, v2.h[1]
        FMLA v26.8h, v17.8h, v3.h[1]
        FMLA v28.8h, v17.8h, v4.h[1]
        FMLA v30.8h, v17.8h, v5.h[1]

        TBZ x0, 1, 4b

7:
        # Remainder: 1 halffloat of A (2 bytes)
        LDR h0, [x3], 2
        LDR q16, [x5], 16
        LDR h1, [x9], 2
        LDR h2, [x10], 2
        LDR h3, [x11], 2
        LDR h4, [x12], 2
        LDR h5, [x4], 2
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        B 4b

        # Store odd width
8:
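        # nc remainder: store 4, then 2, then 1 halffloats; after each partial store,
        # DUP shifts the remaining high lanes down for the next narrower store.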
        TBZ x1, 2, 9f
        $if INC:
          STR d30, [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x18], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20, [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20, [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x18], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30, [x7], 8
          DUP d30, v30.d[1]

9:
        TBZ x1, 1, 10f
        $if INC:
          STR s30, [x7], 4
          DUP s30, v30.s[1]
          STR s28, [x13], 4
          DUP s28, v28.s[1]
          STR s26, [x18], 4
          DUP s26, v26.s[1]
          STR s24, [x17], 4
          DUP s24, v24.s[1]
          STR s22, [x16], 4
          DUP s22, v22.s[1]
          STR s20, [x6], 4
          DUP s20, v20.s[1]
        $else:
          STR s20, [x6], 4
          DUP s20, v20.s[1]
          STR s22, [x16], 4
          DUP s22, v22.s[1]
          STR s24, [x17], 4
          DUP s24, v24.s[1]
          STR s26, [x18], 4
          DUP s26, v26.s[1]
          STR s28, [x13], 4
          DUP s28, v28.s[1]
          STR s30, [x7], 4
          DUP s30, v30.s[1]

10:
        TBZ x1, 0, 11f
        $if INC:
          STR h30, [x7]
          STR h28, [x13]
          STR h26, [x18]
          STR h24, [x17]
          STR h22, [x16]
          STR h20, [x6]
        $else:
          STR h20, [x6]
          STR h22, [x16]
          STR h24, [x17]
          STR h26, [x18]
          STR h28, [x13]
          STR h30, [x7]
11:
        RET

END_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif