// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
# void xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Computes a tile of up to 6 rows x 8 halffloat columns per pass over kc,
# multiplies the accumulators by the params scale, clamps them to
# [min, max] and stores them to C.  Repeats while nc has columns left.
#
# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is the AAPCS64 platform register (reserved on some platforms) and is
# deliberately not used.  Only caller-saved registers are touched, so the
# function needs no stack frame.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
#  x8 c3  (x8 holds the params pointer only until the params are loaded)
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20
# C   v22
# C   v24
# C   v26
# C   v28
# C   v30
# Scale  v6
# Clamp  v7 (min), v21 (max)
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15


BEGIN_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64

        $if INC:
          # Load cn_stride, acc
          LDP     x14, x15, [sp]
          # Load params pointer
          LDR     x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP     x14, x8, [sp]

        # Load all params once.  The kernel reads three consecutive
        # halffloats from params: scale, min, max.
        LD2R    {v6.8h, v7.8h}, [x8]     // v6 = scale, v7 = min
        ADD     x8, x8, 4
        LD1R    {v21.8h}, [x8]           // v21 = max
        # The params pointer is dead from here on; x8 is reused as c3.

        # Clamp A and C pointers
        # Rows at index >= mr alias the previous row, so every load and store
        # below stays inside the caller's buffers.
        CMP     x0, 2                    // if mr < 2
        ADD     x9, x3, x4               // a1 = a0 + a_stride
        ADD     x16, x6, x7              // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO           //   a1 = a0
        CSEL    x16, x6, x16, LO         //   c1 = c0

        ADD     x10, x9, x4              // a2 = a1 + a_stride
        ADD     x17, x16, x7             // c2 = c1 + cm_stride
                                         // if mr <= 2 (flags from CMP x0, 2)
        CSEL    x10, x9, x10, LS         //   a2 = a1
        CSEL    x17, x16, x17, LS        //   c2 = c1

        CMP     x0, 4                    // if mr < 4
        ADD     x11, x10, x4             // a3 = a2 + a_stride
        ADD     x8, x17, x7              // c3 = c2 + cm_stride
        CSEL    x11, x10, x11, LO        //   a3 = a2
        CSEL    x8, x17, x8, LO          //   c3 = c2

        ADD     x12, x11, x4             // a4 = a3 + a_stride
        ADD     x13, x8, x7              // c4 = c3 + cm_stride
                                         // if mr <= 4 (flags from CMP x0, 4)
        CSEL    x12, x11, x12, LS        //   a4 = a3
        CSEL    x13, x8, x13, LS         //   c4 = c3

        CMP     x0, 6                    // if mr < 6
        ADD     x4, x12, x4              // a5 = a4 + a_stride
        ADD     x7, x13, x7              // c5 = c4 + cm_stride
        CSEL    x4, x12, x4, LO          //   a5 = a4
        CSEL    x7, x13, x7, LO          //   c5 = c4

        # Outer loop: one pass per group of up to 8 output columns.
0:
        $if INC:
          # Load initial accumulators
          LDP     q20, q22, [x15], 32
          LDP     q24, q26, [x15], 32
          LDP     q28, q30, [x15], 32
          #PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          #PRFM PLDL1KEEP, [x5, 64]
          #PRFM PLDL1KEEP, [x5, 128]
          #PRFM PLDL1KEEP, [x5, 192]
          #PRFM PLDL1KEEP, [x3]     // Prefetch A
          #PRFM PLDL1KEEP, [x9]
          #PRFM PLDL1KEEP, [x10]
          #PRFM PLDL1KEEP, [x11]
          #PRFM PLDL1KEEP, [x12]
          #PRFM PLDL1KEEP, [x4]
        $else:
          # Load initial bias from w into accumulators
          LDR     q20, [x5], 16
          MOV     v22.16b, v20.16b
          #PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          #PRFM PLDL1KEEP, [x5, 64]
          MOV     v24.16b, v20.16b
          #PRFM PLDL1KEEP, [x5, 128]
          #PRFM PLDL1KEEP, [x5, 192]
          MOV     v26.16b, v20.16b
          #PRFM PLDL1KEEP, [x3]     // Prefetch A
          #PRFM PLDL1KEEP, [x9]
          MOV     v28.16b, v20.16b
          #PRFM PLDL1KEEP, [x10]
          #PRFM PLDL1KEEP, [x11]
          MOV     v30.16b, v20.16b
          #PRFM PLDL1KEEP, [x12]
          #PRFM PLDL1KEEP, [x4]

        # Is there at least 4 halffloats (8 bytes)?
        SUBS    x0, x2, 8                // k = kc - 8
        B.LO    5f

        # Main loop - 4 halffloats of A (8 bytes)
        # 24 FMA + 6 ld64 A + 2 LDP B
1:
        LDR     d0, [x3], 8
        LDP     q16, q17, [x5], 32
        LDR     d1, [x9], 8
        LDR     d2, [x10], 8
        LDR     d3, [x11], 8
        LDR     d4, [x12], 8
        LDR     d5, [x4], 8

        FMLA    v20.8h, v16.8h, v0.h[0]
        FMLA    v22.8h, v16.8h, v1.h[0]
        FMLA    v24.8h, v16.8h, v2.h[0]
        FMLA    v26.8h, v16.8h, v3.h[0]
        FMLA    v28.8h, v16.8h, v4.h[0]
        FMLA    v30.8h, v16.8h, v5.h[0]
        LDP     q18, q19, [x5], 32

        FMLA    v20.8h, v17.8h, v0.h[1]
        FMLA    v22.8h, v17.8h, v1.h[1]
        FMLA    v24.8h, v17.8h, v2.h[1]
        FMLA    v26.8h, v17.8h, v3.h[1]
        FMLA    v28.8h, v17.8h, v4.h[1]
        FMLA    v30.8h, v17.8h, v5.h[1]

        FMLA    v20.8h, v18.8h, v0.h[2]
        FMLA    v22.8h, v18.8h, v1.h[2]
        FMLA    v24.8h, v18.8h, v2.h[2]
        FMLA    v26.8h, v18.8h, v3.h[2]
        FMLA    v28.8h, v18.8h, v4.h[2]
        FMLA    v30.8h, v18.8h, v5.h[2]
        SUBS    x0, x0, 8                // k -= 8; flags consumed by B.HS below

        FMLA    v20.8h, v19.8h, v0.h[3]
        FMLA    v22.8h, v19.8h, v1.h[3]
        FMLA    v24.8h, v19.8h, v2.h[3]
        FMLA    v26.8h, v19.8h, v3.h[3]
        FMLA    v28.8h, v19.8h, v4.h[3]
        FMLA    v30.8h, v19.8h, v5.h[3]
        B.HS    1b

        # x0 has gone below zero here, but its low bits still equal those of
        # the remaining byte count, so single-bit tests select the tail.
        # Is there a remainder?- 2 halffloats of A (4 bytes)
        TBNZ    x0, 2, 6f
        # Is there a remainder?- 1 halffloat of A (2 bytes)
        TBNZ    x0, 1, 7f
4:
        # Scale and Clamp
        FMUL    v20.8h, v20.8h, v6.8h
        FMUL    v22.8h, v22.8h, v6.8h
        FMUL    v24.8h, v24.8h, v6.8h
        FMUL    v26.8h, v26.8h, v6.8h
        FMUL    v28.8h, v28.8h, v6.8h
        FMUL    v30.8h, v30.8h, v6.8h
        SUBS    x1, x1, 8                // nc -= 8; flags drive B.LO / B.HI below
        FMAX    v20.8h, v20.8h, v7.8h
        FMAX    v22.8h, v22.8h, v7.8h
        FMAX    v24.8h, v24.8h, v7.8h
        FMAX    v26.8h, v26.8h, v7.8h
        FMAX    v28.8h, v28.8h, v7.8h
        FMAX    v30.8h, v30.8h, v7.8h
        FMIN    v20.8h, v20.8h, v21.8h
        FMIN    v22.8h, v22.8h, v21.8h
        FMIN    v24.8h, v24.8h, v21.8h
        FMIN    v26.8h, v26.8h, v21.8h
        FMIN    v28.8h, v28.8h, v21.8h
        FMIN    v30.8h, v30.8h, v21.8h

        # Store full 6 x 8 (fewer than 8 columns left -> partial store at 8)
        B.LO    8f

        # None of the stores/SUBs below touch the flags set by SUBS x1 above.
        $if INC:
          ST1     {v30.16b}, [x7], x14
          SUB     x3, x3, x2             // a0 -= kc
          ST1     {v28.16b}, [x13], x14
          SUB     x9, x9, x2             // a1 -= kc
          ST1     {v26.16b}, [x8], x14
          SUB     x10, x10, x2           // a2 -= kc
          ST1     {v24.16b}, [x17], x14
          SUB     x11, x11, x2           // a3 -= kc
          ST1     {v22.16b}, [x16], x14
          SUB     x12, x12, x2           // a4 -= kc
          ST1     {v20.16b}, [x6], x14
          SUB     x4, x4, x2             // a5 -= kc
        $else:
          ST1     {v20.16b}, [x6], x14
          SUB     x3, x3, x2             // a0 -= kc
          ST1     {v22.16b}, [x16], x14
          SUB     x9, x9, x2             // a1 -= kc
          ST1     {v24.16b}, [x17], x14
          SUB     x10, x10, x2           // a2 -= kc
          ST1     {v26.16b}, [x8], x14
          SUB     x11, x11, x2           // a3 -= kc
          ST1     {v28.16b}, [x13], x14
          SUB     x12, x12, x2           // a4 -= kc
          ST1     {v30.16b}, [x7], x14
          SUB     x4, x4, x2             // a5 -= kc

        B.HI    0b                       // more columns remain
        RET

        # kc < 8 bytes: skip the main loop and go straight to the remainders.
5:
        TBZ     x0, 2, 7f
6:
        # Remainder- 2 halffloats of A (4 bytes)
        LDR     s0, [x3], 4
        LDP     q16, q17, [x5], 32
        LDR     s1, [x9], 4
        LDR     s2, [x10], 4
        LDR     s3, [x11], 4
        LDR     s4, [x12], 4
        LDR     s5, [x4], 4

        FMLA    v20.8h, v16.8h, v0.h[0]
        FMLA    v22.8h, v16.8h, v1.h[0]
        FMLA    v24.8h, v16.8h, v2.h[0]
        FMLA    v26.8h, v16.8h, v3.h[0]
        FMLA    v28.8h, v16.8h, v4.h[0]
        FMLA    v30.8h, v16.8h, v5.h[0]

        FMLA    v20.8h, v17.8h, v0.h[1]
        FMLA    v22.8h, v17.8h, v1.h[1]
        FMLA    v24.8h, v17.8h, v2.h[1]
        FMLA    v26.8h, v17.8h, v3.h[1]
        FMLA    v28.8h, v17.8h, v4.h[1]
        FMLA    v30.8h, v17.8h, v5.h[1]

        TBZ     x0, 1, 4b

7:
        # Remainder- 1 halffloat of A (2 bytes)
        LDR     h0, [x3], 2
        LDR     q16, [x5], 16
        LDR     h1, [x9], 2
        LDR     h2, [x10], 2
        LDR     h3, [x11], 2
        LDR     h4, [x12], 2
        LDR     h5, [x4], 2
        FMLA    v20.8h, v16.8h, v0.h[0]
        FMLA    v22.8h, v16.8h, v1.h[0]
        FMLA    v24.8h, v16.8h, v2.h[0]
        FMLA    v26.8h, v16.8h, v3.h[0]
        FMLA    v28.8h, v16.8h, v4.h[0]
        FMLA    v30.8h, v16.8h, v5.h[0]
        B       4b

        # Store odd width
        # Bits 2..0 of x1 select 4-, 2- and 1-halffloat stores.  After each
        # partial store the upper part of the register is moved down so the
        # next store writes the following columns.
8:
        TBZ     x1, 2, 9f
        $if INC:
          STR     d30, [x7], 8
          DUP     d30, v30.d[1]
          STR     d28, [x13], 8
          DUP     d28, v28.d[1]
          STR     d26, [x8], 8
          DUP     d26, v26.d[1]
          STR     d24, [x17], 8
          DUP     d24, v24.d[1]
          STR     d22, [x16], 8
          DUP     d22, v22.d[1]
          STR     d20, [x6], 8
          DUP     d20, v20.d[1]
        $else:
          STR     d20, [x6], 8
          DUP     d20, v20.d[1]
          STR     d22, [x16], 8
          DUP     d22, v22.d[1]
          STR     d24, [x17], 8
          DUP     d24, v24.d[1]
          STR     d26, [x8], 8
          DUP     d26, v26.d[1]
          STR     d28, [x13], 8
          DUP     d28, v28.d[1]
          STR     d30, [x7], 8
          DUP     d30, v30.d[1]

9:
        TBZ     x1, 1, 10f
        $if INC:
          STR     s30, [x7], 4
          DUP     s30, v30.s[1]
          STR     s28, [x13], 4
          DUP     s28, v28.s[1]
          STR     s26, [x8], 4
          DUP     s26, v26.s[1]
          STR     s24, [x17], 4
          DUP     s24, v24.s[1]
          STR     s22, [x16], 4
          DUP     s22, v22.s[1]
          STR     s20, [x6], 4
          DUP     s20, v20.s[1]
        $else:
          STR     s20, [x6], 4
          DUP     s20, v20.s[1]
          STR     s22, [x16], 4
          DUP     s22, v22.s[1]
          STR     s24, [x17], 4
          DUP     s24, v24.s[1]
          STR     s26, [x8], 4
          DUP     s26, v26.s[1]
          STR     s28, [x13], 4
          DUP     s28, v28.s[1]
          STR     s30, [x7], 4
          DUP     s30, v30.s[1]

10:
        TBZ     x1, 0, 11f
        $if INC:
          STR     h30, [x7]
          STR     h28, [x13]
          STR     h26, [x8]
          STR     h24, [x17]
          STR     h22, [x16]
          STR     h20, [x6]
        $else:
          STR     h20, [x6]
          STR     h22, [x16]
          STR     h24, [x17]
          STR     h26, [x8]
          STR     h28, [x13]
          STR     h30, [x7]
11:
        RET

END_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64
374
375#ifdef __ELF__
376.section ".note.GNU-stack","",%progbits
377#endif