// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
#include <xnnpack/assembly.h>

# void xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Summary
#   Each pass of the outer loop (label 0) produces one tile of 4 rows by
#   8 halffloat columns:
#     1. the four accumulators are seeded from the bias at the head of w, or
#        from acc in the INC variant;
#     2. A * B is accumulated over kc bytes of each A row;
#     3. the result is multiplied by v4 and clamped to [v5, v6];
#     4. the tile is stored and nc is decreased by 8.
#   kc is counted in bytes (2 bytes per halffloat); nc is counted in columns.
#   This is a leaf function: it does not touch sp and uses only x0-x15 and
#   v0-v6, v16-v30, so nothing has to be saved or restored.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# B   v20 v21 v22 v23
# C   v16
# C   v18
# C   v28
# C   v30
# Clamp v4, v5, v6
# unused A   v7 v8 v9 v10 v11
# unused B   v19

BEGIN_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64

        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load params values, each replicated to all 8 lanes:
        #   v4 = scale (FMUL operand), v5 = lower bound (FMAX), v6 = upper bound (FMIN)
        LD3R {v4.8h, v5.8h, v6.8h}, [x8]

        # Clamp A and C pointers.
        # A row whose index is >= mr reuses the pointers of the previous row, so
        # it reads and writes the same memory as that row.
        CMP x0, 2                // if mr < 2
        ADD x11, x3, x4          // a1 = a0 + a_stride
        ADD x9, x6, x7           // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO    //   a1 = a0
        CSEL x9, x6, x9, LO      //   c1 = c0

        # The ADDs do not set flags, so LS below still tests the CMP x0, 2 above.
        ADD x12, x11, x4         // a2 = a1 + a_stride
        ADD x10, x9, x7          // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x12, x11, x12, LS   //   a2 = a1
        CSEL x10, x9, x10, LS    //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x4, x12, x4          // a3 = a2 + a_stride
        ADD x7, x10, x7          // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO     //   a3 = a2
        CSEL x7, x10, x7, LO     //   c3 = c2

        # Outer loop: one iteration per group of 8 output columns.
0:
        $if INC:
          # Load initial accumulators
          LDP q16, q18, [x15], 32
          LDP q28, q30, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          LDR q16, [x5], 16
          MOV v18.16b, v16.16b
          MOV v28.16b, v16.16b
          MOV v30.16b, v16.16b

        # Are there at least 4 halffloats (8 bytes)?
        # mr is no longer needed, so x0 is reused as the k byte counter.
        SUBS x0, x2, 8  // k = kc - 8
        B.LO 3f

        # Main loop - 4 halffloats of A (8 bytes)
        # Per iteration: 8 bytes from each A row and 4 x 16 bytes of B.
1:
        LDR  d0,  [x3], 8
        LDR q20,  [x5], 16
        LDR q21,  [x5], 16
        LDR  d1, [x11], 8
        LDR  d2, [x12], 8
        LDR  d3,  [x4], 8
        SUBS x0, x0, 8           // flags consumed by B.HS at the loop bottom
        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v18.8h, v20.8h, v1.h[0]
        FMLA v28.8h, v20.8h, v2.h[0]
        FMLA v30.8h, v20.8h, v3.h[0]
        LDR q22,  [x5], 16
        LDR q23,  [x5], 16

        FMLA v16.8h, v21.8h, v0.h[1]
        FMLA v18.8h, v21.8h, v1.h[1]
        FMLA v28.8h, v21.8h, v2.h[1]
        FMLA v30.8h, v21.8h, v3.h[1]

        FMLA v16.8h, v22.8h, v0.h[2]
        FMLA v18.8h, v22.8h, v1.h[2]
        FMLA v28.8h, v22.8h, v2.h[2]
        FMLA v30.8h, v22.8h, v3.h[2]

        FMLA v16.8h, v23.8h, v0.h[3]
        FMLA v18.8h, v23.8h, v1.h[3]
        FMLA v28.8h, v23.8h, v2.h[3]
        FMLA v30.8h, v23.8h, v3.h[3]
        B.HS 1b

        # x0 is now below zero, but only multiples of 8 were subtracted from kc,
        # so bits 2 and 1 of x0 still equal bits 2 and 1 of kc.
        # Is there a remainder?- 2 halffloats of A (4 bytes)
        TBNZ x0, 2, 4f
        # Is there a remainder?- 1 halffloat of A (2 bytes)
        TBNZ x0, 1, 5f
2:
        # Scale and Clamp
        # SUBS sets the flags used by B.LO and B.HI below; the vector
        # instructions and the ST1/SUB pairs in between do not modify them.
        FMUL v16.8h, v16.8h, v4.8h
        SUBS x1, x1, 8           // nc -= 8
        FMUL v18.8h, v18.8h, v4.8h
        FMUL v28.8h, v28.8h, v4.8h
        FMUL v30.8h, v30.8h, v4.8h
        FMAX v16.8h, v16.8h, v5.8h
        FMAX v18.8h, v18.8h, v5.8h
        FMAX v28.8h, v28.8h, v5.8h
        FMAX v30.8h, v30.8h, v5.8h
        FMIN v16.8h, v16.8h, v6.8h
        FMIN v18.8h, v18.8h, v6.8h
        FMIN v28.8h, v28.8h, v6.8h
        FMIN v30.8h, v30.8h, v6.8h

        # Store full 4 x 8
        # Fewer than 8 columns were left before the SUBS: take the partial store path.
        B.LO 6f

        # Each C pointer advances by cn_stride and each A pointer is rewound by kc
        # for the next group of columns. The INC variant stores rows from c3 down
        # to c0, the other variant from c0 up to c3.
        $if INC:
          ST1 {v30.16b},  [x7], x14
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v28.16b}, [x10], x14
          SUB x11, x11, x2 // a1 -= kc
          ST1 {v18.16b},  [x9], x14
          SUB x12, x12, x2 // a2 -= kc
          ST1 {v16.16b},  [x6], x14
          SUB  x4,  x4, x2 // a3 -= kc
        $else:
          ST1 {v16.16b},  [x6], x14
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v18.16b},  [x9], x14
          SUB x11, x11, x2 // a1 -= kc
          ST1 {v28.16b}, [x10], x14
          SUB x12, x12, x2 // a2 -= kc
          ST1 {v30.16b},  [x7], x14
          SUB  x4,  x4, x2 // a3 -= kc

        # Loop while columns remain (nc > 0 after the SUBS above).
        B.HI 0b
        RET

        # Entry for kc < 8 bytes. When bit 2 of k is clear the code goes straight
        # to the 1 halffloat remainder without testing bit 1.
3:
        TBZ x0, 2, 5f
4:
        # Remainder- 2 halffloats of A (4 bytes)
        LDR  s0,  [x3], 4
        LDR q20,  [x5], 16
        LDR q21,  [x5], 16
        LDR  s1, [x11], 4
        LDR  s2, [x12], 4
        LDR  s3,  [x4], 4

        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v18.8h, v20.8h, v1.h[0]
        FMLA v28.8h, v20.8h, v2.h[0]
        FMLA v30.8h, v20.8h, v3.h[0]

        FMLA v16.8h, v21.8h, v0.h[1]
        FMLA v18.8h, v21.8h, v1.h[1]
        FMLA v28.8h, v21.8h, v2.h[1]
        FMLA v30.8h, v21.8h, v3.h[1]

        # No 2 byte remainder: go scale, clamp and store.
        TBZ x0, 1, 2b

5:
        # Remainder- 1 halffloat of A (2 bytes)
        LDR  h0,  [x3], 2
        LDR q20,  [x5], 16
        LDR  h1, [x11], 2
        LDR  h2, [x12], 2
        LDR  h3,  [x4], 2
        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v18.8h, v20.8h, v1.h[0]
        FMLA v28.8h, v20.8h, v2.h[0]
        FMLA v30.8h, v20.8h, v3.h[0]
        B 2b

        # Store odd width
        # x1 holds nc - 8 here; its low 3 bits equal those of the remaining
        # column count. Bit 2 set: store 4 columns (8 bytes) per row, then move
        # the upper half of each accumulator down for the following steps.
6:
        TBZ x1, 2, 7f
        $if INC:
          STR d30,  [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x10], 8
          DUP d28, v28.d[1]
          STR d18,  [x9], 8
          DUP d18, v18.d[1]
          STR d16,  [x6], 8
          DUP d16, v16.d[1]
        $else:
          STR d16,  [x6], 8
          DUP d16, v16.d[1]
          STR d18,  [x9], 8
          DUP d18, v18.d[1]
          STR d28, [x10], 8
          DUP d28, v28.d[1]
          STR d30,  [x7], 8
          DUP d30, v30.d[1]

        # Bit 1 set: store 2 columns (4 bytes) per row, then shift the next pair down.
7:
        TBZ x1, 1, 8f
        $if INC:
          STR s30,  [x7], 4
          DUP s30, v30.s[1]
          STR s28, [x10], 4
          DUP s28, v28.s[1]
          STR s18,  [x9], 4
          DUP s18, v18.s[1]
          STR s16,  [x6], 4
          DUP s16, v16.s[1]
        $else:
          STR s16,  [x6], 4
          DUP s16, v16.s[1]
          STR s18,  [x9], 4
          DUP s18, v18.s[1]
          STR s28, [x10], 4
          DUP s28, v28.s[1]
          STR s30,  [x7], 4
          DUP s30, v30.s[1]

        # Bit 0 set: store the final single column (2 bytes) per row.
8:
        TBZ x1, 0, 9f
        $if INC:
          STR h30,  [x7]
          STR h28, [x10]
          STR h18,  [x9]
          STR h16,  [x6]
        $else:
          STR h16,  [x6]
          STR h18,  [x9]
          STR h28, [x10]
          STR h30,  [x7]
9:
        RET

END_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64
// Mark the stack as non-executable when assembling for ELF targets.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif