// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> (x8)
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

# Kernel body: 6 rows x 8 columns of f32 output per tile, 64-bit loads of A.
#
# Outline, as implemented below:
#   1. Derive a1..a5 / c1..c5 from a0 / c0. Rows at or beyond mr alias the
#      previous row's pointers.
#   2. Label 0: start a new 8-column tile. Accumulators v20-v31 come from acc
#      (INC variant) or from the bias read from w.
#   3. Label 1: main loop, 2 floats (8 bytes) of every A row per iteration.
#      Label 3: remainder step for 1 float (4 bytes).
#   4. Label 2: clamp with v6 / v7, then either store 8 columns and loop back
#      to label 0, or go to labels 4-7 to store a final width of 4 / 2 / 1.
#
# Only x0-x17, v0-v7 and v16-v31 are written, and sp is only read.
BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_ld64

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        # Clamp A and C pointers
        # Each CMP feeds two pairs of CSELs: LO for the odd row, LS for the
        # even row that follows. Nothing in between writes the flags.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        # Load min/max values
        # LD2R replicates the two consecutive floats at [x8] into v6 and v7.
        LD2R {v6.4s, v7.4s}, [x8]

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        # x4 and x7 stop being strides here and become the a5 / c5 pointers.
        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
          PRFM PLDL1KEEP, [x5, 0]     // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          PRFM PLDL1KEEP, [x3]        // Prefetch A
          PRFM PLDL1KEEP, [x9]
          PRFM PLDL1KEEP, [x10]
          PRFM PLDL1KEEP, [x11]
          PRFM PLDL1KEEP, [x12]
          PRFM PLDL1KEEP, [x4]
        $else:
          # Load initial bias from w into accumulators
          # The 8 bias floats in v20 / v21 are copied to the other five rows.
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 0]     // Prefetch B
          MOV v23.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 64]
          MOV v24.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 128]
          MOV v25.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 192]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP, [x3]        // Prefetch A
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP, [x9]
          MOV v28.16b, v20.16b
          PRFM PLDL1KEEP, [x10]
          MOV v29.16b, v21.16b
          PRFM PLDL1KEEP, [x11]
          MOV v30.16b, v20.16b
          PRFM PLDL1KEEP, [x12]
          MOV v31.16b, v21.16b
          PRFM PLDL1KEEP, [x4]

        # Are there at least 2 floats (8 bytes) for the main loop?
        SUBS x0, x2, 8           // k = kc - 8
        B.LO 3f

        # Main loop - 2 floats of A (8 bytes)
        # 24 FMA + 6 LD64 A + 2 LDP B
1:
        LDR d0, [x3], 8
        LDP q16, q17, [x5], 32
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v1.s[0]
        FMLA v24.4s, v16.4s, v2.s[0]
        FMLA v26.4s, v16.4s, v3.s[0]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v16.4s, v4.s[0]
        FMLA v30.4s, v16.4s, v5.s[0]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v1.s[0]
        FMLA v25.4s, v17.4s, v2.s[0]
        FMLA v27.4s, v17.4s, v3.s[0]
        FMLA v29.4s, v17.4s, v4.s[0]
        FMLA v31.4s, v17.4s, v5.s[0]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v1.s[1]
        FMLA v24.4s, v18.4s, v2.s[1]
        FMLA v26.4s, v18.4s, v3.s[1]
        FMLA v28.4s, v18.4s, v4.s[1]
        FMLA v30.4s, v18.4s, v5.s[1]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v1.s[1]
        FMLA v25.4s, v19.4s, v2.s[1]
        FMLA v27.4s, v19.4s, v3.s[1]
        SUBS x0, x0, 8           // k -= 8; the FMLAs below leave the flags intact
        FMLA v29.4s, v19.4s, v4.s[1]
        FMLA v31.4s, v19.4s, v5.s[1]
        B.HS 1b

        # Is there a remainder? - 1 float of A (4 bytes)
        # x0 has gone negative here; bit 2 is set when 4 bytes of k are left.
        TBNZ x0, 2, 3f
2:
        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        LDR x0, [sp, 0]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        SUBS x1, x1, 8           // nc -= 8; flags are consumed by B.LO and B.HI below
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        # Fewer than 8 columns left: take the partial-width path instead.
        B.LO 4f

        # Each C pointer advances by cn_stride (x0); each A pointer is rewound
        # by kc so the next column tile re-reads the same rows.
        $if INC:
          # NOTE(review): rows are stored c5 first in this variant, presumably
          # so that row 0 is written last when clamped row pointers alias - confirm.
          ST1 {v30.16b, v31.16b},  [x7], x0
          SUB x3, x3, x2           // a0 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x0
          SUB x9, x9, x2           // a1 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB x10, x10, x2         // a2 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB x11, x11, x2         // a3 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB x12, x12, x2         // a4 -= kc
          ST1 {v20.16b, v21.16b},  [x6], x0
          SUB x4, x4, x2           // a5 -= kc
        $else:
          ST1 {v20.16b, v21.16b},  [x6], x0
          SUB x3, x3, x2           // a0 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB x9, x9, x2           // a1 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB x10, x10, x2         // a2 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB x11, x11, x2         // a3 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x0
          SUB x12, x12, x2         // a4 -= kc
          ST1 {v30.16b, v31.16b},  [x7], x0
          SUB x4, x4, x2           // a5 -= kc

        # Columns still remaining (nc > 0 after the SUBS above): next tile.
        B.HI 0b
        RET

3:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x3], 4
        LDP q16, q17, [x5], 32
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v1.s[0]
        FMLA v24.4s, v16.4s, v2.s[0]
        FMLA v26.4s, v16.4s, v3.s[0]
        FMLA v28.4s, v16.4s, v4.s[0]
        FMLA v30.4s, v16.4s, v5.s[0]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v1.s[0]
        FMLA v25.4s, v17.4s, v2.s[0]
        FMLA v27.4s, v17.4s, v3.s[0]
        FMLA v29.4s, v17.4s, v4.s[0]
        FMLA v31.4s, v17.4s, v5.s[0]
        B 2b

        # Store odd width
        # Bits 2, 1 and 0 of x1 select a 4-, 2- and 1-column store in turn.
4:
        TBZ x1, 2, 5f
        # Store 4 columns per row, then move the upper 4 columns down.
        $if INC:
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b

5:
        TBZ x1, 1, 6f
        # Store 2 columns per row, then move the upper 2 floats down.
        $if INC:
          STR d30,  [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30,  [x7], 8
          DUP d30, v30.d[1]

6:
        TBZ x1, 0, 7f
        # Store the last single column of each row.
        $if INC:
          STR s30,  [x7]
          STR s28, [x13]
          STR s26, [x14]
          STR s24, [x17]
          STR s22, [x16]
          STR s20,  [x6]
        $else:
          STR s20,  [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x14]
          STR s28, [x13]
          STR s30,  [x7]
7:
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_ld64

// On ELF targets, emit an empty .note.GNU-stack section so the GNU linker does
// not treat this object as requiring an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif