# Copyright 2019 Google LLC
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Project assembly helpers (BEGIN_FUNCTION / END_FUNCTION are presumably
# defined in this header; they are not defined in this file).
#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# d8-d15 need to be preserved if used.
# x19-x30 need to be preserved if used.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# Vector register usage
# A0   v0  v4
# A1   v1  v5
# A2   v2  v6
# A3   v3  v7
# B    v8  v9 v10 v11
# B   v12 v13 v14 v15
# B   v20 v21 v22 v23
# B   v24 v25 v26 v27
# C   v16 v17
# C   v18 v19
# C   v28 v29
# C   v30 v31
# Clamp v4 v5

# Computes a 4-row by 8-column tile of C from A (4 rows) and packed weights w,
# clamps it, stores it, and repeats for each block of 8 columns until nc is
# exhausted.
#
# Per 8-column block (label 0):
#   - accumulators start from the bias stored in w (GEMM) or from acc (GEMMINC)
#   - label 1: software-pipelined main loop, 8 floats of K per iteration
#   - label 2: epilogue that drains the pipeline (last 8 floats, no new loads)
#   - labels 3, 4, 5: K remainders of 4, 2 and 1 floats
#   - label 6: clamp, then full-width store or the 4/2/1-column tail (7, 8, 9)
#
# Register notes:
#   - x0 carries mr on entry and is recycled as the K byte counter once the
#     row pointers have been clamped.
#   - v4 and v5 hold the clamp bounds (FMIN uses v4, FMAX uses v5), but they
#     are also the second set of A0/A1 registers inside the main loop and the
#     epilogue, so the epilogue reloads them from params.
#   - v8-v15 are used for B, so d8-d15 are saved on entry and restored before
#     each RET.
BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x8__aarch64_neonfma_cortex_a75

        # Stack arguments are read before sp is moved by the d8-d15 save below.
        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load clamping_params values: two consecutive floats at params are
        # broadcast to all lanes of v4 and v5 respectively.
        LD2R {v4.4s, v5.4s}, [x8]

        # Save d8-d15 on stack (64 bytes, allocated by the pre-indexed store)
        STP d8, d9, [sp, -64]!
        STP d10, d11, [sp, 16]
        STP d12, d13, [sp, 32]
        STP d14, d15, [sp, 48]

        # Clamp A and C pointers: rows at index >= mr alias the previous row,
        # so they read and write valid memory.
        ADD x11, x3, x4  // a1 = a0 + a_stride
        ADD x9, x6, x7  // c1 = c0 + cm_stride
        CMP x0, 2  // if mr < 2
        CSEL x11, x3, x11, LO  //   a1 = a0
        CSEL x9, x6, x9, LO  //   c1 = c0

        ADD x12, x11, x4  // a2 = a1 + a_stride
        ADD x10, x9, x7  // c2 = c1 + cm_stride
        // if mr <= 2  (flags are still those of CMP x0, 2; ADD does not set them)
        CSEL x12, x11, x12, LS  //   a2 = a1
        CSEL x10, x9, x10, LS  //   c2 = c1

        ADD x4, x12, x4  // a3 = a2 + a_stride
        ADD x7, x10, x7  // c3 = c2 + cm_stride
        CMP x0, 4  // if mr < 4
        CSEL x4, x12, x4, LO  //   a3 = a2
        CSEL x7, x10, x7, LO  //   c3 = c2

        # Start of one 8-column block.
0:
        $if INC:
          # Load initial accumulators
          LDP q16, q17, [x15], 32
          LDP q18, q19, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          # (the same 8 bias floats seed all four rows)
          LDP q16, q17, [x5], 32
          MOV v18.16b, v16.16b
          MOV v19.16b, v17.16b
          MOV v28.16b, v16.16b
          MOV v29.16b, v17.16b
          MOV v30.16b, v16.16b
          MOV v31.16b, v17.16b

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        # From here on x0 is the K byte counter; mr is no longer needed.
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 3f

        # Prologue - 16 bytes of A per row.
        # Read first block of 4 A and B.
        LDR q0, [x3], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3, [x4], 16
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32

        # Is there at least 32 more bytes? If yes, do the main loop;
        # otherwise go straight to the epilogue.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
1:
        # First block of 4. FMA for first 4, loads for 2nd block of 4.
        # The loads into v4/v5 overwrite the clamp values.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x3], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x11], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x12], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x4], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        PRFM PLDL1KEEP, [x5, 128]  // prefetch upcoming packed weights
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        PRFM PLDL1KEEP, [x5, 192]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        PRFM PLDL1KEEP, [x5, 320]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4. FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v8.4s, v4.s[0]
        LDP q20, q21, [x5], 32
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        LDP q22, q23, [x5], 32
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        LDP q24, q25, [x5], 32
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        LDP q26, q27, [x5], 32
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        LDR q0, [x3], 16
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        LDR q1, [x11], 16
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        LDR q2, [x12], 16
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        LDR q3, [x4], 16
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]
        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]
        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        SUBS x0, x0, 32  // flags consumed by B.HS below; FMLA leaves them alone
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]
        B.HS 1b

2:
        # Epilogue
        # First block of 4. FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x3], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x11], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x12], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x4], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4. FMA for second 4, no loads.
        FMLA v16.4s, v8.4s, v4.s[0]
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        FMLA v31.4s, v9.4s, v7.s[0]

        FMLA v16.4s, v10.4s, v4.s[1]
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        FMLA v31.4s, v11.4s, v7.s[1]

        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]

        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]

        # Load clamping_params values.
        # The FMLAs above were the last readers of v4/v5 as A data, so the
        # clamp values can be restored here while the v6/v7 FMLAs complete.
        LD2R {v4.4s, v5.4s}, [x8]

        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

        # Remainders. Only multiples of 32 were subtracted from kc, so bits
        # 4, 3 and 2 of x0 still equal those of kc. When kc < 32 the main loop
        # and epilogue were skipped and v4/v5 still hold the clamp values.
3:
        # Remainder- 4 floats of A (16 bytes)
        TBZ x0, 4, 4f

        LDR q0, [x3], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3, [x4], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q24, q25, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        LDP q26, q27, [x5], 32
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

4:
        # Remainder- 2 floats of A (8 bytes)
        TBZ x0, 3, 5f

        LDR d0, [x3], 8
        LDP q20, q21, [x5], 32
        LDR d1, [x11], 8
        LDR d2, [x12], 8
        LDR d3, [x4], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]

5:
        # Remainder- 1 float of A (4 bytes)
        TBZ x0, 2, 6f

        LDR s0, [x3], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x11], 4
        LDR s2, [x12], 4
        LDR s3, [x4], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]

6:
        # Clamp: upper bound from v4 first, then lower bound from v5
        FMIN v16.4s, v16.4s, v4.4s
        FMIN v17.4s, v17.4s, v4.4s
        FMIN v18.4s, v18.4s, v4.4s
        FMIN v19.4s, v19.4s, v4.4s
        FMIN v28.4s, v28.4s, v4.4s
        FMIN v29.4s, v29.4s, v4.4s
        FMIN v30.4s, v30.4s, v4.4s
        FMIN v31.4s, v31.4s, v4.4s
        FMAX v16.4s, v16.4s, v5.4s
        FMAX v17.4s, v17.4s, v5.4s
        FMAX v18.4s, v18.4s, v5.4s
        FMAX v19.4s, v19.4s, v5.4s
        FMAX v28.4s, v28.4s, v5.4s
        FMAX v29.4s, v29.4s, v5.4s
        FMAX v30.4s, v30.4s, v5.4s
        FMAX v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        CMP x1, 8
        B.LO 7f

        # Rows are written c3 first, c0 last; each C pointer then advances by
        # cn_stride to the next 8-column block.
        STP q30, q31, [x7]
        ADD x7, x7, x14
        STP q28, q29, [x10]
        ADD x10, x10, x14
        STP q18, q19, [x9]
        ADD x9, x9, x14
        STP q16, q17, [x6]
        ADD x6, x6, x14

        # Rewind the A pointers; w (x5) is not rewound and keeps advancing
        # into the data for the next column block.
        SUB x3, x3, x2  // a0 -= kc
        SUB x11, x11, x2  // a1 -= kc
        SUB x12, x12, x2  // a2 -= kc
        SUB x4, x4, x2  // a3 -= kc

        SUBS x1, x1, 8  // nc -= 8
        B.HI 0b  // loop while columns remain

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

        # Store odd width: nc < 8 here, so bits 2, 1 and 0 of nc select a
        # 4-, 2- and 1-column store. After each partial store the remaining
        # columns are shifted down into the low lanes.
7:
        TBZ x1, 2, 8f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x10], 16
        MOV v28.16b, v29.16b
        STR q18, [x9], 16
        MOV v18.16b, v19.16b
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

8:
        TBZ x1, 1, 9f
        STR d30, [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x10], 8
        DUP d28, v28.d[1]
        STR d18, [x9], 8
        DUP d18, v18.d[1]
        STR d16, [x6], 8
        DUP d16, v16.d[1]

9:
        TBZ x1, 0, 10f
        STR s30, [x7]
        STR s28, [x10]
        STR s18, [x9]
        STR s16, [x6]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x8__aarch64_neonfma_cortex_a75

# Emit an empty .note.GNU-stack section on ELF targets so that this object is
# not treated as requiring an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif