// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 8] -> x8

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# Vector register usage
# A0     v0  v4
# A1     v1  v5
# A2     v2  v6
# A3     v3  v7
# B      v8  v9 v10 v11
# B     v12 v13 v14 v15
# B     v20 v21 v22 v23
# B     v24 v25 v26 v27
# C     v16 v17
# C     v18 v19
# C     v28 v29
# C     v30 v31
# Clamp  v4  v5

BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75

        # Overview
        #   Accumulates a tile of up to 4 rows x 8 columns of floats:
        #     row 0 -> v16 v17, row 1 -> v18 v19, row 2 -> v28 v29, row 3 -> v30 v31.
        #   For every 8-column tile the accumulators are seeded from the first
        #   8 floats at w (x5); then each float of A is multiplied by the next
        #   8 floats read from w.  kc (x2) is consumed in bytes, nc (x1) in columns.
        #   Results are clamped with v4 (FMAX, lower bound) and v5 (FMIN, upper
        #   bound) before being stored.

        # Load cn_stride, params pointer
        LDP x14, x8, [sp]

        # Load min/max values (v4 = first float at params, v5 = second)
        LD2R {v4.4s, v5.4s}, [x8]

        # Save d8-d15 on stack (v8-v15 are used for B below)
        STP  d8,  d9, [sp, -64]!
        STP d10, d11, [sp, 16]
        STP d12, d13, [sp, 32]
        STP d14, d15, [sp, 48]

        # Clamp A and C pointers: rows beyond mr alias the previous row
        CMP x0, 2                   // if mr < 2
        ADD x11, x3, x4             // a1 = a0 + a_stride
        ADD x9, x6, x7              // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO       //   a1 = a0
        CSEL x9, x6, x9, LO         //   c1 = c0

        ADD x12, x11, x4            // a2 = a1 + a_stride
        ADD x10, x9, x7             // c2 = c1 + cm_stride
                                    // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x12, x11, x12, LS      //   a2 = a1
        CSEL x10, x9, x10, LS       //   c2 = c1

        CMP x0, 4                   // if mr < 4
        ADD x4, x12, x4             // a3 = a2 + a_stride
        ADD x7, x10, x7             // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO        //   a3 = a2
        CSEL x7, x10, x7, LO        //   c3 = c2

0:
        # Load initial bias from w into accumulators
        LDP q16, q17, [x5], 32
        MOV v18.16b, v16.16b
        MOV v19.16b, v17.16b
        MOV v28.16b, v16.16b
        MOV v29.16b, v17.16b
        MOV v30.16b, v16.16b
        MOV v31.16b, v17.16b

        # Are there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32             // k = kc - 32
        B.LO 3f

        # Prologue - 16 bytes of A per row
        # Read first block of 4 A and B.
        LDR q0, [x3], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3, [x4], 16
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32

        # Are there at least 32 more bytes?  If yes, run the main loop.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # Note: the second A block lives in v4-v7, so the clamp values in
        # v4/v5 are overwritten here and reloaded at the end of the epilogue.
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x3], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x11], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x12], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x4], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        PRFM PLDL1KEEP, [x5, 128]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        PRFM PLDL1KEEP, [x5, 192]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        PRFM PLDL1KEEP, [x5, 320]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v8.4s, v4.s[0]
        LDP q20, q21, [x5], 32
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        LDP q22, q23, [x5], 32
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        LDP q24, q25, [x5], 32
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        LDP q26, q27, [x5], 32
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        LDR q0, [x3], 16
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        LDR q1, [x11], 16
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        LDR q2, [x12], 16
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        LDR q3, [x4], 16
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]
        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]
        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        SUBS x0, x0, 32
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]
        B.HS 1b

2:
        # Epilogue
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x3], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x11], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x12], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x4], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, no loads.
        FMLA v16.4s, v8.4s, v4.s[0]
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        FMLA v31.4s, v9.4s, v7.s[0]

        FMLA v16.4s, v10.4s, v4.s[1]
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        FMLA v31.4s, v11.4s, v7.s[1]

        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]

        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]

        # Reload min/max values: v4/v5 held A data above and are no longer
        # read as A operands after this point (the remaining FMAs use v6/v7).
        LD2R {v4.4s, v5.4s}, [x8]

        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

3:
        # Remainder - 4 floats of A (16 bytes), taken if bit 4 of k is set
        TBZ x0, 4, 4f

        LDR q0, [x3], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3, [x4], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q24, q25, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        LDP q26, q27, [x5], 32
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

4:
        # Remainder - 2 floats of A (8 bytes), taken if bit 3 of k is set
        TBZ x0, 3, 5f

        LDR d0, [x3], 8
        LDP q20, q21, [x5], 32
        LDR d1, [x11], 8
        LDR d2, [x12], 8
        LDR d3, [x4], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]

5:
        # Remainder - 1 float of A (4 bytes), taken if bit 2 of k is set
        TBZ x0, 2, 6f

        LDR s0, [x3], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x11], 4
        LDR s2, [x12], 4
        LDR s3, [x4], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]

6:
        # Clamp.  The SUBS below sets the flags consumed by B.LO 7f and
        # B.HI 0b; none of the instructions in between modify the flags.
        FMAX v16.4s, v16.4s, v4.4s
        SUBS x1, x1, 8              // nc -= 8
        FMAX v17.4s, v17.4s, v4.4s
        FMAX v18.4s, v18.4s, v4.4s
        FMAX v19.4s, v19.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s
        FMIN v18.4s, v18.4s, v5.4s
        FMIN v19.4s, v19.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Fewer than 8 columns left: store a partial tile instead
        B.LO 7f

        # Store full 4 x 8, advance C by cn_stride and rewind A for the next tile
        STP q16, q17, [x6]
        SUB x3, x3, x2              // a0 -= kc
        ADD x6, x6, x14             // c0 += cn_stride
        STP q18, q19, [x9]
        SUB x11, x11, x2            // a1 -= kc
        ADD x9, x9, x14             // c1 += cn_stride
        STP q28, q29, [x10]
        SUB x12, x12, x2            // a2 -= kc
        ADD x10, x10, x14           // c2 += cn_stride
        STP q30, q31, [x7]
        SUB x4, x4, x2              // a3 -= kc
        ADD x7, x7, x14             // c3 += cn_stride

        # Columns remain (nc > 0 after the subtraction): next tile
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

        # Store odd width: bits 2, 1 and 0 of x1 select 4, 2 and 1 columns
7:
        TBZ x1, 2, 8f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b
        STR q18, [x9], 16
        MOV v18.16b, v19.16b
        STR q28, [x10], 16
        MOV v28.16b, v29.16b
        STR q30, [x7], 16
        MOV v30.16b, v31.16b

8:
        TBZ x1, 1, 9f
        STR d16, [x6], 8
        DUP d16, v16.d[1]
        STR d18, [x9], 8
        DUP d18, v18.d[1]
        STR d28, [x10], 8
        DUP d28, v28.d[1]
        STR d30, [x7], 8
        DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        STR s16, [x6]
        STR s18, [x9]
        STR s28, [x10]
        STR s30, [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75

// On ELF targets, emit an empty .note.GNU-stack section so the linker does
// not mark the stack executable on account of this object file.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif