// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/4x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8

# Overview of the code below:
#   - Every pass of the nc loop seeds a 4 row x 8 column accumulator tile
#     from the 8 floats at w, then walks ks bytes of A pointers, 4 pointers
#     (32 bytes) per pass of the ks loop.
#   - An A pointer equal to zero is used unchanged; any other A pointer has
#     a_offset added to it.
#   - kc is counted in bytes: 8 floats per main loop pass, followed by
#     4 / 2 / 1 float remainders.
#   - The tile is clamped with the two floats at params (FMAX against the
#     first, FMIN against the second) and stored as 8 columns per row, or
#     as 4 / 2 / 1 columns for a final partial tile.
#   - x0 holds mr only until the C pointers are clamped; afterwards it is
#     reused as the k byte counter. x9 is the working copy of ks.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
# x20 a0
# x13 a1
# x14 a2
# x15 a3

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x7  c3 / cm_stride

# Vector register usage
# A0   v0  v4
# A1   v1  v5
# A2   v2  v6
# A3   v3  v7
# B    v8  v9 v10 v11
# B   v12 v13 v14 v15
# B   v20 v21 v22 v23
# B   v24 v25 v26 v27
# C   v16 v17
# C   v18 v19
# C   v28 v29
# C   v30 v31
# Clamp v4 v5
# Note: v4 and v5 double as A0 / A1 registers in the main loop and epilogue,
# so the clamp values are reloaded from params at the end of the epilogue.

BEGIN_FUNCTION xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57

        # Load cn_stride, a_offset
        # Stack arguments are read before sp is moved by the register save.
        LDP x10, x11, [sp]

        # Load zero, clamping params pointer
        LDP x12, x8, [sp, 16]

        # Load min/max values
        # v4 = first float at params (used by FMAX), v5 = second (used by FMIN)
        LD2R {v4.4s, v5.4s}, [x8]

        # Save x20 on stack
        # Reserves 80 bytes: x20 at [sp], d8-d15 at [sp + 16] .. [sp + 79]
        STR x20, [sp, -80]!

        # Save d8-d15 on stack
        STP d8, d9, [sp, 16]
        STP d10, d11, [sp, 32]
        STP d12, d13, [sp, 48]
        STP d14, d15, [sp, 64]

        # Clamp C pointers
        # Rows at or beyond mr alias the previous row pointer.
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x7, x17, x7          // c3 = c2 + cm_stride
        CSEL x7, x17, x7, LO     //   c3 = c2

0:
        # nc loop entry
        # Load initial bias from w into accumulators
        # The same 8 bias floats seed all 4 rows.
        LDP q16, q17, [x5], 32
        MOV v18.16b, v16.16b
        MOV v19.16b, v17.16b
        MOV v28.16b, v16.16b
        MOV v29.16b, v17.16b
        MOV v30.16b, v16.16b
        MOV v31.16b, v17.16b

        MOV x9, x3  // p = ks

1:
        # ks loop entry
        # Load next 4 A pointers
        LDP x20, x13, [x4], 16
        LDP x14, x15, [x4], 16

        # Each pointer gets a_offset added unless it equals zero.
        CMP x20, x12            // if a0 == zero
        ADD x20, x20, x11       // a0 += a_offset
        CSEL x20, x12, x20, EQ  //   a0 = zero, else a0 + a_offset
        CMP x13, x12            // if a1 == zero
        ADD x13, x13, x11       // a1 += a_offset
        CSEL x13, x12, x13, EQ  //   a1 = zero, else a1 + a_offset
        CMP x14, x12            // if a2 == zero
        ADD x14, x14, x11       // a2 += a_offset
        CSEL x14, x12, x14, EQ  //   a2 = zero, else a2 + a_offset
        CMP x15, x12            // if a3 == zero
        ADD x15, x15, x11       // a3 += a_offset
        CSEL x15, x12, x15, EQ  //   a3 = zero, else a3 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f

        # Prologue - 16 bytes (4 floats) of A per row
        # Read first block of 4 A and B.
        LDR q0, [x20], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x13], 16
        LDR q2, [x14], 16
        LDR q3, [x15], 16
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32

        # Are there at least 32 more bytes (8 floats), that is kc >= 64 bytes?
        # Yes: run the main loop. No: go straight to the epilogue.
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A
2:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x20], 16       // overwrites clamp value in v4
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x13], 16       // overwrites clamp value in v5
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x14], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x15], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v8.4s, v4.s[0]
        LDP q20, q21, [x5], 32
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        LDP q22, q23, [x5], 32
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        LDP q24, q25, [x5], 32
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        LDP q26, q27, [x5], 32
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        LDR q0, [x20], 16
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        LDR q1, [x13], 16
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        LDR q2, [x14], 16
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        LDR q3, [x15], 16
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]
        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]
        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        SUBS x0, x0, 32         // k -= 32; FMLA below leaves flags intact
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

        B.HS 2b

3:
        # Epilogue
        # Consumes the block already loaded by the prologue or the last main
        # loop pass, plus one more block of 4, without loading a further one.
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x20], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x13], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x14], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x15], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, no loads
        FMLA v16.4s, v8.4s, v4.s[0]
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]

        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]

        # Load min/max values
        # The four FMLA above are the last readers of v4 / v5 as A data; the
        # remaining FMLA use v6 / v7, so the clamp values can be restored here.
        LD2R {v4.4s, v5.4s}, [x8]

        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

4:
        # Remainder- 4 floats of A
        # Only multiples of 32 were subtracted from kc, so bits 4..2 of x0
        # still match kc. Bit 4 set means 16 bytes (4 floats) remain.
        TBZ x0, 4, 5f

        LDR q0, [x20], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x13], 16
        LDR q2, [x14], 16
        LDR q3, [x15], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q24, q25, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        LDP q26, q27, [x5], 32
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

5:
        # Remainder- 2 floats of A
        # Bit 3 set means 8 bytes (2 floats) remain.
        TBZ x0, 3, 6f

        LDR d0, [x20], 8
        LDP q20, q21, [x5], 32
        LDR d1, [x13], 8
        LDR d2, [x14], 8
        LDR d3, [x15], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]

6:
        # Remainder- 1 float of A
        # Bit 2 set means 4 bytes (1 float) remain.
        TBZ x0, 2, 7f

        LDR s0, [x20], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x13], 4
        LDR s2, [x14], 4
        LDR s3, [x15], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]

7:
        # ks loop
        SUBS x9, x9, 32  // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        FMAX v16.4s, v16.4s, v4.4s
        FMAX v17.4s, v17.4s, v4.4s
        FMAX v18.4s, v18.4s, v4.4s
        FMAX v19.4s, v19.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s
        FMIN v18.4s, v18.4s, v5.4s
        FMIN v19.4s, v19.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        SUBS x1, x1, 8          // nc -= 8; flags also drive B.HI 0b below
        B.LO 8f

        # Rows are written from c3 down to c0. When mr < 4 the clamped row
        # pointers alias, so the lowest aliased row is the one written last.
        STP q30, q31, [x7]
        ADD x7, x7, x10
        STP q28, q29, [x17]
        ADD x17, x17, x10
        STP q18, q19, [x16]
        ADD x16, x16, x10
        STP q16, q17, [x6]
        ADD x6, x6, x10

        SUB x4, x4, x3  // a -= ks

        # nc loop
        # STP / ADD / SUB above do not write flags; this tests SUBS x1, x1, 8.
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 64]
        LDP d12, d13, [sp, 48]
        LDP d10, d11, [sp, 32]
        LDP d8, d9, [sp, 16]

        # Restore x20 from stack
        LDR x20, [sp], 80
        RET

        # Store odd width
        # x1 = nc - 8 here; its low 3 bits equal those of the remaining nc.
8:
        # 4 columns: store the low vector, then move the high vector down.
        TBZ x1, 2, 9f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x17], 16
        MOV v28.16b, v29.16b
        STR q18, [x16], 16
        MOV v18.16b, v19.16b
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

9:
        # 2 columns: store the low half, then move the high half down.
        TBZ x1, 1, 10f
        STR d30, [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x17], 8
        DUP d28, v28.d[1]
        STR d18, [x16], 8
        DUP d18, v18.d[1]
        STR d16, [x6], 8
        DUP d16, v16.d[1]

10:
        # 1 column
        TBZ x1, 0, 11f
        STR s30, [x7]
        STR s28, [x17]
        STR s18, [x16]
        STR s16, [x6]
11:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 64]
        LDP d12, d13, [sp, 48]
        LDP d10, d11, [sp, 32]
        LDP d8, d9, [sp, 16]

        # Restore x20 from stack
        LDR x20, [sp], 80
        RET

END_FUNCTION xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif