// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                          x0
#     size_t nc,                          x1
#     size_t kc,                          x2 / x0
#     size_t ks,                          x3 / x9
#     const float**restrict a,            x4
#     const float*restrict w,             x5
#     float*restrict c,                   x6
#     size_t cm_stride,                   x7
#     size_t cn_stride,                   [sp] -> x10
#     size_t a_offset,                    [sp + 8] -> x11
#     const float* zero,                  [sp + 16] -> x12
#     const xnn_f32_output_params params  [sp + 24] -> x8
#
# Notes on the arguments as this code uses them:
#   mr     - only compared against 2 and 4 to clamp the C row pointers.  x0 is
#            free afterwards and is reused as the K byte counter.
#   nc     - columns left to produce; 8 are produced per pass of the outer loop.
#   kc     - bytes of A consumed per row for each group of 4 A pointers.
#   ks     - bytes of the A pointer array consumed per 8 output columns.  The
#            ks loop steps by 32 (4 pointers x 8 bytes).
#   params - passed as a pointer.  Two consecutive floats are read from it:
#            the first is applied with FMIN, the second with FMAX.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
#
# Stack frame (80 bytes, allocated by the pre-indexed STR of x20):
#   [sp +  0]  x20
#   [sp + 16]  d8,  d9
#   [sp + 32]  d10, d11
#   [sp + 48]  d12, d13
#   [sp + 64]  d14, d15

# A pointers
# x20 a0
# x13 a1
# x14 a2
# x15 a3

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x7  c3 / cm_stride

# Vector register usage
# A0   v0  v4
# A1   v1  v5
# A2   v2  v6
# A3   v3  v7
# B    v8  v9 v10 v11
# B   v12 v13 v14 v15
# B   v20 v21 v22 v23
# B   v24 v25 v26 v27
# C   v16 v17
# C   v18 v19
# C   v28 v29
# C   v30 v31
# Clamp v4 (upper bound, FMIN) v5 (lower bound, FMAX)
#
# v4/v5 are shared between the second block of A0/A1 and the clamp values.
# The main loop and epilogue overwrite them, so the epilogue reloads the clamp
# values from params before falling through to the remainder code.

BEGIN_FUNCTION xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75

        # Load cn_stride, a_offset
        # (read before sp is moved, so offsets are relative to the entry sp)
        LDP x10, x11, [sp]

        # Load zero, clamping params pointer
        LDP x12, x8, [sp, 16]

        # Load clamping_params values
        LD2R {v4.4s, v5.4s}, [x8]

        # Save x20 on stack
        STR x20, [sp, -80]!

        # Save d8-d15 on stack
        STP d8, d9, [sp, 16]
        STP d10, d11, [sp, 32]
        STP d12, d13, [sp, 48]
        STP d14, d15, [sp, 64]

        # Clamp C pointers
        # Rows beyond mr alias the previous row so stores stay inside C.
        ADD x16, x6, x7         // c1 = c0 + cm_stride
        CMP x0, 2               // if mr < 2
        CSEL x16, x6, x16, LO   //   c1 = c0

        ADD x17, x16, x7        // c2 = c1 + cm_stride
                                // if mr <= 2 (flags still from CMP x0, 2; ADD does not set them)
        CSEL x17, x16, x17, LS  //   c2 = c1

        ADD x7, x17, x7         // c3 = c2 + cm_stride
        CMP x0, 4               // if mr < 4
        CSEL x7, x17, x7, LO    //   c3 = c2

0:
        # nc loop entry: one pass produces a 4 x 8 tile of C.
        # Load initial bias from w into accumulators
        LDP q16, q17, [x5], 32
        MOV v18.16b, v16.16b
        MOV v19.16b, v17.16b
        MOV v28.16b, v16.16b
        MOV v29.16b, v17.16b
        MOV v30.16b, v16.16b
        MOV v31.16b, v17.16b

        MOV x9, x3              // p = ks

1:
        # Load next 4 A pointers
        LDP x20, x13, [x4], 16
        LDP x14, x15, [x4], 16

        # A pointer equal to the zero buffer is used as is; any other pointer
        # gets a_offset added.  The ADD is done unconditionally and CSEL picks
        # the untouched zero pointer when the compare matched.
        CMP x20, x12            // if a0 == zero
        ADD x20, x20, x11       // a0 += a_offset
        CSEL x20, x12, x20, EQ  //   a0 = zero, else a0 + a_offset
        CMP x13, x12            // if a1 == zero
        ADD x13, x13, x11       // a1 += a_offset
        CSEL x13, x12, x13, EQ  //   a1 = zero, else a1 + a_offset
        CMP x14, x12            // if a2 == zero
        ADD x14, x14, x11       // a2 += a_offset
        CSEL x14, x12, x14, EQ  //   a2 = zero, else a2 + a_offset
        CMP x15, x12            // if a3 == zero
        ADD x15, x15, x11       // a3 += a_offset
        CSEL x15, x12, x15, EQ  //   a3 = zero, else a3 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32         // k = kc - 32
        B.LO 4f

        # Prologue
        # Read first block of 4 A (16 bytes per row) and 4 rows of B.
        LDR q0, [x20], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x13], 16
        LDR q2, [x14], 16
        LDR q3, [x15], 16
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32

        # Are there at least 32 more bytes of K?  Yes: run the main loop.
        # No: go straight to the epilogue.
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A
        # Loads are interleaved between the FMLAs by hand; keep the order.
2:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x20], 16       // overwrites clamp value in v4
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x13], 16       // overwrites clamp value in v5
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x14], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x15], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        PRFM PLDL1KEEP, [x5, 128]       // prefetch upcoming weights
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        PRFM PLDL1KEEP, [x5, 192]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        PRFM PLDL1KEEP, [x5, 320]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v8.4s, v4.s[0]
        LDP q20, q21, [x5], 32
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        LDP q22, q23, [x5], 32
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        LDP q24, q25, [x5], 32
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        LDP q26, q27, [x5], 32
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        LDR q0, [x20], 16
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        LDR q1, [x13], 16
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        LDR q2, [x14], 16
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        LDR q3, [x15], 16
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]
        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]
        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        SUBS x0, x0, 32         // flags consumed by B.HS below (FMLA leaves them alone)
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

        B.HS 2b

3:
        # Epilogue
        # Same arithmetic as one main loop iteration, but the second block
        # does not load a following first block.
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x20], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x13], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x14], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x15], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, no loads.
        FMLA v16.4s, v8.4s, v4.s[0]
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]

        # Last uses of v4/v5 as A data.
        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]

        # Load clamping_params values
        # (v4/v5 were overwritten with A data above; restore them for Clamp)
        LD2R {v4.4s, v5.4s}, [x8]

        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

4:
        # Remainder- 4 floats of A
        # x0 is kc minus a multiple of 32, so its bits 4, 3 and 2 equal those
        # of kc: 16, 8 and 4 leftover bytes respectively.
        TBZ x0, 4, 5f

        LDR q0, [x20], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x13], 16
        LDR q2, [x14], 16
        LDR q3, [x15], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q24, q25, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        LDP q26, q27, [x5], 32
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

5:
        # Remainder- 2 floats of A
        TBZ x0, 3, 6f

        LDR d0, [x20], 8
        LDP q20, q21, [x5], 32
        LDR d1, [x13], 8
        LDR d2, [x14], 8
        LDR d3, [x15], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]

6:
        # Remainder- 1 float of A
        TBZ x0, 2, 7f

        LDR s0, [x20], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x13], 4
        LDR s2, [x14], 4
        LDR s3, [x15], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]

7:
        # ks loop
        # B.HI (unsigned greater than zero) matches the nc loop test.  For any
        # ks that is a positive multiple of 32 it behaves exactly like a
        # not-equal test, and it also terminates when ks is 0 or not a
        # multiple of 32.
        SUBS x9, x9, 32         // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        FMIN v16.4s, v16.4s, v4.4s
        FMIN v17.4s, v17.4s, v4.4s
        FMIN v18.4s, v18.4s, v4.4s
        FMIN v19.4s, v19.4s, v4.4s
        FMIN v28.4s, v28.4s, v4.4s
        FMIN v29.4s, v29.4s, v4.4s
        FMIN v30.4s, v30.4s, v4.4s
        FMIN v31.4s, v31.4s, v4.4s
        FMAX v16.4s, v16.4s, v5.4s
        FMAX v17.4s, v17.4s, v5.4s
        FMAX v18.4s, v18.4s, v5.4s
        FMAX v19.4s, v19.4s, v5.4s
        FMAX v28.4s, v28.4s, v5.4s
        FMAX v29.4s, v29.4s, v5.4s
        FMAX v30.4s, v30.4s, v5.4s
        FMAX v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        CMP x1, 8
        B.LO 8f

        # Rows are written from c3 down to c0; each pointer then advances by
        # cn_stride to the next group of 8 columns.
        STP q30, q31, [x7]
        ADD x7, x7, x10
        STP q28, q29, [x17]
        ADD x17, x17, x10
        STP q18, q19, [x16]
        ADD x16, x16, x10
        STP q16, q17, [x6]
        ADD x6, x6, x10

        SUB x4, x4, x3          // a -= ks (rewind the A pointer array)

        # nc loop
        SUBS x1, x1, 8
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 64]
        LDP d12, d13, [sp, 48]
        LDP d10, d11, [sp, 32]
        LDP d8, d9, [sp, 16]

        # Restore x20 from stack
        LDR x20, [sp], 80
        RET

        # Store odd width
        # Fewer than 8 columns remain.  Bits 2, 1 and 0 of nc select a store
        # of 4, 2 and 1 floats per row; after each partial store the not yet
        # written lanes are moved down to the bottom of the register.
8:
        TBZ x1, 2, 9f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x17], 16
        MOV v28.16b, v29.16b
        STR q18, [x16], 16
        MOV v18.16b, v19.16b
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

9:
        TBZ x1, 1, 10f
        STR d30, [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x17], 8
        DUP d28, v28.d[1]
        STR d18, [x16], 8
        DUP d18, v18.d[1]
        STR d16, [x6], 8
        DUP d16, v16.d[1]

10:
        TBZ x1, 0, 11f
        STR s30, [x7]
        STR s28, [x17]
        STR s18, [x16]
        STR s16, [x6]
11:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 64]
        LDP d12, d13, [sp, 48]
        LDP d10, d11, [sp, 32]
        LDP d8, d9, [sp, 16]

        # Restore x20 from stack
        LDR x20, [sp], 80
        RET

END_FUNCTION xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75

// NOTE(review): an empty-flags .note.GNU-stack section is the usual GNU
// toolchain way of marking an object as not needing an executable stack;
// confirm against the linker documentation.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif