// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// 6x8 single-precision GEMM micro-kernel template (AArch64, NEON FMA).
//
// For each group of 8 output columns (looping while nc > 0) the kernel:
//   1. initialises 12 accumulator vectors (v20-v31, two per row):
//        - non-INC variant: the first 8 floats read from w, replicated to all 6 rows
//        - INC variant:     48 floats read from acc
//   2. accumulates A * B over kc bytes of each A row, reading B from w
//   3. clamps every accumulator with FMIN against the first float at params
//      and FMAX against the second float at params
//   4. stores up to 8 floats per row to C and advances each C pointer by cn_stride.
// Rows at index >= mr reuse the A and C pointers of the previous row.

# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_cortex_a73(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x18 c3
# x13 c4
#  x7 c5

// NOTE(review): x18 (c3) is the AAPCS64 platform register and is reserved on
// some operating systems (e.g. Apple, Windows) - confirm the build targets
// before relying on it.

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_cortex_a73

        # Clamp A and C pointers / Save d8-d15 on stack
        // The 64-byte pre-decrement below moves the caller's stack arguments
        // to [sp, 64] and above for the rest of the function.
        // The d-register stores and ADDs do not modify flags, so each CMP
        // feeds both the LO selects after it and the LS selects that follow.
        STP d8, d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x18, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x18, x17, x18, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x18, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x18, x13, LS   //   c4 = c3

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 72]
        $else:
          # Load params pointer
          LDR x8, [sp, 72]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load cn_stride
        LDR x14, [sp, 64]

        .p2align 3
0:
        // Top of the nc loop: one iteration per group of up to 8 output columns.
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
          PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          PRFM PLDL1KEEP, [x3]       // Prefetch A
          PRFM PLDL1KEEP, [x9]
          PRFM PLDL1KEEP, [x10]
          PRFM PLDL1KEEP, [x11]
          PRFM PLDL1KEEP, [x12]
          PRFM PLDL1KEEP, [x4]
        $else:
          # Load initial bias from w into accumulators
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
          MOV v23.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 64]
          MOV v24.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 128]
          MOV v25.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 192]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP, [x3]       // Prefetch A
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP, [x9]
          MOV v28.16b, v20.16b
          PRFM PLDL1KEEP, [x10]
          MOV v29.16b, v21.16b
          PRFM PLDL1KEEP, [x11]
          MOV v30.16b, v20.16b
          PRFM PLDL1KEEP, [x12]
          MOV v31.16b, v21.16b
          PRFM PLDL1KEEP, [x4]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32          // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 96 FMA
        # load A0 to A4 but not A5
        LDP q0, q6, [x3], 32
        LDP q1, q7, [x9], 32
        LDP q2, q8, [x10], 32
        LDP q3, q9, [x11], 32
        LDP q4, q10, [x12], 32
        # load first set of B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDP A + 8 LDP B
        .p2align 3
1:
        # First group of 4 A.  48 FMA.  Loads A5

        LDP q5, q11, [x4], 32
        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        LDP q16, q17, [x5], 32
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        LDP q12, q13, [x5], 32
        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        LDP q14, q15, [x5], 32
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Second group of 4 A.  48 FMA.  Loads A0 - A4

        LDP q16, q17, [x5], 32
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        LDP q18, q19, [x5], 32
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        // From here each row finishes its lanes 2 and 3 before that row's
        // A registers are overwritten by the load for the next iteration.
        LDP q12, q13, [x5], 32
        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v20.4s, v18.4s, v6.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v21.4s, v19.4s, v6.s[3]
        LDP q0, q6, [x3], 32
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v23.4s, v19.4s, v7.s[3]
        LDP q1, q7, [x9], 32
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v25.4s, v19.4s, v8.s[3]
        LDP q2, q8, [x10], 32
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v27.4s, v19.4s, v9.s[3]
        LDP q3, q9, [x11], 32
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v29.4s, v19.4s, v10.s[3]
        LDP q4, q10, [x12], 32
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v30.4s, v18.4s, v11.s[3]
        SUBS x0, x0, 32
        FMLA v31.4s, v17.4s, v11.s[2]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 1 LDP A + 6 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.  Loads A5

        LDP q5, q11, [x4], 32
        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        LDP q16, q17, [x5], 32
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        LDP q12, q13, [x5], 32
        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        LDP q14, q15, [x5], 32
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Second group of 4 A.  48 FMA.  No A Loads, No last B load

        LDP q16, q17, [x5], 32
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        LDP q18, q19, [x5], 32
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        # Last part of epilogue has loads removed.

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]

        # Load clamping_params values
        // v6/v7 are overwritten here; the last FMLAs reading them as A0/A1
        // are the two directly above, and the rest use v8-v11 only.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        .p2align 3

        # Clamp
3:
        FMIN v20.4s, v20.4s, v6.4s
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        CMP x1, 8
        B.LO 7f

        // Each row: store 8 floats, advance the C pointer by cn_stride and
        // rewind the A pointer by kc so the next column group rereads A.
        $if INC:
          STP q30, q31, [x7]
          ADD x7, x7, x14
          SUB x3, x3, x2             // a0 -= kc
          STP q28, q29, [x13]
          ADD x13, x13, x14
          SUB x9, x9, x2             // a1 -= kc
          STP q26, q27, [x18]
          ADD x18, x18, x14
          SUB x10, x10, x2           // a2 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x14
          SUB x11, x11, x2           // a3 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x14
          SUB x12, x12, x2           // a4 -= kc
          STP q20, q21, [x6]
          ADD x6, x6, x14
          SUB x4, x4, x2             // a5 -= kc
        $else:
          STP q20, q21, [x6]
          ADD x6, x6, x14
          SUB x3, x3, x2             // a0 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x14
          SUB x9, x9, x2             // a1 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x14
          SUB x10, x10, x2           // a2 -= kc
          STP q26, q27, [x18]
          ADD x18, x18, x14
          SUB x11, x11, x2           // a3 -= kc
          STP q28, q29, [x13]
          ADD x13, x13, x14
          SUB x12, x12, x2           // a4 -= kc
          STP q30, q31, [x7]
          ADD x7, x7, x14
          SUB x4, x4, x2             // a5 -= kc

        SUBS x1, x1, 8           // nc -= 8
        NOP
        B.HI 0b                  // loop while nc > 0

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

        .p2align 3
4:
        // Remainder handling. Reached when kc < 32 bytes, or from the
        // epilogue when the low 5 bits of k are non-zero. Bits 4, 3 and 2
        // of x0 select the 16-, 8- and 4-byte tails below.
        # Load clamping_params values
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B 3b

        # Store odd width
        // nc < 8 here: bits 2, 1 and 0 of nc select 4-, 2- and 1-float
        // stores; after each partial store the remaining lanes are moved
        // down into the low part of the even-numbered accumulator.
7:
        TBZ x1, 2, 8f
        $if INC:
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26, [x18], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x18], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
8:
        TBZ x1, 1, 9f
        $if INC:
          STR d30, [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x18], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20, [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20, [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x18], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30, [x7], 8
          DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        $if INC:
          STR s30, [x7]
          STR s28, [x13]
          STR s26, [x18]
          STR s24, [x17]
          STR s22, [x16]
          STR s20, [x6]
        $else:
          STR s20, [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x18]
          STR s28, [x13]
          STR s30, [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_cortex_a73

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif