// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8

# d8-d15 need to be preserved if used.
# x19-x30 need to be preserved if used.

# A pointers
# x14 a0
# x15 a1
# x20 a2
# x21 a3
# x22 a4
# x23 a5

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x18 c3
# x13 c4
# x7  c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4  v10
# A5   v5  v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

# xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73
#
# 6x8 f32 indirect-GEMM micro-kernel. For every block of 8 output columns:
#   0: seed the 12 accumulators v20-v31 from the first 32 bytes at w,
#   1: fetch 6 row pointers from the indirection buffer a (pointers equal to
#      `zero` are used as-is, all others are advanced by a_offset), then
#      accumulate kc bytes of each row against the weights streamed from w,
#   4: repeat step 1 until ks bytes of indirection pointers are consumed,
#      clamp with the two values loaded from params and store the tile.
# A full tile stores 8 floats per row and advances each C pointer by
# cn_stride; a final partial tile (nc & 7) is stored as 4 + 2 + 1 floats.
#
# In:    x0-x7 and four stack arguments, as listed in the signature comment.
# Saves: d8-d15 and x20-x23 in a 96-byte frame, restored before each RET.

BEGIN_FUNCTION xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73

        # Clamp C pointers
        # The callee-saved spills are interleaved with the pointer setup.
        # Frame layout after the first STP: d8-d15 at [sp, 0..63],
        # x20-x23 at [sp, 64..95], caller's stack arguments from [sp, 96].
        STP d8, d9, [sp, -96]!
        CMP x0, 2                       // if mr < 2
        ADD x16, x6, x7                 // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO           //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x17, x16, x7                // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL x17, x16, x17, LS          //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                       // if mr < 4
        # NOTE(review): x18 is the platform register in some AArch64 ABIs
        # (e.g. Apple, Windows); confirm the build targets allow its use.
        ADD x18, x17, x7                // c3 = c2 + cm_stride
        CSEL x18, x17, x18, LO          //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x13, x18, x7                // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL x13, x18, x13, LS          //   c4 = c3

        # Save x20,x21,x22,x23 on stack
        STP x20, x21, [sp, 64]
        STP x22, x23, [sp, 80]

        CMP x0, 6                       // if mr < 6
        ADD x7, x13, x7                 // c5 = c4 + cm_stride
        CSEL x7, x13, x7, LO            //   c5 = c4

        # Load zero, clamping params pointer
        # (stack arguments sit 96 bytes higher now that the frame is pushed)
        LDP x12, x8, [sp, 112]

        # Load cn_stride, a_offset
        LDP x10, x11, [sp, 96]

        # Load min/max values
        # v6/v7 double as A registers on the main path and are reloaded at the
        # end of the epilogue; this load covers the kc < 32 bytes path, which
        # branches straight to the remainder code without touching v6/v7.
        LD2R {v6.4s, v7.4s}, [x8]

0:
        # Load initial bias from w into accumulators
        LD1 {v20.16b, v21.16b}, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 0]         // Prefetch B
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 64]
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b

        MOV x9, x3                      // p = ks

1:
        # Load next 6 A pointers
        LDP x14, x15, [x4], 16
        LDP x20, x21, [x4], 16
        LDP x22, x23, [x4], 16

        # Each CMP sets the flags before the ADD (which leaves them intact),
        # so the CSEL can still pick the untouched zero pointer.
        CMP x14, x12                    // if a0 == zero
        ADD x14, x14, x11               // a0 += a_offset
        CSEL x14, x12, x14, EQ          //   a0 = zero, else a0 += a_offset
        CMP x15, x12                    // if a1 == zero
        ADD x15, x15, x11               // a1 += a_offset
        CSEL x15, x12, x15, EQ          //   a1 = zero, else a1 += a_offset
        CMP x20, x12                    // if a2 == zero
        ADD x20, x20, x11               // a2 += a_offset
        CSEL x20, x12, x20, EQ          //   a2 = zero, else a2 += a_offset
        CMP x21, x12                    // if a3 == zero
        ADD x21, x21, x11               // a3 += a_offset
        CSEL x21, x12, x21, EQ          //   a3 = zero, else a3 += a_offset
        CMP x22, x12                    // if a4 == zero
        ADD x22, x22, x11               // a4 += a_offset
        CSEL x22, x12, x22, EQ          //   a4 = zero, else a4 += a_offset
        CMP x23, x12                    // if a5 == zero
        ADD x23, x23, x11               // a5 += a_offset
        CSEL x23, x12, x23, EQ          //   a5 = zero, else a5 += a_offset

        # Are there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32                 // k = kc - 32
        B.LO 5f

        # Prologue - loads for main loop of 96 FMA
        # load A0 to A4 but not A5
        LDP q0, q6, [x14], 32
        LDP q1, q7, [x15], 32
        LDP q2, q8, [x20], 32
        LDP q3, q9, [x21], 32
        LDP q4, q10, [x22], 32
        # load first set of B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        # Are there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDP A + 8 LDP B
2:
        # First group of 4 A. 48 FMA. Loads A5

        LDP q5, q11, [x23], 32
        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        LDP q16, q17, [x5], 32
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        LDP q12, q13, [x5], 32
        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        LDP q14, q15, [x5], 32
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        PRFM PLDL1KEEP, [x5, 128]       // Prefetch B
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Second group of 4 A. 48 FMA. Loads A0 - A4

        LDP q16, q17, [x5], 32
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        LDP q18, q19, [x5], 32
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        # Lanes 2 and 3 are finished one row at a time so that each row's
        # second-half A register is free to be reloaded for the next pass.
        LDP q12, q13, [x5], 32
        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v20.4s, v18.4s, v6.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v21.4s, v19.4s, v6.s[3]
        LDP q0, q6, [x14], 32
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v23.4s, v19.4s, v7.s[3]
        LDP q1, q7, [x15], 32
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v25.4s, v19.4s, v8.s[3]
        LDP q2, q8, [x20], 32
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v27.4s, v19.4s, v9.s[3]
        LDP q3, q9, [x21], 32
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v29.4s, v19.4s, v10.s[3]
        LDP q4, q10, [x22], 32
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v30.4s, v18.4s, v11.s[3]
        SUBS x0, x0, 32
        FMLA v31.4s, v17.4s, v11.s[2]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 2b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 1 LDP A + 6 LDP B
        # First block same as main loop. Second block has no preloads.
3:
        # First group of 4 A. 48 FMA. Loads A5

        LDP q5, q11, [x23], 32
        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        LDP q16, q17, [x5], 32
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        LDP q12, q13, [x5], 32
        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        LDP q14, q15, [x5], 32
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Second group of 4 A. 48 FMA. No A Loads, No last B load

        LDP q16, q17, [x5], 32
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        LDP q18, q19, [x5], 32
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        # Last part of epilogue has loads removed.

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]

        # Load min/max values
        # (the last reads of v6/v7 as A data are the two FMLAs just above)
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        TST x0, 31                      // any of the low 5 bits of k set?
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 5f                         //   then handle the 1-7 float remainder

        .p2align 3
4:
        # ks loop
        SUBS x9, x9, 48                 // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        SUBS x1, x1, 8                  // nc -= 8; flags also drive B.HI below
        B.LO 8f

        STP q30, q31, [x7]
        ADD x7, x7, x10
        STP q28, q29, [x13]
        ADD x13, x13, x10
        STP q26, q27, [x18]
        ADD x18, x18, x10
        STP q24, q25, [x17]
        ADD x17, x17, x10
        STP q22, q23, [x16]
        ADD x16, x16, x10
        STP q20, q21, [x6]
        ADD x6, x6, x10

        SUB x4, x4, x3                  // a -= ks

        # nc loop
        # None of the stores/ADD/SUB above write the flags, so this still
        # tests the result of SUBS x1, x1, 8.
        B.HI 0b

        # Restore x20,x21,x22,x23 from stack
        LDP x22, x23, [sp, 80]
        LDP x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 96
        RET

        .p2align 3
5:
        # Is there a remainder?- 4 floats of A (16 bytes)
        # Only multiples of 32 have been subtracted from x0, so its low five
        # bits still equal those of kc.
        TBZ x0, 4, 6f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x14], 16
        LDR q1, [x15], 16
        LDR q2, [x20], 16
        LDR q3, [x21], 16
        LDR q4, [x22], 16
        LDR q5, [x23], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
6:
        TBZ x0, 3, 7f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x14], 8
        LDR d1, [x15], 8
        LDR d2, [x20], 8
        LDR d3, [x21], 8
        LDR d4, [x22], 8
        LDR d5, [x23], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
7:
        TBZ x0, 2, 4b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x14], 4
        LDR s1, [x15], 4
        LDR s2, [x20], 4
        LDR s3, [x21], 4
        LDR s4, [x22], 4
        LDR s5, [x23], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B 4b

        # Store odd width
        # x1 went below zero in SUBS x1, x1, 8; its low three bits still hold
        # the number of columns left. After each partial store the upper part
        # of every row is moved down so the next store starts at lane 0.
8:
        TBZ x1, 2, 9f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x18], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
9:
        TBZ x1, 1, 10f
        STR d30, [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26, [x18], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s30, [x7]
        STR s28, [x13]
        STR s26, [x18]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
11:
        # Restore x20,x21,x22,x23 from stack
        LDP x22, x23, [sp, 80]
        LDP x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 96
        RET

END_FUNCTION xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73

// Emit an empty .note.GNU-stack section on ELF targets so the linker does not
// mark the stack of the final binary as executable.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif