// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// void xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57(
//     size_t mr,                x0
//     size_t nc,                x1
//     size_t kc,                x2 / x0
//     const uint8_t*restrict a, x3
//     size_t a_stride,          x4
//     const void*restrict w,    x5
//     uint8_t*restrict c,       x6
//     size_t cm_stride,         x7
//     size_t cn_stride,         [sp] -> x0 (reloaded in the clamp/store section)
//     const float*restrict acc, [sp + 8] -> x15
//     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 16] -> x8

// d8-d15 need to be preserved if used.
// x19-30 need to be preserved if used.
// x18 is the platform register and is deliberately left untouched.

// x0 is reused three times: mr on entry, then the k byte counter inside each
// nc iteration, then cn_stride once the k counter is no longer needed.

// A pointers
//  x3 a0
//  x9 a1
// x10 a2
// x11 a3
// x12 a4
//  x4 a5

// C pointers
//  x6 c0
// x16 c1
// x17 c2
// x14 c3
// x13 c4
//  x7 c5

// Vector register usage
// A0   v0  v6
// A1   v1  v7
// A2   v2  v8
// A3   v3  v9
// A4   v4 v10
// A5   v5 v11
// B   v12 v13 v14 v15
// B   v16 v17 v18 v19
// C   v20 v21
// C   v22 v23
// C   v24 v25
// C   v26 v27
// C   v28 v29
// C   v30 v31
// Clamp v6 v7

BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57

        // Clamp A and C pointers / Save d8-d15 on stack
        STP d8, d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        // Load acc, params pointer (stack args sit above the 64-byte save area)
        LDP x15, x8, [sp, 72]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

0:
        // Load initial accumulators
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        LDP q28, q29, [x15], 32
        LDP q30, q31, [x15], 32

        // Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32          // k = kc - 32
        B.LO 4f

        // Prologue - loads for main loop of 96 FMA
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        LDP q12, q13, [x5], 32   // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        // Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        // Main loop - 8 floats of A (32 bytes)
        // 96 FMA + 12 LDR A + 8 LDP B
1:
        // First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16         // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        // Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        LDR q0, [x3], 16         // Load next 6 A
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR q1, [x9], 16
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        LDR q2, [x10], 16
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR q3, [x11], 16

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        LDR q4, [x12], 16
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR q5, [x4], 16
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        LDP q12, q13, [x5], 32   // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        SUBS x0, x0, 32          // k -= 32; flags consumed by B.HS below
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]
        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        // Epilogue - 8 floats of A (32 bytes)
        // 96 FMA + 6 LDR A + 5 LDP B
        // First block same as main loop.  Second block has no preloads.
2:
        // First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16         // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        // Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]

        // Load min/max values (v6/v7 are no longer needed as A operands)
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        // Is there a remainder?- 4 floats of A (16 bytes) or less
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        // Clamp
3:
        FMAX v20.4s, v20.4s, v6.4s
        // Load cn_stride.  The k counter in x0 is dead here: every remainder
        // test on x0 has already executed before control reaches label 3.
        LDR x0, [sp, 64]
        FMAX v21.4s, v21.4s, v6.4s
        SUBS x1, x1, 8           // nc -= 8; flags consumed by B.LO / B.HI below
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        // Store full 6 x 8 (fewer than 8 columns left -> odd-width path)
        B.LO 7f

        STP q30, q31, [x7]
        ADD x7, x7, x0           // c5 += cn_stride
        SUB x3, x3, x2           // a0 -= kc
        STP q28, q29, [x13]
        ADD x13, x13, x0         // c4 += cn_stride
        SUB x9, x9, x2           // a1 -= kc
        STP q26, q27, [x14]
        ADD x14, x14, x0         // c3 += cn_stride
        SUB x10, x10, x2         // a2 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x0         // c2 += cn_stride
        SUB x11, x11, x2         // a3 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x0         // c1 += cn_stride
        SUB x12, x12, x2         // a4 -= kc
        STP q20, q21, [x6]
        ADD x6, x6, x0           // c0 += cn_stride
        SUB x4, x4, x2           // a5 -= kc

        // Flags are still those of SUBS x1, x1, 8 (nothing above sets flags).
        B.HI 0b

        // Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

4:
        // Load min/max values
        LD2R {v6.4s, v7.4s}, [x8]

        // Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        // Remainder- 4 floats of A (16 bytes)
        // Load A
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        // Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        // Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        // Remainder- 2 floats of A (8 bytes)
        // Load A
        LDR d0, [x3], 8
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        // Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        // Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        // Remainder- 1 float of A (4 bytes)
        // Load A
        LDR s0, [x3], 4
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        // Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B 3b

        // Store odd width
7:
        // 4 columns: store the low vector, then shift the high vector down
        TBZ x1, 2, 8f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
8:
        // 2 columns: store the low half, then move the high half down
        TBZ x1, 1, 9f
        STR d30, [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26, [x14], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

9:
        // 1 column
        TBZ x1, 0, 10f
        STR s30, [x7]
        STR s28, [x13]
        STR s26, [x14]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
10:
        // Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif