blob: 0e261e9c8a02b32f9297ab43c5ec021bbe307edb [file] [log] [blame]
// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

10#include <xnnpack/assembly.h>
11
# void xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 8] -> x8

// Summary (as implemented below):
//   For every block of up to 8 output columns, the 6x8 accumulator tile is
//   initialised from 8 floats read from w, then for each float of A (kc is a
//   byte count) 8 floats of B are read from w and multiply-accumulated into
//   each row. The tile is clamped with the two values loaded from params
//   (FMAX against the first, FMIN against the second) and stored to c.
//   Rows beyond mr alias the previous row's A and C pointers.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
// x18 is deliberately NOT used: AAPCS64 designates it the platform register
// and several platforms reserve it, so c3 lives in x15 (caller-saved) instead.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x15 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57

        # Clamp A and C pointers / Save d8-d15 on stack
        // The pre-indexed STP allocates 64 bytes, so the stack arguments
        // cn_stride and params are found at [sp, 64] and [sp, 72] below.
        STP d8, d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x15, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x15, x17, x15, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x15, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x15, x13, LS   //   c4 = c3

        # Load params pointer
        LDR x8, [sp, 72]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load cn_stride
        LDR x14, [sp, 64]

        // Outer loop: one pass per block of up to 8 output columns.
        // From here on x0 no longer holds mr; it is reused as the k counter.
0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32          // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 96 FMA
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        LDP q12, q13, [x5], 32   // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 12 LDR A + 8 LDP B
        // Loads for the next group are interleaved with the FMAs of the
        // current one; the instruction order is significant.
1:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16         // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        LDR q0, [x3], 16         // Load next 6 A
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR q1, [x9], 16
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        LDR q2, [x10], 16
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR q3, [x11], 16

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        LDR q4, [x12], 16
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR q5, [x4], 16
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        LDP q12, q13, [x5], 32   // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        SUBS x0, x0, 32          // k -= 32; flags consumed by B.HS below
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]
        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16         // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]

        # Load min/max values
        // Must stay after the last reads of v6/v7 as A operands above:
        // LD2R overwrites both registers with the clamp values.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        # Clamp
3:
        FMAX v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8           // nc -= 8; flags consumed by B.LO / B.HI below
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 7f                  // fewer than 8 columns left: partial store

        // Each C pointer advances by cn_stride; each A pointer is rewound
        // by kc so the next column block re-reads the same rows of A.
        STP q20, q21, [x6]
        ADD x6, x6, x14
        SUB x3, x3, x2           // a0 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x14
        SUB x9, x9, x2           // a1 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x14
        SUB x10, x10, x2         // a2 -= kc
        STP q26, q27, [x15]
        ADD x15, x15, x14
        SUB x11, x11, x2         // a3 -= kc
        STP q28, q29, [x13]
        ADD x13, x13, x14
        SUB x12, x12, x2         // a4 -= kc
        STP q30, q31, [x7]
        ADD x7, x7, x14
        SUB x4, x4, x2           // a5 -= kc

        B.HI 0b                  // columns remain (flags from SUBS x1 above)

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

        // Remainder of k. Reached either directly when kc < 32 bytes or
        // from the epilogue when k has low bits set. Only bits 4, 3 and 2
        // of x0 are tested, which select 16-, 8- and 4-byte tails.
4:
        # Load min/max values
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B 3b

        # Store odd width
        // Bits 2, 1 and 0 of x1 select 4-, 2- and 1-column stores; after
        // each partial store the remaining lanes are shifted down into the
        // low part of the even-numbered accumulator of each row.
7:
        TBZ x1, 2, 8f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x15], 16
        MOV v26.16b, v27.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
8:
        TBZ x1, 1, 9f
        STR d20, [x6], 8
        DUP d20, v20.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26, [x15], 8
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d30, [x7], 8
        DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        STR s20, [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x15]
        STR s28, [x13]
        STR s30, [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57
XNNPACK Teamb455b122019-09-27 18:10:33 -0700645
646#ifdef __ELF__
647.section ".note.GNU-stack","",%progbits
648#endif