blob: 3361daf77d2f93047874591962d3515d33637297 [file] [log] [blame]
// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
XNNPACK Teamb455b122019-09-27 18:10:33 -07009
10#include <xnnpack/assembly.h>
11
# void xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a57(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t*restrict a,  x3
#     size_t a_stride,           x4
#     const void*restrict w,     x5
#     uint8_t*restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x14
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 16] -> x8
#
# Computes a 5-row by 8-column tile of C per pass: the accumulators are
# seeded from acc, A * W products are added with FMLA, the result is
# clamped against the two values read from params and stored to C.
# kc is consumed in bytes (k counts down by 32 per 8 floats of A).
# x0 holds mr only during pointer setup; afterwards it is the k counter.

# unused compared to 6x8 (a sixth row does not exist in this kernel;
# here x4 is a_stride, x7 is c4 and v30-v31 hold the clamp values)
#  x4 a5
#  x7 c5
#  A5  v10 v11
#  C   v30 v31

# d8-d15 need to be preserved if used.
# This kernel uses v8, v9 and v12-v15, so d8-d9 and d12-d15 are spilled.
# x19-x30 need to be preserved if used.  x18 is reserved for OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x13 c3
#  x7 c4

# Vector register usage
# A0   v0  v1
# A1   v2  v3
# A2   v4  v5
# A3   v6  v7
# A4   v8  v9
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# Clamp v30 v31

BEGIN_FUNCTION xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a57

        # Clamp A and C pointers / Save d8-d9, d12-d15 on stack.
        # Rows beyond mr alias the previous row, so they read and write
        # the same memory as the last valid row.
        # The 48-byte frame moves the stack arguments to
        # [sp + 48] (cn_stride), [sp + 56] (acc) and [sp + 64] (params).
        STP d8, d9, [sp, -48]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d12, d13, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d14, d15, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x13, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x13, x17, x13, LO   //   c3 = c2

        # Load acc, params pointer
        LDP x15, x8, [sp, 56]

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x7, x13, x7          // c4 = c3 + cm_stride (x7 no longer holds cm_stride)
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x7, x13, x7, LS     //   c4 = c3

        # Load clamp values: the first float at params is broadcast to v30
        # (FMAX operand, lower bound), the second to v31 (FMIN operand,
        # upper bound).
        LD2R {v30.4s, v31.4s}, [x8]

        # Load cn_stride
        LDR x14, [sp, 48]

        # Start of one 8-column pass over nc.
0:
        # Load initial accumulators (5 rows x 8 floats, acc advances 160 bytes)
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        LDP q28, q29, [x15], 32

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 80 FMA
        LDR q0, [x3], 16
        LDR q2, [x9], 16
        LDR q4, [x10], 16
        LDR q6, [x11], 16
        LDR q8, [x12], 16
        LDP q12, q13, [x5], 32  // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 80 FMA + 10 LDR A + 8 LDP B
1:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32  // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x3], 16  // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x9], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x10], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x11], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x12], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32  // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        LDP q14, q15, [x5], 32
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        LDP q16, q17, [x5], 32
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        LDR q0, [x3], 16  // Load next 5 A
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        LDR q2, [x9], 16
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        LDR q4, [x10], 16
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        LDR q6, [x11], 16
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        LDR q8, [x12], 16
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        LDP q12, q13, [x5], 32  // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        LDP q14, q15, [x5], 32
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 32          // k -= 32; FMLA leaves these flags intact for B.HS
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 80 FMA + 5 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32  // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x3], 16  // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x9], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x10], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x11], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x12], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32  // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        LDP q14, q15, [x5], 32
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        LDP q16, q17, [x5], 32
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]
        TST x0, 31               // any remainder (k not a multiple of 32 bytes)?

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        B.NE 4f

        # Clamp
3:
        FMAX v20.4s, v20.4s, v30.4s
        SUBS x1, x1, 8           // nc -= 8; flags are consumed by B.LO / B.HI below
        FMAX v21.4s, v21.4s, v30.4s
        FMAX v22.4s, v22.4s, v30.4s
        FMAX v23.4s, v23.4s, v30.4s
        FMAX v24.4s, v24.4s, v30.4s
        FMAX v25.4s, v25.4s, v30.4s
        FMAX v26.4s, v26.4s, v30.4s
        FMAX v27.4s, v27.4s, v30.4s
        FMAX v28.4s, v28.4s, v30.4s
        FMAX v29.4s, v29.4s, v30.4s
        FMIN v20.4s, v20.4s, v31.4s
        FMIN v21.4s, v21.4s, v31.4s
        FMIN v22.4s, v22.4s, v31.4s
        FMIN v23.4s, v23.4s, v31.4s
        FMIN v24.4s, v24.4s, v31.4s
        FMIN v25.4s, v25.4s, v31.4s
        FMIN v26.4s, v26.4s, v31.4s
        FMIN v27.4s, v27.4s, v31.4s
        FMIN v28.4s, v28.4s, v31.4s
        FMIN v29.4s, v29.4s, v31.4s

        # Store full 5 x 8 (fewer than 8 columns left goes to the partial store)
        B.LO 7f

        # Rewind each A row by kc, store each C row and advance it by cn_stride.
        SUB  x3,  x3, x2 // a0 -= kc
        STP q28, q29, [x7]
        ADD x7, x7, x14
        SUB  x9,  x9, x2 // a1 -= kc
        STP q26, q27, [x13]
        ADD x13, x13, x14
        SUB x10, x10, x2 // a2 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x14
        SUB x11, x11, x2 // a3 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x14
        SUB x12, x12, x2 // a4 -= kc
        STP q20, q21, [x6]
        ADD x6, x6, x14

        # Flags are still those of SUBS x1, x1, 8: loop while columns remain.
        B.HI 0b

        # Restore d8-d9, d12-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

        # Remainder of k.  Reached when kc < 32 bytes, or after the epilogue
        # when the low 5 bits of k are non-zero.  Bits 4, 3 and 2 of x0
        # select the 4-, 2- and 1-float steps.
4:
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q2, [x9], 16
        LDR q4, [x10], 16
        LDR q6, [x11], 16
        LDR q8, [x12], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d2, [x9], 8
        LDR d4, [x10], 8
        LDR d6, [x11], 8
        LDR d8, [x12], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s2, [x9], 4
        LDR s4, [x10], 4
        LDR s6, [x11], 4
        LDR s8, [x12], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        B 3b

        # Store odd width.  Bits 2, 1 and 0 of x1 select stores of 4, 2 and
        # 1 columns; after each store the not-yet-stored lanes are moved
        # down into the low part of the register.
7:
        TBZ x1, 2, 8f
        STR q28, [x7], 16
        MOV v28.16b, v29.16b
        STR q26, [x13], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
8:
        TBZ x1, 1, 9f
        STR d28, [x7], 8
        DUP d28, v28.d[1]
        STR d26, [x13], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

9:
        TBZ x1, 0, 10f
        STR s28, [x7]
        STR s26, [x13]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
10:
        # Restore d8-d9, d12-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

END_FUNCTION xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a57
XNNPACK Teamb455b122019-09-27 18:10:33 -0700564
// On ELF targets, emit an empty .note.GNU-stack section so the linker does
// not mark the stack executable because of this object.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif