// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8
#
# Indirect GEMM micro-kernel producing a tile of up to 4 rows by 8 columns
# of C.  The template emits two variants selected by PREFETCH: the a75
# variant adds PRFM hints on the weights stream in the main loop, the a57
# variant omits them.  The instruction stream is otherwise the same.
#
# Register reuse:
#   x0 holds mr on entry; once the C pointers are clamped it becomes the
#      kc byte counter (k) of the inner loops.
#   x9 is the working copy of ks (p) for the ks loop; x3 keeps ks intact
#      because it is needed to rewind the a pointer for each nc block.
#   kc and ks are byte counts as used here: kc is consumed 32 bytes
#      (8 floats) per main loop pass, ks 32 bytes (4 pointers) per pass.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x20 a0
# x13 a1
# x14 a2
# x15 a3

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x7  c3 / cm_stride

# Vector register usage
# A0   v0  v4
# A1   v1  v5
# A2   v2  v6
# A3   v3  v7
# B    v8  v9 v10 v11
# B   v12 v13 v14 v15
# B   v20 v21 v22 v23
# B   v24 v25 v26 v27
# C   v16 v17
# C   v18 v19
# C   v28 v29
# C   v30 v31
# Clamp v4 v5
#
# v4 and v5 serve two purposes: they hold the clamp bounds (v4 is applied
# with FMAX, v5 with FMIN) and they are also the second-block A0/A1
# registers of the main loop and epilogue.  The epilogue therefore reloads
# the bounds from params before falling through to the clamp code.

# Stack frame (80 bytes, keeps sp 16-byte aligned):
#   [sp +  0]        x20
#   [sp +  8]        unused padding
#   [sp + 16 .. 79]  d8-d15

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

        # Stack arguments are read before sp is moved by the x20 save below,
        # so the offsets are relative to the sp of the caller.

        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Load min/max values: first float of params is broadcast into v4,
        # second float into v5.
        LD2R {v4.4s, v5.4s}, [x8]

        # Save x20 on stack (callee-saved, used as a0)
        STR x20, [sp, -80]!

        # Save d8-d15 on stack (callee-saved, v8-v15 hold B)
        STP d8, d9, [sp, 16]
        STP d10, d11, [sp, 32]
        STP d12, d13, [sp, 48]
        STP d14, d15, [sp, 64]

        # Clamp C pointers: rows at or beyond mr alias the previous row, so
        # their stores land on a valid row.  The flags of CMP x0, 2 are used
        # by both the LO and the LS select (ADD does not set flags).
        CMP x0, 2                 // if mr < 2
        ADD x16, x6, x7           // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO     //   c1 = c0

        ADD x17, x16, x7          // c2 = c1 + cm_stride
                                  // if mr <= 2
        CSEL x17, x16, x17, LS    //   c2 = c1

        CMP x0, 4                 // if mr < 4
        ADD x7, x17, x7           // c3 = c2 + cm_stride
        CSEL x7, x17, x7, LO      //   c3 = c2

0:
        # nc loop entry: one pass per block of up to 8 output columns.
        # Load initial bias from w into accumulators; all 4 rows start from
        # the same 8 bias values.
        LDP q16, q17, [x5], 32
        MOV v18.16b, v16.16b
        MOV v19.16b, v17.16b
        MOV v28.16b, v16.16b
        MOV v29.16b, v17.16b
        MOV v30.16b, v16.16b
        MOV v31.16b, v17.16b

        MOV x9, x3                // p = ks

1:
        # ks loop entry.
        # Load next 4 A pointers
        LDP x20, x13, [x4], 16
        LDP x14, x15, [x4], 16

        # A pointer equal to zero is used as is; any other pointer is
        # advanced by a_offset.
        CMP x20, x12              // if a0 == zero
        ADD x20, x20, x11         // a0 += a_offset
        CSEL x20, x12, x20, EQ    //   a0 = zero, else a0 + a_offset
        CMP x13, x12              // if a1 == zero
        ADD x13, x13, x11         // a1 += a_offset
        CSEL x13, x12, x13, EQ    //   a1 = zero, else a1 + a_offset
        CMP x14, x12              // if a2 == zero
        ADD x14, x14, x11         // a2 += a_offset
        CSEL x14, x12, x14, EQ    //   a2 = zero, else a2 + a_offset
        CMP x15, x12              // if a3 == zero
        ADD x15, x15, x11         // a3 += a_offset
        CSEL x15, x12, x15, EQ    //   a3 = zero, else a3 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32           // k = kc - 32
        B.LO 4f

        # Prologue
        # Read first block of 4 A and B: 4 floats per A row into v0-v3 and
        # 4 x 8 floats of B into v20-v27.
        LDR q0, [x20], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x13], 16
        LDR q2, [x14], 16
        LDR q3, [x15], 16
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32

        # Is there at least 8 more floats (32 bytes)?  yes do main loop
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A
2:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x20], 16         // overwrites clamp value in v4
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x13], 16         // overwrites clamp value in v5
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x14], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x15], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 192]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 320]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v8.4s, v4.s[0]
        LDP q20, q21, [x5], 32
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        LDP q22, q23, [x5], 32
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        LDP q24, q25, [x5], 32
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        LDP q26, q27, [x5], 32
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        LDR q0, [x20], 16
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        LDR q1, [x13], 16
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        LDR q2, [x14], 16
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        LDR q3, [x15], 16
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]
        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]
        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        SUBS x0, x0, 32           // k -= 32; FMLA below leaves flags intact
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

        B.HS 2b

3:
        # Epilogue: consumes the block preloaded by the prologue or by the
        # last main loop pass, plus one further block of 4 floats of A.
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x20], 16         // overwrites clamp value in v4
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x13], 16         // overwrites clamp value in v5
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x14], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x15], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, no loads
        FMLA v16.4s, v8.4s, v4.s[0]
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]

        # Last uses of v4/v5 as A0/A1 data.
        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]

        # Load min/max values again: v4/v5 were used as A registers above
        # and must hold the clamp bounds when the Clamp code is reached.
        LD2R {v4.4s, v5.4s}, [x8]

        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

4:
        # Only multiples of 32 have been subtracted from kc to form x0, so
        # bits 4, 3 and 2 of x0 still equal those of kc and select the
        # 16, 8 and 4 byte remainders.  The remainders use v0-v3 only and
        # leave the clamp bounds in v4/v5 untouched.

        # Remainder- 4 floats of A
        TBZ x0, 4, 5f

        LDR q0, [x20], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x13], 16
        LDR q2, [x14], 16
        LDR q3, [x15], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q24, q25, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        LDP q26, q27, [x5], 32
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

5:
        # Remainder- 2 floats of A
        TBZ x0, 3, 6f

        LDR d0, [x20], 8
        LDP q20, q21, [x5], 32
        LDR d1, [x13], 8
        LDR d2, [x14], 8
        LDR d3, [x15], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]

6:
        # Remainder- 1 float of A
        TBZ x0, 2, 7f

        LDR s0, [x20], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x13], 4
        LDR s2, [x14], 4
        LDR s3, [x15], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]

7:
        # ks loop: 32 bytes is the 4 A pointers consumed at label 1.
        SUBS x9, x9, 32           // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp: v4 is the lower bound (FMAX), v5 the upper bound (FMIN).
        FMAX v16.4s, v16.4s, v4.4s
        FMAX v17.4s, v17.4s, v4.4s
        FMAX v18.4s, v18.4s, v4.4s
        FMAX v19.4s, v19.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s
        FMIN v18.4s, v18.4s, v5.4s
        FMIN v19.4s, v19.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 8.  Fewer than 8 columns left goes to label 8.
        SUBS x1, x1, 8
        B.LO 8f

        # Rows are stored c3 first, c0 last; each C pointer then advances
        # by cn_stride to the next block of columns.
        STP q30, q31, [x7]
        ADD x7, x7, x10
        STP q28, q29, [x17]
        ADD x17, x17, x10
        STP q18, q19, [x16]
        ADD x16, x16, x10
        STP q16, q17, [x6]
        ADD x6, x6, x10

        SUB x4, x4, x3            // a -= ks

        # nc loop.  STP, ADD and SUB do not set flags, so this still tests
        # the result of SUBS x1, x1, 8 above: loop while columns remain.
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 64]
        LDP d12, d13, [sp, 48]
        LDP d10, d11, [sp, 32]
        LDP d8, d9, [sp, 16]

        # Restore x20 from stack
        LDR x20, [sp], 80
        RET

        # Store odd width.  x1 is nc - 8 here; its low 3 bits equal those of
        # the remaining column count and select 4, 2 and 1 float stores.
        # After each partial store the not yet stored upper lanes are moved
        # down into the low lanes.
8:
        TBZ x1, 2, 9f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x17], 16
        MOV v28.16b, v29.16b
        STR q18, [x16], 16
        MOV v18.16b, v19.16b
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

9:
        TBZ x1, 1, 10f
        STR d30, [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x17], 8
        DUP d28, v28.d[1]
        STR d18, [x16], 8
        DUP d18, v18.d[1]
        STR d16, [x6], 8
        DUP d16, v16.d[1]

10:
        TBZ x1, 0, 11f
        STR s30, [x7]
        STR s28, [x17]
        STR s18, [x16]
        STR s16, [x6]
11:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 64]
        LDP d12, d13, [sp, 48]
        LDP d10, d11, [sp, 32]
        LDP d8, d9, [sp, 16]

        # Restore x20 from stack
        LDR x20, [sp], 80
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif