blob: dcb42e60c090b4644d133ff1fd9a5c35f0d43a7e [file] [log] [blame]
Frank Barchardbaa9ead2019-10-18 18:06:41 -07001// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
XNNPACK Teamb455b122019-09-27 18:10:33 -07005
6#include <xnnpack/assembly.h>
7
Frank Barchard387c2d12019-12-16 19:14:07 -08008# void xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}(
XNNPACK Teamb455b122019-09-27 18:10:33 -07009# size_t mr, x0
10# size_t nc, x1
11# size_t kc, x2 / x0
12# size_t ks, x3 / x9
13# const float**restrict a, x4
14# const float*restrict w, x5
15# float*restrict c, x6
16# size_t cm_stride, x7
17# size_t cn_stride, [sp] -> x10
18# size_t a_offset, [sp + 8] -> x11
19# const float* zero, [sp + 16] -> x12
Marat Dukhaneb09a6b2020-04-08 17:34:32 -070020# const xnn_f32_minmax_params* params, [sp + 24] -> x8
XNNPACK Teamb455b122019-09-27 18:10:33 -070021
22# d8-d15 need to be preserved if used.
23# x19-30 need to be preserved if used.
24
25# A pointers
26# x20 a0
27# x13 a1
28# x14 a2
29# x15 a3
30
31# C pointers
32# x6 c0
33# x16 c1
34# x17 c2
35# x7 c3 / cm_stride
36
37# Vector register usage
38# A0 v0 v4
39# A1 v1 v5
40# A2 v2 v6
41# A3 v3 v7
42# B v8 v9 v10 v11
43# B v12 v13 v14 v15
44# B v20 v21 v22 v23
45# B v24 v25 v26 v27
46# C v16 v17
47# C v18 v19
48# C v28 v29
49# C v30 v31
50# Clamp v4 v5
51
Frank Barchard387c2d12019-12-16 19:14:07 -080052BEGIN_FUNCTION xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}
XNNPACK Teamb455b122019-09-27 18:10:33 -070053
# Overview (derived from the code below):
#   for each block of 8 output columns (label 0, counted down in x1):
#     accumulators = 8 bias floats loaded from w, replicated to all 4 rows
#     for each group of 4 A pointers read through x4 (label 1, counted down in x9):
#       accumulate kc bytes of each A row against the weights streamed from w
#     clamp with the two values broadcast from params, then store 4 rows of C
# All 4 rows are always computed; C row pointers beyond mr alias the previous row.
# x0 (mr) is reused as the k byte counter and x7 (cm_stride) as the c3 pointer
# once the C pointers have been set up.
54 # Load cn_stride, a_offset (stack arguments, read before sp is moved below)
55 LDP x10, x11, [sp]
56
57 # Load zero, clamping params pointer
58 LDP x12, x8, [sp, 16]
59
Marat Dukhaneb09a6b2020-04-08 17:34:32 -070060 # Load min/max values
# LD2R broadcasts the two consecutive floats at [x8]: the first to every lane
# of v4, the second to every lane of v5. v4 is later the FMAX operand (lower
# bound) and v5 the FMIN operand (upper bound).
XNNPACK Teamb455b122019-09-27 18:10:33 -070061 LD2R {v4.4s, v5.4s}, [x8]
62
63 # Save x20 on stack
# Pre-indexed store: sp -= 80, x20 goes to [sp]. Bytes 8-15 of the frame are
# not written; d8-d15 occupy bytes 16-79.
64 STR x20, [sp, -80]!
65
66 # Save d8-d15 on stack
67 STP d8, d9, [sp, 16]
68 STP d10, d11, [sp, 32]
69 STP d12, d13, [sp, 48]
70 STP d14, d15, [sp, 64]
71
72 # Clamp C pointers
# A row index at or beyond mr reuses the pointer of the row before it. The
# stores below write c3, c2, c1, c0 in that order, so for aliased pointers the
# lowest row is written last.
XNNPACK Teamb455b122019-09-27 18:10:33 -070073 CMP x0, 2 // if mr < 2
Frank Barchard684bbb02019-11-16 14:14:42 -080074 ADD x16, x6, x7 // c1 = c0 + cm_stride
XNNPACK Teamb455b122019-09-27 18:10:33 -070075 CSEL x16, x6, x16, LO // c1 = c0
76
77 ADD x17, x16, x7 // c2 = c1 + cm_stride
78 // if mr <= 2 (flags are still those of CMP x0, 2 above)
79 CSEL x17, x16, x17, LS // c2 = c1
80
XNNPACK Teamb455b122019-09-27 18:10:33 -070081 CMP x0, 4 // if mr < 4
Frank Barchard684bbb02019-11-16 14:14:42 -080082 ADD x7, x17, x7 // c3 = c2 + cm_stride
XNNPACK Teamb455b122019-09-27 18:10:33 -070083 CSEL x7, x17, x7, LO // c3 = c2
84
850:
86 # Load initial bias from w into accumulators
# Top of the nc loop. The same 8 bias floats seed all four rows:
# row 0 = v16/v17, row 1 = v18/v19, row 2 = v28/v29, row 3 = v30/v31.
87 LDP q16, q17, [x5], 32
88 MOV v18.16b, v16.16b
89 MOV v19.16b, v17.16b
90 MOV v28.16b, v16.16b
91 MOV v29.16b, v17.16b
92 MOV v30.16b, v16.16b
93 MOV v31.16b, v17.16b
94
95 MOV x9, x3 // p = ks
96
971:
98 # Load next 4 A pointers
# Top of the ks loop. x4 advances by 32 bytes (4 pointers) per pass.
99 LDP x20, x13, [x4], 16
100 LDP x14, x15, [x4], 16
101
# A pointer equal to zero is used as is; any other pointer gets a_offset added.
102 CMP x20, x12 // if a0 == zero
103 ADD x20, x20, x11 // a0 += a_offset
104 CSEL x20, x12, x20, EQ // a0 = zero, else a0 + a_offset
105 CMP x13, x12 // if a1 == zero
106 ADD x13, x13, x11 // a1 += a_offset
107 CSEL x13, x12, x13, EQ // a1 = zero, else a1 + a_offset
108 CMP x14, x12 // if a2 == zero
109 ADD x14, x14, x11 // a2 += a_offset
110 CSEL x14, x12, x14, EQ // a2 = zero, else a2 + a_offset
111 CMP x15, x12 // if a3 == zero
112 ADD x15, x15, x11 // a3 += a_offset
113 CSEL x15, x12, x15, EQ // a3 = zero, else a3 + a_offset
114
115 # Are there at least 8 floats (32 bytes) for prologue + epilogue?
116 SUBS x0, x2, 32 // k = kc - 32
117 B.LO 4f // kc < 32 bytes: go straight to the remainder handling
118
119 # Prologue - 4 floats (16 bytes) of each A row
120 # Read first block of 4 A and B.
121 LDR q0, [x20], 16
122 LDP q20, q21, [x5], 32
123 LDR q1, [x13], 16
124 LDR q2, [x14], 16
125 LDR q3, [x15], 16
126 LDP q22, q23, [x5], 32
127 LDP q24, q25, [x5], 32
128 LDP q26, q27, [x5], 32
129
130 # Are there at least 32 more bytes? If yes, do the main loop
131 SUBS x0, x0, 32
132 B.LO 3f // kc < 64 bytes: only the epilogue runs
133
134 # Main loop - 8 floats of A
# Software pipelined: the first half multiplies v0-v3 (A) by v20-v27 (B) while
# loading v4-v7 and v8-v15; the second half multiplies v4-v7 by v8-v15 while
# loading v0-v3 and v20-v27 for the next pass. The loads into v4/v5 overwrite
# the clamp values, which are reloaded in the epilogue.
# The PRFM lines are emitted only when the template variable PREFETCH is set.
1352:
136 # First block of 4. FMA for first 4, loads for 2nd block of 4.
137 FMLA v16.4s, v20.4s, v0.s[0]
138 LDP q8, q9, [x5], 32
139 FMLA v17.4s, v21.4s, v0.s[0]
140 FMLA v18.4s, v20.4s, v1.s[0]
141 LDP q10, q11, [x5], 32
142 FMLA v19.4s, v21.4s, v1.s[0]
143 FMLA v28.4s, v20.4s, v2.s[0]
144 LDP q12, q13, [x5], 32
145 FMLA v29.4s, v21.4s, v2.s[0]
146 FMLA v30.4s, v20.4s, v3.s[0]
147 LDP q14, q15, [x5], 32
148 FMLA v31.4s, v21.4s, v3.s[0]
149 FMLA v16.4s, v22.4s, v0.s[1]
150 LDR q4, [x20], 16
151 FMLA v17.4s, v23.4s, v0.s[1]
152 FMLA v18.4s, v22.4s, v1.s[1]
153 LDR q5, [x13], 16
154 FMLA v19.4s, v23.4s, v1.s[1]
155 FMLA v28.4s, v22.4s, v2.s[1]
156 LDR q6, [x14], 16
157 FMLA v29.4s, v23.4s, v2.s[1]
158 FMLA v30.4s, v22.4s, v3.s[1]
159 LDR q7, [x15], 16
160 FMLA v31.4s, v23.4s, v3.s[1]
161 FMLA v16.4s, v24.4s, v0.s[2]
Frank Barchard387c2d12019-12-16 19:14:07 -0800162 $if PREFETCH:
163 PRFM PLDL1KEEP, [x5, 128]
XNNPACK Teamb455b122019-09-27 18:10:33 -0700164 FMLA v17.4s, v25.4s, v0.s[2]
165 FMLA v18.4s, v24.4s, v1.s[2]
Frank Barchard387c2d12019-12-16 19:14:07 -0800166 $if PREFETCH:
167 PRFM PLDL1KEEP, [x5, 192]
XNNPACK Teamb455b122019-09-27 18:10:33 -0700168 FMLA v19.4s, v25.4s, v1.s[2]
169 FMLA v28.4s, v24.4s, v2.s[2]
Frank Barchard387c2d12019-12-16 19:14:07 -0800170 $if PREFETCH:
171 PRFM PLDL1KEEP, [x5, 256]
XNNPACK Teamb455b122019-09-27 18:10:33 -0700172 FMLA v29.4s, v25.4s, v2.s[2]
173 FMLA v30.4s, v24.4s, v3.s[2]
Frank Barchard387c2d12019-12-16 19:14:07 -0800174 $if PREFETCH:
175 PRFM PLDL1KEEP, [x5, 320]
XNNPACK Teamb455b122019-09-27 18:10:33 -0700176 FMLA v31.4s, v25.4s, v3.s[2]
177 FMLA v16.4s, v26.4s, v0.s[3]
178 FMLA v17.4s, v27.4s, v0.s[3]
179 FMLA v18.4s, v26.4s, v1.s[3]
180 FMLA v19.4s, v27.4s, v1.s[3]
181 FMLA v28.4s, v26.4s, v2.s[3]
182 FMLA v29.4s, v27.4s, v2.s[3]
183 FMLA v30.4s, v26.4s, v3.s[3]
184 FMLA v31.4s, v27.4s, v3.s[3]
185
186 # Second block of 4. FMA for second 4, loads for 1st block of 4 of the next pass.
187 FMLA v16.4s, v8.4s, v4.s[0]
188 LDP q20, q21, [x5], 32
189 FMLA v17.4s, v9.4s, v4.s[0]
190 FMLA v18.4s, v8.4s, v5.s[0]
191 LDP q22, q23, [x5], 32
192 FMLA v19.4s, v9.4s, v5.s[0]
193 FMLA v28.4s, v8.4s, v6.s[0]
194 LDP q24, q25, [x5], 32
195 FMLA v29.4s, v9.4s, v6.s[0]
196 FMLA v30.4s, v8.4s, v7.s[0]
197 LDP q26, q27, [x5], 32
198 FMLA v31.4s, v9.4s, v7.s[0]
199 FMLA v16.4s, v10.4s, v4.s[1]
200 LDR q0, [x20], 16
201 FMLA v17.4s, v11.4s, v4.s[1]
202 FMLA v18.4s, v10.4s, v5.s[1]
203 LDR q1, [x13], 16
204 FMLA v19.4s, v11.4s, v5.s[1]
205 FMLA v28.4s, v10.4s, v6.s[1]
206 LDR q2, [x14], 16
207 FMLA v29.4s, v11.4s, v6.s[1]
208 FMLA v30.4s, v10.4s, v7.s[1]
209 LDR q3, [x15], 16
210 FMLA v31.4s, v11.4s, v7.s[1]
211 FMLA v16.4s, v12.4s, v4.s[2]
212 FMLA v17.4s, v13.4s, v4.s[2]
213 FMLA v18.4s, v12.4s, v5.s[2]
214 FMLA v19.4s, v13.4s, v5.s[2]
215 FMLA v28.4s, v12.4s, v6.s[2]
216 FMLA v29.4s, v13.4s, v6.s[2]
217 FMLA v30.4s, v12.4s, v7.s[2]
218 FMLA v31.4s, v13.4s, v7.s[2]
219 FMLA v16.4s, v14.4s, v4.s[3]
220 FMLA v17.4s, v15.4s, v4.s[3]
221 FMLA v18.4s, v14.4s, v5.s[3]
222 FMLA v19.4s, v15.4s, v5.s[3]
223 FMLA v28.4s, v14.4s, v6.s[3]
224 FMLA v29.4s, v15.4s, v6.s[3]
225 SUBS x0, x0, 32 // k -= 32; flags are consumed by B.HS below (the FMLAs in between leave them untouched)
226 FMLA v30.4s, v14.4s, v7.s[3]
227 FMLA v31.4s, v15.4s, v7.s[3]
228
229 B.HS 2b // loop while k was at least 32 before the subtraction
230
2313:
232 # Epilogue
# Same arithmetic as one main loop pass, but nothing is loaded for a following
# pass. It consumes the A/B registers already loaded by the prologue or by the
# last main loop pass.
233 # First block of 4. FMA for first 4, loads for 2nd block of 4.
234 FMLA v16.4s, v20.4s, v0.s[0]
235 LDP q8, q9, [x5], 32
236 FMLA v17.4s, v21.4s, v0.s[0]
237 FMLA v18.4s, v20.4s, v1.s[0]
238 LDP q10, q11, [x5], 32
239 FMLA v19.4s, v21.4s, v1.s[0]
240 FMLA v28.4s, v20.4s, v2.s[0]
241 LDP q12, q13, [x5], 32
242 FMLA v29.4s, v21.4s, v2.s[0]
243 FMLA v30.4s, v20.4s, v3.s[0]
244 LDP q14, q15, [x5], 32
245 FMLA v31.4s, v21.4s, v3.s[0]
246 FMLA v16.4s, v22.4s, v0.s[1]
247 LDR q4, [x20], 16
248 FMLA v17.4s, v23.4s, v0.s[1]
249 FMLA v18.4s, v22.4s, v1.s[1]
250 LDR q5, [x13], 16
251 FMLA v19.4s, v23.4s, v1.s[1]
252 FMLA v28.4s, v22.4s, v2.s[1]
253 LDR q6, [x14], 16
254 FMLA v29.4s, v23.4s, v2.s[1]
255 FMLA v30.4s, v22.4s, v3.s[1]
256 LDR q7, [x15], 16
257 FMLA v31.4s, v23.4s, v3.s[1]
258 FMLA v16.4s, v24.4s, v0.s[2]
259 FMLA v17.4s, v25.4s, v0.s[2]
260 FMLA v18.4s, v24.4s, v1.s[2]
261 FMLA v19.4s, v25.4s, v1.s[2]
262 FMLA v28.4s, v24.4s, v2.s[2]
263 FMLA v29.4s, v25.4s, v2.s[2]
264 FMLA v30.4s, v24.4s, v3.s[2]
265 FMLA v31.4s, v25.4s, v3.s[2]
266 FMLA v16.4s, v26.4s, v0.s[3]
267 FMLA v17.4s, v27.4s, v0.s[3]
268 FMLA v18.4s, v26.4s, v1.s[3]
269 FMLA v19.4s, v27.4s, v1.s[3]
270 FMLA v28.4s, v26.4s, v2.s[3]
271 FMLA v29.4s, v27.4s, v2.s[3]
272 FMLA v30.4s, v26.4s, v3.s[3]
273 FMLA v31.4s, v27.4s, v3.s[3]
274
275 # Second block of 4. FMA for second 4, no loads
276 FMLA v16.4s, v8.4s, v4.s[0]
277 FMLA v17.4s, v9.4s, v4.s[0]
278 FMLA v18.4s, v8.4s, v5.s[0]
279 FMLA v19.4s, v9.4s, v5.s[0]
280 FMLA v28.4s, v8.4s, v6.s[0]
281 FMLA v29.4s, v9.4s, v6.s[0]
282 FMLA v30.4s, v8.4s, v7.s[0]
283 FMLA v31.4s, v9.4s, v7.s[0]
284 FMLA v16.4s, v10.4s, v4.s[1]
285 FMLA v17.4s, v11.4s, v4.s[1]
286 FMLA v18.4s, v10.4s, v5.s[1]
287 FMLA v19.4s, v11.4s, v5.s[1]
288 FMLA v28.4s, v10.4s, v6.s[1]
289 FMLA v29.4s, v11.4s, v6.s[1]
290 FMLA v30.4s, v10.4s, v7.s[1]
291 FMLA v31.4s, v11.4s, v7.s[1]
292 FMLA v16.4s, v12.4s, v4.s[2]
293 FMLA v17.4s, v13.4s, v4.s[2]
294 FMLA v18.4s, v12.4s, v5.s[2]
295 FMLA v19.4s, v13.4s, v5.s[2]
296 FMLA v28.4s, v12.4s, v6.s[2]
297 FMLA v29.4s, v13.4s, v6.s[2]
298 FMLA v30.4s, v12.4s, v7.s[2]
299 FMLA v31.4s, v13.4s, v7.s[2]
300
301 FMLA v16.4s, v14.4s, v4.s[3]
302 FMLA v17.4s, v15.4s, v4.s[3]
303 FMLA v18.4s, v14.4s, v5.s[3]
304 FMLA v19.4s, v15.4s, v5.s[3]
305
Marat Dukhaneb09a6b2020-04-08 17:34:32 -0700306 # Load min/max values
# The four FMLAs above were the last readers of v4/v5 as A data, so the clamp
# values can be restored here; the remaining FMLAs only read v6/v7.
XNNPACK Teamb455b122019-09-27 18:10:33 -0700307 LD2R {v4.4s, v5.4s}, [x8]
308
309 FMLA v28.4s, v14.4s, v6.s[3]
310 FMLA v29.4s, v15.4s, v6.s[3]
311 FMLA v30.4s, v14.4s, v7.s[3]
312 FMLA v31.4s, v15.4s, v7.s[3]
313
3144:
# x0 differs from kc by a multiple of 32 here, so bits 4, 3 and 2 of x0 are
# those of kc and select the 16, 8 and 4 byte remainders handled below.
315 # Remainder- 4 floats of A
316 TBZ x0, 4, 5f
317
318 LDR q0, [x20], 16
319 LDP q20, q21, [x5], 32
320 LDR q1, [x13], 16
321 LDR q2, [x14], 16
322 LDR q3, [x15], 16
323 FMLA v16.4s, v20.4s, v0.s[0]
324 FMLA v17.4s, v21.4s, v0.s[0]
325 LDP q22, q23, [x5], 32
326 FMLA v18.4s, v20.4s, v1.s[0]
327 FMLA v19.4s, v21.4s, v1.s[0]
328 LDP q24, q25, [x5], 32
329 FMLA v28.4s, v20.4s, v2.s[0]
330 FMLA v29.4s, v21.4s, v2.s[0]
331 LDP q26, q27, [x5], 32
332 FMLA v30.4s, v20.4s, v3.s[0]
333 FMLA v31.4s, v21.4s, v3.s[0]
334 FMLA v16.4s, v22.4s, v0.s[1]
335 FMLA v17.4s, v23.4s, v0.s[1]
336 FMLA v18.4s, v22.4s, v1.s[1]
337 FMLA v19.4s, v23.4s, v1.s[1]
338 FMLA v28.4s, v22.4s, v2.s[1]
339 FMLA v29.4s, v23.4s, v2.s[1]
340 FMLA v30.4s, v22.4s, v3.s[1]
341 FMLA v31.4s, v23.4s, v3.s[1]
342 FMLA v16.4s, v24.4s, v0.s[2]
343 FMLA v17.4s, v25.4s, v0.s[2]
344 FMLA v18.4s, v24.4s, v1.s[2]
345 FMLA v19.4s, v25.4s, v1.s[2]
346 FMLA v28.4s, v24.4s, v2.s[2]
347 FMLA v29.4s, v25.4s, v2.s[2]
348 FMLA v30.4s, v24.4s, v3.s[2]
349 FMLA v31.4s, v25.4s, v3.s[2]
350 FMLA v16.4s, v26.4s, v0.s[3]
351 FMLA v17.4s, v27.4s, v0.s[3]
352 FMLA v18.4s, v26.4s, v1.s[3]
353 FMLA v19.4s, v27.4s, v1.s[3]
354 FMLA v28.4s, v26.4s, v2.s[3]
355 FMLA v29.4s, v27.4s, v2.s[3]
356 FMLA v30.4s, v26.4s, v3.s[3]
357 FMLA v31.4s, v27.4s, v3.s[3]
358
3595:
360 # Remainder- 2 floats of A
361 TBZ x0, 3, 6f
362
363 LDR d0, [x20], 8
364 LDP q20, q21, [x5], 32
365 LDR d1, [x13], 8
366 LDR d2, [x14], 8
367 LDR d3, [x15], 8
368 FMLA v16.4s, v20.4s, v0.s[0]
369 FMLA v17.4s, v21.4s, v0.s[0]
370 LDP q22, q23, [x5], 32
371 FMLA v18.4s, v20.4s, v1.s[0]
372 FMLA v19.4s, v21.4s, v1.s[0]
373 FMLA v28.4s, v20.4s, v2.s[0]
374 FMLA v29.4s, v21.4s, v2.s[0]
375 FMLA v30.4s, v20.4s, v3.s[0]
376 FMLA v31.4s, v21.4s, v3.s[0]
377 FMLA v16.4s, v22.4s, v0.s[1]
378 FMLA v17.4s, v23.4s, v0.s[1]
379 FMLA v18.4s, v22.4s, v1.s[1]
380 FMLA v19.4s, v23.4s, v1.s[1]
381 FMLA v28.4s, v22.4s, v2.s[1]
382 FMLA v29.4s, v23.4s, v2.s[1]
383 FMLA v30.4s, v22.4s, v3.s[1]
384 FMLA v31.4s, v23.4s, v3.s[1]
385
3866:
387 # Remainder- 1 float of A
388 TBZ x0, 2, 7f
389
390 LDR s0, [x20], 4
391 LDP q20, q21, [x5], 32
392 LDR s1, [x13], 4
393 LDR s2, [x14], 4
394 LDR s3, [x15], 4
395 FMLA v16.4s, v20.4s, v0.s[0]
396 FMLA v17.4s, v21.4s, v0.s[0]
397 FMLA v18.4s, v20.4s, v1.s[0]
398 FMLA v19.4s, v21.4s, v1.s[0]
399 FMLA v28.4s, v20.4s, v2.s[0]
400 FMLA v29.4s, v21.4s, v2.s[0]
401 FMLA v30.4s, v20.4s, v3.s[0]
402 FMLA v31.4s, v21.4s, v3.s[0]
403
4047:
405 # ks loop
406 SUBS x9, x9, 32 // ks -= MR * sizeof(void*)
Frank Barchard16d72722020-02-12 15:46:20 -0800407 B.HI 1b // loop while pointer bytes remain
XNNPACK Teamb455b122019-09-27 18:10:33 -0700408
409 # Clamp
# acc = min(max(acc, v4), v5) for all eight accumulator registers.
Marat Dukhana51cf482020-04-08 16:16:19 -0700410 FMAX v16.4s, v16.4s, v4.4s
411 FMAX v17.4s, v17.4s, v4.4s
412 FMAX v18.4s, v18.4s, v4.4s
413 FMAX v19.4s, v19.4s, v4.4s
414 FMAX v28.4s, v28.4s, v4.4s
415 FMAX v29.4s, v29.4s, v4.4s
416 FMAX v30.4s, v30.4s, v4.4s
417 FMAX v31.4s, v31.4s, v4.4s
418 FMIN v16.4s, v16.4s, v5.4s
419 FMIN v17.4s, v17.4s, v5.4s
420 FMIN v18.4s, v18.4s, v5.4s
421 FMIN v19.4s, v19.4s, v5.4s
422 FMIN v28.4s, v28.4s, v5.4s
423 FMIN v29.4s, v29.4s, v5.4s
424 FMIN v30.4s, v30.4s, v5.4s
425 FMIN v31.4s, v31.4s, v5.4s
XNNPACK Teamb455b122019-09-27 18:10:33 -0700426
427 # Store full 4 x 8
Frank Barchard6383f492019-12-04 22:33:49 -0800428 SUBS x1, x1, 8 // nc -= 8; the flags are used twice: B.LO here and B.HI 0b below
XNNPACK Teamb455b122019-09-27 18:10:33 -0700429 B.LO 8f // fewer than 8 columns left: partial store
430
# Rows are written from c3 down to c0; each pointer then advances by cn_stride.
431 STP q30, q31, [x7]
432 ADD x7, x7, x10
433 STP q28, q29, [x17]
434 ADD x17, x17, x10
435 STP q18, q19, [x16]
436 ADD x16, x16, x10
437 STP q16, q17, [x6]
438 ADD x6, x6, x10
439
440 SUB x4, x4, x3 // a -= ks (rewind the A pointer array for the next column block)
441
442 # nc loop
XNNPACK Teamb455b122019-09-27 18:10:33 -0700443 B.HI 0b // flags are still those of SUBS x1 above: loop while columns remain
444
445 # Restore d8-d15 from stack
446 LDP d14, d15, [sp, 64]
447 LDP d12, d13, [sp, 48]
448 LDP d10, d11, [sp, 32]
449 LDP d8, d9, [sp, 16]
450
451 # Restore x20 from stack
452 LDR x20, [sp], 80
453 RET
454
455 # Store odd width
# x1 has gone below zero here, but its low 3 bits are those of the remaining
# column count: bit 2 stores 4 floats per row, bit 1 stores 2, bit 0 stores 1.
# After each partial store the not yet written lanes are moved down to the
# bottom of the register that the next store reads.
4568:
457 TBZ x1, 2, 9f
458 STR q30, [x7], 16
459 MOV v30.16b, v31.16b
460 STR q28, [x17], 16
461 MOV v28.16b, v29.16b
462 STR q18, [x16], 16
463 MOV v18.16b, v19.16b
464 STR q16, [x6], 16
465 MOV v16.16b, v17.16b
466
4679:
468 TBZ x1, 1, 10f
469 STR d30, [x7], 8
470 DUP d30, v30.d[1]
471 STR d28, [x17], 8
472 DUP d28, v28.d[1]
473 STR d18, [x16], 8
474 DUP d18, v18.d[1]
475 STR d16, [x6], 8
476 DUP d16, v16.d[1]
477
47810:
479 TBZ x1, 0, 11f
480 STR s30, [x7]
481 STR s28, [x17]
482 STR s18, [x16]
483 STR s16, [x6]
48411:
485 # Restore d8-d15 from stack
486 LDP d14, d15, [sp, 64]
487 LDP d12, d13, [sp, 48]
488 LDP d10, d11, [sp, 32]
489 LDP d8, d9, [sp, 16]
490
491 # Restore x20 from stack
492 LDR x20, [sp], 80
493 RET
494
Frank Barchard387c2d12019-12-16 19:14:07 -0800495END_FUNCTION xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}
XNNPACK Teamb455b122019-09-27 18:10:33 -0700496
// On ELF targets only, emit an empty .note.GNU-stack section with no flags.
// NOTE(review): presumably this marks the object as not needing an executable
// stack - confirm against the GNU toolchain documentation.
497#ifdef __ELF__
498.section ".note.GNU-stack","",%progbits
499#endif