// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Computes a 4-row by 8-column tile of float outputs per pass over kc, clamps
# it to the two bounds read from params, and stores it to c. The pass is
# repeated while nc > 8 remains; a final partial tile stores 4, 2 and/or 1
# columns according to the low bits of nc.
#
# x0 holds mr only until the A/C pointers have been clamped; after that it is
# reused as the remaining-k byte counter (derived from kc in x2).

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# Vector register usage
# A0   v0  v4
# A1   v1  v5
# A2   v2  v6
# A3   v3  v7
# B    v8  v9 v10 v11
# B   v12 v13 v14 v15
# B   v20 v21 v22 v23
# B   v24 v25 v26 v27
# C   v16 v17
# C   v18 v19
# C   v28 v29
# C   v30 v31
# Clamp v4 v5

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

        # Stack arguments are read before sp is moved by the d8-d15 spill
        # below, so the offsets here are relative to the incoming sp.
        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load min/max values.
        # v4 = first float at params, replicated (used as the FMAX operand);
        # v5 = second float at params, replicated (used as the FMIN operand).
        LD2R {v4.4s, v5.4s}, [x8]

        # Save d8-d15 on stack
        STP d8, d9, [sp, -64]!
        STP d10, d11, [sp, 16]
        STP d12, d13, [sp, 32]
        STP d14, d15, [sp, 48]

        # Clamp A and C pointers.
        # Rows beyond mr alias the previous row, so every load and store
        # below stays inside the rows the caller actually provided.
        CMP x0, 2                 // if mr < 2
        ADD x11, x3, x4           // a1 = a0 + a_stride
        ADD x9, x6, x7            // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO     //   a1 = a0
        CSEL x9, x6, x9, LO       //   c1 = c0

        ADD x12, x11, x4          // a2 = a1 + a_stride
        ADD x10, x9, x7           // c2 = c1 + cm_stride
                                  // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x12, x11, x12, LS    //   a2 = a1
        CSEL x10, x9, x10, LS     //   c2 = c1

        CMP x0, 4                 // if mr < 4
        ADD x4, x12, x4           // a3 = a2 + a_stride
        ADD x7, x10, x7           // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO      //   a3 = a2
        CSEL x7, x10, x7, LO      //   c3 = c2

        # Outer loop: one pass per tile of up to 8 output columns.
0:
        $if INC:
          # Load initial accumulators
          LDP q16, q17, [x15], 32
          LDP q18, q19, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          LDP q16, q17, [x5], 32
          MOV v18.16b, v16.16b
          MOV v19.16b, v17.16b
          MOV v28.16b, v16.16b
          MOV v29.16b, v17.16b
          MOV v30.16b, v16.16b
          MOV v31.16b, v17.16b

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32           // k = kc - 32
        B.LO 3f

        # Prologue - loads for the first block of 4 floats of A (16 bytes
        # per row) and the matching 4 x 8 floats of B.
        LDR q0, [x3], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3, [x4], 16
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32

        # Are there at least 32 more bytes of A? If so, run the main loop.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x3], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x11], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x12], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x4], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 192]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 320]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        # Note: v4/v5 hold A data here, not the clamp bounds.
        FMLA v16.4s, v8.4s, v4.s[0]
        LDP q20, q21, [x5], 32
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        LDP q22, q23, [x5], 32
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        LDP q24, q25, [x5], 32
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        LDP q26, q27, [x5], 32
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        LDR q0, [x3], 16
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        LDR q1, [x11], 16
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        LDR q2, [x12], 16
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        LDR q3, [x4], 16
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]
        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]
        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        SUBS x0, x0, 32
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]
        B.HS 1b

2:
        # Epilogue
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x3], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x11], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x12], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x4], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, no loads
        FMLA v16.4s, v8.4s, v4.s[0]
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        FMLA v31.4s, v9.4s, v7.s[0]

        FMLA v16.4s, v10.4s, v4.s[1]
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        FMLA v31.4s, v11.4s, v7.s[1]

        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]

        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]

        # Load min/max values.
        # v4/v5 were overwritten with A data above; this is placed right
        # after their last use as A operands so the clamp at 6: sees the
        # bounds again.
        LD2R {v4.4s, v5.4s}, [x8]

        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

3:
        # Remainder- 4 floats of A (16 bytes)
        # x0 differs from kc by a multiple of 32 here, so bits 4, 3 and 2
        # of x0 equal those of kc and select the 16/8/4-byte remainders.
        TBZ x0, 4, 4f

        LDR q0, [x3], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3, [x4], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q24, q25, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        LDP q26, q27, [x5], 32
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

4:
        # Remainder- 2 floats of A (8 bytes)
        TBZ x0, 3, 5f

        LDR d0, [x3], 8
        LDP q20, q21, [x5], 32
        LDR d1, [x11], 8
        LDR d2, [x12], 8
        LDR d3, [x4], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]

5:
        # Remainder- 1 float of A (4 bytes)
        TBZ x0, 2, 6f

        LDR s0, [x3], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x11], 4
        LDR s2, [x12], 4
        LDR s3, [x4], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]

6:
        # Clamp
        # The SUBS on nc is interleaved here; its flags feed both the
        # B.LO (partial tile) and the B.HI (another tile) below, since no
        # instruction in between writes the flags.
        FMAX v16.4s, v16.4s, v4.4s
        SUBS x1, x1, 8
        FMAX v17.4s, v17.4s, v4.4s
        FMAX v18.4s, v18.4s, v4.4s
        FMAX v19.4s, v19.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s
        FMIN v18.4s, v18.4s, v5.4s
        FMIN v19.4s, v19.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 8 (fewer than 8 columns left -> partial store at 7:)
        B.LO 7f

        # Each store is followed by rewinding that row's A pointer by kc
        # and advancing its C pointer by cn_stride for the next tile.
        $if INC:
          STP q30, q31, [x7]
          SUB x3, x3, x2          // a0 -= kc
          ADD x7, x7, x14
          STP q28, q29, [x10]
          SUB x11, x11, x2        // a1 -= kc
          ADD x10, x10, x14
          STP q18, q19, [x9]
          SUB x12, x12, x2        // a2 -= kc
          ADD x9, x9, x14
          STP q16, q17, [x6]
          SUB x4, x4, x2          // a3 -= kc
          ADD x6, x6, x14
        $else:
          STP q16, q17, [x6]
          SUB x3, x3, x2          // a0 -= kc
          ADD x6, x6, x14
          STP q18, q19, [x9]
          SUB x11, x11, x2        // a1 -= kc
          ADD x9, x9, x14
          STP q28, q29, [x10]
          SUB x12, x12, x2        // a2 -= kc
          ADD x10, x10, x14
          STP q30, q31, [x7]
          SUB x4, x4, x2          // a3 -= kc
          ADD x7, x7, x14

        # More columns remaining (nc was > 8 before the SUBS)?
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

        # Store odd width
        # x1 is nc - 8 here; its low three bits equal those of nc, so bits
        # 2, 1 and 0 select stores of 4, 2 and 1 columns respectively.
7:
        TBZ x1, 2, 8f
        $if INC:
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x10], 16
          MOV v28.16b, v29.16b
          STR q18, [x9], 16
          MOV v18.16b, v19.16b
          STR q16, [x6], 16
          MOV v16.16b, v17.16b
        $else:
          STR q16, [x6], 16
          MOV v16.16b, v17.16b
          STR q18, [x9], 16
          MOV v18.16b, v19.16b
          STR q28, [x10], 16
          MOV v28.16b, v29.16b
          STR q30, [x7], 16
          MOV v30.16b, v31.16b

8:
        TBZ x1, 1, 9f
        $if INC:
          STR d30, [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x10], 8
          DUP d28, v28.d[1]
          STR d18, [x9], 8
          DUP d18, v18.d[1]
          STR d16, [x6], 8
          DUP d16, v16.d[1]
        $else:
          STR d16, [x6], 8
          DUP d16, v16.d[1]
          STR d18, [x9], 8
          DUP d18, v18.d[1]
          STR d28, [x10], 8
          DUP d28, v28.d[1]
          STR d30, [x7], 8
          DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        $if INC:
          STR s30, [x7]
          STR s28, [x10]
          STR s18, [x9]
          STR s16, [x6]
        $else:
          STR s16, [x6]
          STR s18, [x9]
          STR s28, [x10]
          STR s30, [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET


END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}
// On ELF targets, emit an empty .note.GNU-stack section; by GNU toolchain
// convention this tells the linker the object does not need an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif