// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_${"prfm_" if PREFETCH else ""}cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Overview (repeated once per 8-column tile of C while nc remains):
#   1. Initialize the 4x8 accumulators: from acc when INC is set, otherwise from
#      the 8 bias floats at the head of w, copied to all 4 rows.
#   2. Accumulate with FMLA (B vector times one A lane). The main loop is
#      software pipelined and consumes 8 floats (32 bytes) of each A row per
#      iteration; remainders of 4, 2 and 1 floats are handled afterwards.
#   3. Clamp the accumulators with FMAX/FMIN against the two values loaded from
#      params, then store 8 floats per row (or 4/2/1 for a partial last tile).
# kc is counted in bytes. x0 (mr) is reused as the k counter once the row
# pointers have been set up.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# Vector register usage
# A0   v0  v4
# A1   v1  v5
# A2   v2  v6
# A3   v3  v7
# B    v8  v9 v10 v11
# B   v12 v13 v14 v15
# B   v20 v21 v22 v23
# B   v24 v25 v26 v27
# C   v16 v17
# C   v18 v19
# C   v28 v29
# C   v30 v31
# Clamp v4 v5

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_${"prfm_" if PREFETCH else ""}cortex_a75

        # Stack arguments are read before sp is moved by the d8-d15 save below.
        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load min/max values
        LD2R {v4.4s, v5.4s}, [x8]

        # Save d8-d15 on stack
        STP d8, d9, [sp, -64]!
        STP d10, d11, [sp, 16]
        STP d12, d13, [sp, 32]
        STP d14, d15, [sp, 48]

        # Clamp A and C pointers
        # Rows at index >= mr alias the previous row, so they read and write
        # the same memory as that row.
        CMP x0, 2                // if mr < 2
        ADD x11, x3, x4          // a1 = a0 + a_stride
        ADD x9, x6, x7           // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO    //   a1 = a0
        CSEL x9, x6, x9, LO      //   c1 = c0

        ADD x12, x11, x4         // a2 = a1 + a_stride
        ADD x10, x9, x7          // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x12, x11, x12, LS   //   a2 = a1
        CSEL x10, x9, x10, LS    //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x4, x12, x4          // a3 = a2 + a_stride
        ADD x7, x10, x7          // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO     //   a3 = a2
        CSEL x7, x10, x7, LO     //   c3 = c2

0:
        $if INC:
          # Load initial accumulators
          LDP q16, q17, [x15], 32
          LDP q18, q19, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          LDP q16, q17, [x5], 32
          MOV v18.16b, v16.16b
          MOV v19.16b, v17.16b
          MOV v28.16b, v16.16b
          MOV v29.16b, v17.16b
          MOV v30.16b, v16.16b
          MOV v31.16b, v17.16b

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 3f

        # Prologue
        # Read first block of 4 A (16 bytes per row) and B.
        LDR q0, [x3], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3, [x4], 16
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32

        # Are there at least 32 more bytes (8 floats)? If so, run the main loop.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x3], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x11], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x12], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x4], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 192]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 320]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4
        # of the next iteration (or of the epilogue).
        FMLA v16.4s, v8.4s, v4.s[0]
        LDP q20, q21, [x5], 32
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        LDP q22, q23, [x5], 32
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        LDP q24, q25, [x5], 32
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        LDP q26, q27, [x5], 32
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        LDR q0, [x3], 16
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        LDR q1, [x11], 16
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        LDR q2, [x12], 16
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        LDR q3, [x4], 16
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]
        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]
        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        SUBS x0, x0, 32
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]
        B.HS 1b

2:
        # Epilogue
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x3], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x11], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x12], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x4], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, no loads
        FMLA v16.4s, v8.4s, v4.s[0]
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        FMLA v31.4s, v9.4s, v7.s[0]

        FMLA v16.4s, v10.4s, v4.s[1]
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        FMLA v31.4s, v11.4s, v7.s[1]

        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]

        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]

        # Load min/max values
        # v4/v5 held A data above and are no longer read as A from here on
        # (the remaining FMLAs use v6/v7), so the clamp values are reloaded.
        LD2R {v4.4s, v5.4s}, [x8]

        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

3:
        # Remainder- 4 floats of A (16 bytes)
        # Bits 4, 3 and 2 of x0 select the 16, 8 and 4 byte remainders.
        TBZ x0, 4, 4f

        LDR q0, [x3], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3, [x4], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q24, q25, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        LDP q26, q27, [x5], 32
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

4:
        # Remainder- 2 floats of A (8 bytes)
        TBZ x0, 3, 5f

        LDR d0, [x3], 8
        LDP q20, q21, [x5], 32
        LDR d1, [x11], 8
        LDR d2, [x12], 8
        LDR d3, [x4], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]

5:
        # Remainder- 1 float of A (4 bytes)
        TBZ x0, 2, 6f

        LDR s0, [x3], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x11], 4
        LDR s2, [x12], 4
        LDR s3, [x4], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]

6:
        # Clamp
        # SUBS sets the flags consumed by B.LO 7f and B.HI 0b below; nothing
        # between here and those branches writes the flags.
        FMAX v16.4s, v16.4s, v4.4s
        SUBS x1, x1, 8
        FMAX v17.4s, v17.4s, v4.4s
        FMAX v18.4s, v18.4s, v4.4s
        FMAX v19.4s, v19.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s
        FMIN v18.4s, v18.4s, v5.4s
        FMIN v19.4s, v19.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 8 (taken only when at least 8 columns remained)
        B.LO 7f

        $if INC:
          STP q30, q31, [x7]
          SUB x3, x3, x2      // a0 -= kc
          ADD x7, x7, x14
          STP q28, q29, [x10]
          SUB x11, x11, x2    // a1 -= kc
          ADD x10, x10, x14
          STP q18, q19, [x9]
          SUB x12, x12, x2    // a2 -= kc
          ADD x9, x9, x14
          STP q16, q17, [x6]
          SUB x4, x4, x2      // a3 -= kc
          ADD x6, x6, x14
        $else:
          STP q16, q17, [x6]
          SUB x3, x3, x2      // a0 -= kc
          ADD x6, x6, x14
          STP q18, q19, [x9]
          SUB x11, x11, x2    // a1 -= kc
          ADD x9, x9, x14
          STP q28, q29, [x10]
          SUB x12, x12, x2    // a2 -= kc
          ADD x10, x10, x14
          STP q30, q31, [x7]
          SUB x4, x4, x2      // a3 -= kc
          ADD x7, x7, x14

        # Next 8-column tile while nc > 0
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

        # Store odd width
        # Bits 2, 1 and 0 of x1 select stores of 4, 2 and 1 floats per row.
7:
        TBZ x1, 2, 8f
        $if INC:
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x10], 16
          MOV v28.16b, v29.16b
          STR q18, [x9], 16
          MOV v18.16b, v19.16b
          STR q16, [x6], 16
          MOV v16.16b, v17.16b
        $else:
          STR q16, [x6], 16
          MOV v16.16b, v17.16b
          STR q18, [x9], 16
          MOV v18.16b, v19.16b
          STR q28, [x10], 16
          MOV v28.16b, v29.16b
          STR q30, [x7], 16
          MOV v30.16b, v31.16b

8:
        TBZ x1, 1, 9f
        $if INC:
          STR d30, [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x10], 8
          DUP d28, v28.d[1]
          STR d18, [x9], 8
          DUP d18, v18.d[1]
          STR d16, [x6], 8
          DUP d16, v16.d[1]
        $else:
          STR d16, [x6], 8
          DUP d16, v16.d[1]
          STR d18, [x9], 8
          DUP d18, v18.d[1]
          STR d28, [x10], 8
          DUP d28, v28.d[1]
          STR d30, [x7], 8
          DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        $if INC:
          STR s30, [x7]
          STR s28, [x10]
          STR s18, [x9]
          STR s16, [x6]
        $else:
          STR s16, [x6]
          STR s18, [x9]
          STR s28, [x10]
          STR s30, [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET


END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_${"prfm_" if PREFETCH else ""}cortex_a75

// On ELF targets, emit an empty .note.GNU-stack section so the linker does not
// mark the stack as executable because of this object file.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif