// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-ld128.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t*restrict a,  x3
#     size_t a_stride,           x4
#     const void*restrict w,     x5
#     uint8_t*restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x0 (reloaded before every full-width store)
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 16] -> x8

# Computes up to 6 rows x 8 columns of C per outer iteration: the accumulators
# are initialised from acc (12 x 128-bit loads), updated with FMLA using A
# values broadcast by lane and 8-float rows read sequentially from w, clamped
# with FMAX against v6 and FMIN against v7, and stored to C.
# kc is consumed in bytes: 16 per main-loop iteration, then an 8-byte and a
# 4-byte remainder step.
# NOTE(review): kc == 0 would fall into the 1-float remainder step; callers
# are presumably required to pass a non-zero kc - confirm.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is the platform register on several ABIs and is deliberately not used.

# x0 roles
# x0  mr        (entry, only read while clamping the A and C pointers)
# x0  k         (bytes of kc still to process, inside the K loops)
# x0  cn_stride (reloaded from [sp] at label 4, used by the full-width stores)

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128

        # Clamp A and C pointers
        # Rows at index >= mr alias the previous row, so every load and store
        # below stays inside memory that belongs to a valid row.
        CMP     x0, 2                   // if mr < 2
        ADD     x9, x3, x4              // a1 = a0 + a_stride
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO          //   a1 = a0
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x10, x9, x4             // a2 = a1 + a_stride
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x10, x9, x10, LS        //   a2 = a1
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x11, x10, x4            // a3 = a2 + a_stride
        ADD     x14, x17, x7            // c3 = c2 + cm_stride
        CSEL    x11, x10, x11, LO       //   a3 = a2
        CSEL    x14, x17, x14, LO       //   c3 = c2

        ADD     x12, x11, x4            // a4 = a3 + a_stride
        ADD     x13, x14, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4 (flags still from CMP x0, 4)
        CSEL    x12, x11, x12, LS       //   a4 = a3
        CSEL    x13, x14, x13, LS       //   c4 = c3

        # Load acc, params pointer
        LDP     x15, x8, [sp, 8]

        CMP     x0, 6                   // if mr < 6
        ADD     x4, x12, x4             // a5 = a4 + a_stride
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x4, x12, x4, LO         //   a5 = a4
        CSEL    x7, x13, x7, LO         //   c5 = c4

        # Load min/max values
        # First float at [x8] is broadcast to v6 (FMAX operand, lower bound),
        # second float to v7 (FMIN operand, upper bound).
        LD2R    {v6.4s, v7.4s}, [x8]

0:
        # Load initial accumulators
        LDP     q20, q21, [x15], 32
        LDP     q22, q23, [x15], 32
        LDP     q24, q25, [x15], 32
        LDP     q26, q27, [x15], 32
        LDP     q28, q29, [x15], 32
        LDP     q30, q31, [x15], 32
        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
        PRFM    PLDL1KEEP, [x5, 64]
        PRFM    PLDL1KEEP, [x5, 128]
        PRFM    PLDL1KEEP, [x5, 192]
        PRFM    PLDL1KEEP, [x3]         // Prefetch A
        PRFM    PLDL1KEEP, [x9]
        PRFM    PLDL1KEEP, [x10]
        PRFM    PLDL1KEEP, [x11]
        PRFM    PLDL1KEEP, [x12]
        PRFM    PLDL1KEEP, [x4]

        # Is there at least 4 floats (16 bytes)?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 6 ld128 A + 4 LDP B
1:
        LDR     q0, [x3], 16
        LDP     q16, q17, [x5], 32
        LDR     q1, [x9], 16
        LDR     q2, [x10], 16
        LDR     q3, [x11], 16
        LDR     q4, [x12], 16
        LDR     q5, [x4], 16
        FMLA    v20.4s, v16.4s, v0.s[0]
        FMLA    v22.4s, v16.4s, v1.s[0]
        FMLA    v24.4s, v16.4s, v2.s[0]
        FMLA    v26.4s, v16.4s, v3.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v28.4s, v16.4s, v4.s[0]
        FMLA    v30.4s, v16.4s, v5.s[0]
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v1.s[0]
        FMLA    v25.4s, v17.4s, v2.s[0]
        FMLA    v27.4s, v17.4s, v3.s[0]
        FMLA    v29.4s, v17.4s, v4.s[0]
        FMLA    v31.4s, v17.4s, v5.s[0]

        FMLA    v20.4s, v18.4s, v0.s[1]
        LDP     q16, q17, [x5], 32
        FMLA    v22.4s, v18.4s, v1.s[1]
        FMLA    v24.4s, v18.4s, v2.s[1]
        FMLA    v26.4s, v18.4s, v3.s[1]
        FMLA    v28.4s, v18.4s, v4.s[1]
        FMLA    v30.4s, v18.4s, v5.s[1]
        FMLA    v21.4s, v19.4s, v0.s[1]
        FMLA    v23.4s, v19.4s, v1.s[1]
        FMLA    v25.4s, v19.4s, v2.s[1]
        FMLA    v27.4s, v19.4s, v3.s[1]
        FMLA    v29.4s, v19.4s, v4.s[1]
        FMLA    v31.4s, v19.4s, v5.s[1]

        FMLA    v20.4s, v16.4s, v0.s[2]
        LDP     q18, q19, [x5], 32
        FMLA    v22.4s, v16.4s, v1.s[2]
        FMLA    v24.4s, v16.4s, v2.s[2]
        FMLA    v26.4s, v16.4s, v3.s[2]
        FMLA    v28.4s, v16.4s, v4.s[2]
        FMLA    v30.4s, v16.4s, v5.s[2]
        FMLA    v21.4s, v17.4s, v0.s[2]
        FMLA    v23.4s, v17.4s, v1.s[2]
        FMLA    v25.4s, v17.4s, v2.s[2]
        FMLA    v27.4s, v17.4s, v3.s[2]
        FMLA    v29.4s, v17.4s, v4.s[2]
        FMLA    v31.4s, v17.4s, v5.s[2]

        FMLA    v20.4s, v18.4s, v0.s[3]
        FMLA    v22.4s, v18.4s, v1.s[3]
        FMLA    v24.4s, v18.4s, v2.s[3]
        FMLA    v26.4s, v18.4s, v3.s[3]
        FMLA    v28.4s, v18.4s, v4.s[3]
        FMLA    v30.4s, v18.4s, v5.s[3]
        FMLA    v21.4s, v19.4s, v0.s[3]
        FMLA    v23.4s, v19.4s, v1.s[3]
        FMLA    v25.4s, v19.4s, v2.s[3]
        FMLA    v27.4s, v19.4s, v3.s[3]
        SUBS    x0, x0, 16              // k -= 16; FMLA below leaves the flags intact
        FMLA    v29.4s, v19.4s, v4.s[3]
        FMLA    v31.4s, v19.4s, v5.s[3]
        B.HS    1b

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        TST     x0, 15
        B.NE    5f

4:
        # Load cn_stride
        # x0 (k) is dead on every path that reaches this label.
        LDR     x0, [sp]

        # Clamp
        FMAX    v20.4s, v20.4s, v6.4s
        SUBS    x1, x1, 8               // nc -= 8; flags consumed by B.LO 7f and B.HI 0b
        FMAX    v21.4s, v21.4s, v6.4s
        FMAX    v22.4s, v22.4s, v6.4s
        FMAX    v23.4s, v23.4s, v6.4s
        FMAX    v24.4s, v24.4s, v6.4s
        FMAX    v25.4s, v25.4s, v6.4s
        FMAX    v26.4s, v26.4s, v6.4s
        FMAX    v27.4s, v27.4s, v6.4s
        FMAX    v28.4s, v28.4s, v6.4s
        FMAX    v29.4s, v29.4s, v6.4s
        FMAX    v30.4s, v30.4s, v6.4s
        FMAX    v31.4s, v31.4s, v6.4s
        FMIN    v20.4s, v20.4s, v7.4s
        FMIN    v21.4s, v21.4s, v7.4s
        FMIN    v22.4s, v22.4s, v7.4s
        FMIN    v23.4s, v23.4s, v7.4s
        FMIN    v24.4s, v24.4s, v7.4s
        FMIN    v25.4s, v25.4s, v7.4s
        FMIN    v26.4s, v26.4s, v7.4s
        FMIN    v27.4s, v27.4s, v7.4s
        FMIN    v28.4s, v28.4s, v7.4s
        FMIN    v29.4s, v29.4s, v7.4s
        FMIN    v30.4s, v30.4s, v7.4s
        FMIN    v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO    7f

        ST1     {v30.16b, v31.16b}, [x7], x0
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v28.16b, v29.16b}, [x13], x0
        SUB     x9, x9, x2              // a1 -= kc
        ST1     {v26.16b, v27.16b}, [x14], x0
        SUB     x10, x10, x2            // a2 -= kc
        ST1     {v24.16b, v25.16b}, [x17], x0
        SUB     x11, x11, x2            // a3 -= kc
        ST1     {v22.16b, v23.16b}, [x16], x0
        SUB     x12, x12, x2            // a4 -= kc
        ST1     {v20.16b, v21.16b}, [x6], x0
        SUB     x4, x4, x2              // a5 -= kc

        B.HI    0b
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ     x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR     d0, [x3], 8
        LDP     q16, q17, [x5], 32
        LDR     d1, [x9], 8
        LDR     d2, [x10], 8
        LDR     d3, [x11], 8
        LDR     d4, [x12], 8
        LDR     d5, [x4], 8
        FMLA    v20.4s, v16.4s, v0.s[0]
        FMLA    v22.4s, v16.4s, v1.s[0]
        FMLA    v24.4s, v16.4s, v2.s[0]
        FMLA    v26.4s, v16.4s, v3.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v28.4s, v16.4s, v4.s[0]
        FMLA    v30.4s, v16.4s, v5.s[0]
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v1.s[0]
        FMLA    v25.4s, v17.4s, v2.s[0]
        FMLA    v27.4s, v17.4s, v3.s[0]
        FMLA    v29.4s, v17.4s, v4.s[0]
        FMLA    v31.4s, v17.4s, v5.s[0]

        FMLA    v20.4s, v18.4s, v0.s[1]
        FMLA    v22.4s, v18.4s, v1.s[1]
        FMLA    v24.4s, v18.4s, v2.s[1]
        FMLA    v26.4s, v18.4s, v3.s[1]
        FMLA    v28.4s, v18.4s, v4.s[1]
        FMLA    v30.4s, v18.4s, v5.s[1]
        FMLA    v21.4s, v19.4s, v0.s[1]
        FMLA    v23.4s, v19.4s, v1.s[1]
        FMLA    v25.4s, v19.4s, v2.s[1]
        FMLA    v27.4s, v19.4s, v3.s[1]
        FMLA    v29.4s, v19.4s, v4.s[1]
        FMLA    v31.4s, v19.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ     x0, 2, 4b

        # Remainder- 1 float of A (4 bytes)
6:
        LDR     s0, [x3], 4
        LDP     q16, q17, [x5], 32
        LDR     s1, [x9], 4
        LDR     s2, [x10], 4
        LDR     s3, [x11], 4
        LDR     s4, [x12], 4
        LDR     s5, [x4], 4
        FMLA    v20.4s, v16.4s, v0.s[0]
        FMLA    v22.4s, v16.4s, v1.s[0]
        FMLA    v24.4s, v16.4s, v2.s[0]
        FMLA    v26.4s, v16.4s, v3.s[0]
        FMLA    v28.4s, v16.4s, v4.s[0]
        FMLA    v30.4s, v16.4s, v5.s[0]
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v1.s[0]
        FMLA    v25.4s, v17.4s, v2.s[0]
        FMLA    v27.4s, v17.4s, v3.s[0]
        FMLA    v29.4s, v17.4s, v4.s[0]
        FMLA    v31.4s, v17.4s, v5.s[0]
        B       4b

        # Store odd width
        # Bits 2, 1 and 0 of x1 select a 4-, 2- and 1-float store per row; after
        # each partial store the not-yet-stored lanes are moved down to lane 0.
7:
        TBZ     x1, 2, 8f
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x14], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20, [x6], 16
        MOV     v20.16b, v21.16b

8:
        TBZ     x1, 1, 9f
        STR     d30, [x7], 8
        DUP     d30, v30.d[1]
        STR     d28, [x13], 8
        DUP     d28, v28.d[1]
        STR     d26, [x14], 8
        DUP     d26, v26.d[1]
        STR     d24, [x17], 8
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        DUP     d22, v22.d[1]
        STR     d20, [x6], 8
        DUP     d20, v20.d[1]

9:
        TBZ     x1, 0, 10f
        STR     s30, [x7]
        STR     s28, [x13]
        STR     s26, [x14]
        STR     s24, [x17]
        STR     s22, [x16]
        STR     s20, [x6]
10:
        RET

END_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128
XNNPACK Teamb455b122019-09-27 18:10:33 -0700353
354#ifdef __ELF__
355.section ".note.GNU-stack","",%progbits
356#endif