blob: f12b3d08b9dd43f5ad4be692ac53d2f08616dcee [file] [log] [blame]
// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-ld128.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 8] -> x8
#
# Computes an (up to) 6-row by 8-column tile of clamped output per pass:
# the accumulators start from the 8 bias floats read from w, each float of A
# is multiplied (FMLA by lane) against 8 floats of B read from w, and the
# result is clamped with FMAX against v6 and FMIN against v7 before it is
# stored to c.  kc is a byte count (the loop consumes 16 bytes = 4 floats of
# each A row per iteration); nc is a column count consumed 8 at a time.
# Rows beyond mr alias the previous row's A and C pointers, so all 6 rows are
# always computed and stored without reading or writing out of range.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# NOTE: x18 is deliberately not used.  AAPCS64 designates it the platform
# register and some platforms reserve it, so c3 lives in x15 (a caller-saved
# temporary) instead.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x15 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128

        # Clamp A and C pointers
        // Each CMP feeds two groups of selects: LO (mr < n) for the odd row
        // and LS (mr <= n) for the following even row.  ADD does not touch
        // the flags, so they stay valid between the CMP and both CSEL groups.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x15, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x15, x17, x15, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x15, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x15, x13, LS   //   c4 = c3

        # Load params pointer
        LDR x8, [sp, 8]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load min/max values
        // v6 = first float of params (lower bound, applied with FMAX),
        // v7 = second float of params (upper bound, applied with FMIN).
        LD2R {v6.4s, v7.4s}, [x8]

        # Load cn_stride
        LDR x14, [sp]

0:
        // Outer loop: one pass per block of 8 output columns.
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x3]     // Prefetch A
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x9]
        MOV v28.16b, v20.16b
        PRFM PLDL1KEEP, [x10]
        MOV v29.16b, v21.16b
        PRFM PLDL1KEEP, [x11]
        MOV v30.16b, v20.16b
        PRFM PLDL1KEEP, [x12]
        MOV v31.16b, v21.16b
        PRFM PLDL1KEEP, [x4]

        # Is there at least 4 floats (16 bytes)?
        SUBS x0, x2, 16          // k = kc - 16
        B.LO 5f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 6 ld128 A + 4 LDP B
1:
        LDR q0, [x3], 16
        LDP q16, q17, [x5], 32
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v1.s[0]
        FMLA v24.4s, v16.4s, v2.s[0]
        FMLA v26.4s, v16.4s, v3.s[0]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v16.4s, v4.s[0]
        FMLA v30.4s, v16.4s, v5.s[0]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v1.s[0]
        FMLA v25.4s, v17.4s, v2.s[0]
        FMLA v27.4s, v17.4s, v3.s[0]
        FMLA v29.4s, v17.4s, v4.s[0]
        FMLA v31.4s, v17.4s, v5.s[0]

        FMLA v20.4s, v18.4s, v0.s[1]
        LDP q16, q17, [x5], 32
        FMLA v22.4s, v18.4s, v1.s[1]
        FMLA v24.4s, v18.4s, v2.s[1]
        FMLA v26.4s, v18.4s, v3.s[1]
        FMLA v28.4s, v18.4s, v4.s[1]
        FMLA v30.4s, v18.4s, v5.s[1]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v1.s[1]
        FMLA v25.4s, v19.4s, v2.s[1]
        FMLA v27.4s, v19.4s, v3.s[1]
        FMLA v29.4s, v19.4s, v4.s[1]
        FMLA v31.4s, v19.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        LDP q18, q19, [x5], 32
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        SUBS x0, x0, 16          // k -= 16; the FMLAs below leave the flags intact
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        B.HS 1b

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        // x0 has gone negative here, but its low 4 bits still equal kc % 16.
        TST x0, 15
        B.NE 5f

4:
        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8           // nc -= 8; flags are consumed by B.LO / B.HI below
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 7f                  // fewer than 8 columns left: partial store

        // Each C pointer advances by cn_stride; each A pointer is rewound by
        // kc bytes so the next column block re-reads the same rows of A.
        ST1 {v20.16b, v21.16b}, [x6], x14
        SUB x3, x3, x2           // a0 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x14
        SUB x9, x9, x2           // a1 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x14
        SUB x10, x10, x2         // a2 -= kc
        ST1 {v26.16b, v27.16b}, [x15], x14
        SUB x11, x11, x2         // a3 -= kc
        ST1 {v28.16b, v29.16b}, [x13], x14
        SUB x12, x12, x2         // a4 -= kc
        ST1 {v30.16b, v31.16b}, [x7], x14
        SUB x4, x4, x2           // a5 -= kc

        B.HI 0b                  // columns remain (flags still from SUBS x1)
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x3], 8
        LDP q16, q17, [x5], 32
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v1.s[0]
        FMLA v24.4s, v16.4s, v2.s[0]
        FMLA v26.4s, v16.4s, v3.s[0]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v16.4s, v4.s[0]
        FMLA v30.4s, v16.4s, v5.s[0]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v1.s[0]
        FMLA v25.4s, v17.4s, v2.s[0]
        FMLA v27.4s, v17.4s, v3.s[0]
        FMLA v29.4s, v17.4s, v4.s[0]
        FMLA v31.4s, v17.4s, v5.s[0]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v1.s[1]
        FMLA v24.4s, v18.4s, v2.s[1]
        FMLA v26.4s, v18.4s, v3.s[1]
        FMLA v28.4s, v18.4s, v4.s[1]
        FMLA v30.4s, v18.4s, v5.s[1]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v1.s[1]
        FMLA v25.4s, v19.4s, v2.s[1]
        FMLA v27.4s, v19.4s, v3.s[1]
        FMLA v29.4s, v19.4s, v4.s[1]
        FMLA v31.4s, v19.4s, v5.s[1]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 4b

        # Remainder- 1 float of A (4 bytes)
6:
        LDR s0, [x3], 4
        LDP q16, q17, [x5], 32
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v1.s[0]
        FMLA v24.4s, v16.4s, v2.s[0]
        FMLA v26.4s, v16.4s, v3.s[0]
        FMLA v28.4s, v16.4s, v4.s[0]
        FMLA v30.4s, v16.4s, v5.s[0]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v1.s[0]
        FMLA v25.4s, v17.4s, v2.s[0]
        FMLA v27.4s, v17.4s, v3.s[0]
        FMLA v29.4s, v17.4s, v4.s[0]
        FMLA v31.4s, v17.4s, v5.s[0]
        B 4b

        # Store odd width
        // x1 is negative here (nc - 8), but its low 3 bits still equal the
        // number of remaining columns, so bits 2/1/0 select 4/2/1-float stores.
7:
        TBZ x1, 2, 8f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b     // shift the upper 4 columns down for the next steps
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x15], 16
        MOV v26.16b, v27.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q30, [x7], 16
        MOV v30.16b, v31.16b

8:
        TBZ x1, 1, 9f
        STR d20, [x6], 8
        DUP d20, v20.d[1]        // move the upper 2 floats into the low half
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26, [x15], 8
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d30, [x7], 8
        DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        STR s20, [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x15]
        STR s28, [x13]
        STR s30, [x7]
10:
        RET

END_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128
// On ELF targets, emit an empty .note.GNU-stack section (the GNU toolchain's
// conventional marker that this object does not need an executable stack).
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif