// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

// Project header; BEGIN_FUNCTION / END_FUNCTION used below are not defined in this file.
#include <xnnpack/assembly.h>

# void xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 8] -> x8

# Overview (as implemented below):
#   For each tile of up to 12 output columns the kernel loads 12 bias floats
#   from w into v20-v22, accumulates a0[k] * B[k][0..11] for every float of A,
#   clamps the result between v30 and v31 and stores it to c.
#   kc is a byte count (a0 is rewound with "a0 -= kc" after each tile).
#   x0 (the unused mr argument) is reused as the remaining-k byte counter and
#   x7 (the unused cm_stride argument) as a scratch register for B.
#   NOTE(review): kc is presumably a non-zero multiple of 4 bytes - the code
#   has no kc == 0 path; confirm against the callers.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# (Neither range is touched by this kernel.)

# A pointer
# x3  a0

# C pointer
# x6  c0

# Vector register usage and GPR shadows
# a0  v0           first set of A
# a0  v1           second set of A
# B   v2  v3  v4    x7 x10 x16  first set of B
# B   v5  v6  v7   x17 x13  x9
# B  v23 v24 v25    x7 x10 x16  second set of B (same x as first set)
# B  v17 v18 v19   x17 x13  x9
# C  v20 v21 v22
# Clamp v30 (lower bound, applied with FMAX)  v31 (upper bound, applied with FMIN)
#
# Each 128-bit B vector is built from two 64-bit loads: LDR dN fills the low
# half and LDR xM + INS vN.d[1], xM fills the high half one step later.
# NOTE(review): presumably chosen to suit Cortex-A53 issue rules - see the template.

BEGIN_FUNCTION xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53

        # Load cn_stride, params pointer
        LDP x14, x8, [sp]

        # Load min/max values
        LD2R {v30.4s, v31.4s}, [x8]

        // Outer loop: one iteration per tile of 12 output columns.
0:
        # Load initial bias from w into accumulators
        LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48

        PRFM PLDL1KEEP, [x5]
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f          // fewer than 4 floats: remainder code only

        # Prologue - loads for first group of 6 fma

        # Read first block of 1 A.
        LDR d0, [x3], 8  // a0

        LDR d2, [x5]       // vb0x0123
        LDR x7, [x5, 8]

        LDR d3, [x5, 16]   // vb0x4567
        LDR x10, [x5, 24]

        LDR d4, [x5, 32]   // vb0x89AB
        LDR x16, [x5, 40]

        LDR d5, [x5, 48]   // vb1x0123
        LDR x17, [x5, 56]

        LDR d6, [x5, 64]   // vb1x4567
        LDR x13, [x5, 72]

        LDR d7, [x5, 80]   // vb1x89AB
        LDR x9, [x5, 88]
        INS v2.d[1], x7
        ADD x5, x5, 96

        # Is there at least 4 floats (16 bytes) for main loop?
        SUBS x0, x0, 16
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
1:
        # First group of 6 fma.
        # A is loaded for 2nd group into v1

        # BLOCK 0
        LDR d1, [x3], 8          // a0
        INS v3.d[1], x10
        FMLA v20.4s, v2.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 96]

        # BLOCK 1
        INS v4.d[1], x16
        FMLA v21.4s, v3.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 128]

        # BLOCK 2
        LDR d23, [x5]            // vb0x0123
        INS v5.d[1], x17
        LDR x7, [x5, 8]
        FMLA v22.4s, v4.4s, v0.s[0]

        # BLOCK 3
        LDR d24, [x5, 16]        // vb0x4567
        INS v6.d[1], x13
        LDR x10, [x5, 24]

        # BLOCK 4
        LDR d25, [x5, 32]        // vb0x89AB
        INS v7.d[1], x9
        FMLA v20.4s, v5.4s, v0.s[1]
        LDR x16, [x5, 40]

        # BLOCK 5
        LDR d17, [x5, 48]        // vb1x0123
        LDR x17, [x5, 56]
        FMLA v21.4s, v6.4s, v0.s[1]

        # BLOCK 6
        LDR d18, [x5, 64]        // vb1x4567
        LDR x13, [x5, 72]
        FMLA v22.4s, v7.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]        // vb1x89AB
        INS v23.d[1], x7         // v23 was loaded in block 2
        LDR x9, [x5, 88]

        # Second group of 6 fma.
        # A is loaded for 1st group into v0

        # BLOCK 0
        LDR d0, [x3], 8          // a0
        INS v24.d[1], x10
        FMLA v20.4s, v23.4s, v1.s[0]

        # BLOCK 1
        INS v25.d[1], x16
        FMLA v21.4s, v24.4s, v1.s[0]

        # BLOCK 2
        LDR d2, [x5, 96]         // vb0x0123
        INS v17.d[1], x17
        LDR x7, [x5, 104]
        FMLA v22.4s, v25.4s, v1.s[0]

        # BLOCK 3
        LDR d3, [x5, 112]        // vb0x4567
        INS v18.d[1], x13
        LDR x10, [x5, 120]

        # BLOCK 4
        LDR d4, [x5, 128]        // vb0x89AB
        INS v19.d[1], x9
        FMLA v20.4s, v17.4s, v1.s[1]
        LDR x16, [x5, 136]

        # BLOCK 5
        LDR d5, [x5, 144]        // vb1x0123
        LDR x17, [x5, 152]
        FMLA v21.4s, v18.4s, v1.s[1]

        # BLOCK 6
        LDR d6, [x5, 160]        // vb1x4567
        LDR x13, [x5, 168]
        SUBS x0, x0, 16          // flags are consumed by B.HS at the loop end
        FMLA v22.4s, v19.4s, v1.s[1]

        # BLOCK 7
        LDR d7, [x5, 176]        // vb1x89AB
        INS v2.d[1], x7
        LDR x9, [x5, 184]
        ADD x5, x5, 192
        B.HS 1b

        # Epilogue
        # First block same as main loop.  Second block has no loads.
2:
        # BLOCK 0
        LDR d1, [x3], 8          // a0
        INS v3.d[1], x10
        FMLA v20.4s, v2.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 96]

        # BLOCK 1
        INS v4.d[1], x16
        FMLA v21.4s, v3.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 128]

        # BLOCK 2
        LDR d23, [x5]            // vb0x0123
        INS v5.d[1], x17
        LDR x7, [x5, 8]
        FMLA v22.4s, v4.4s, v0.s[0]

        # BLOCK 3
        LDR d24, [x5, 16]        // vb0x4567
        INS v6.d[1], x13
        LDR x10, [x5, 24]

        # BLOCK 4
        LDR d25, [x5, 32]        // vb0x89AB
        INS v7.d[1], x9
        FMLA v20.4s, v5.4s, v0.s[1]
        LDR x16, [x5, 40]

        # BLOCK 5
        LDR d17, [x5, 48]        // vb1x0123
        LDR x17, [x5, 56]
        FMLA v21.4s, v6.4s, v0.s[1]

        # BLOCK 6
        LDR d18, [x5, 64]        // vb1x4567
        LDR x13, [x5, 72]
        FMLA v22.4s, v7.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]        // vb1x89AB
        INS v23.d[1], x7         // v23 was loaded in block 2
        LDR x9, [x5, 88]
        ADD x5, x5, 96

        # Second group of 6 fma.  8 blocks of 4 cycles.
        # Epilogue version does no loads

        # BLOCK 0
        INS v24.d[1], x10
        FMLA v20.4s, v23.4s, v1.s[0]

        # BLOCK 1
        INS v25.d[1], x16
        FMLA v21.4s, v24.4s, v1.s[0]

        # BLOCK 2
        INS v17.d[1], x17
        FMLA v22.4s, v25.4s, v1.s[0]

        # BLOCK 3
        INS v18.d[1], x13

        # BLOCK 4
        INS v19.d[1], x9
        FMLA v20.4s, v17.4s, v1.s[1]
        TST x0, 15               // low 4 bits of k = leftover bytes of A

        # BLOCK 5
        FMLA v21.4s, v18.4s, v1.s[1]

        # BLOCK 6
        FMLA v22.4s, v19.4s, v1.s[1]

        # BLOCK 7
        # Is there a remainder? - 2 floats of A (8 bytes) or less
        B.NE 5f                  // uses the flags set by TST above

4:
        # Clamp
        FMAX v20.4s, v20.4s, v30.4s
        SUBS x1, x1, 12          // nc -= 12; flags feed B.LO and B.HI below
        FMAX v21.4s, v21.4s, v30.4s
        FMAX v22.4s, v22.4s, v30.4s
        FMIN v20.4s, v20.4s, v31.4s
        FMIN v21.4s, v21.4s, v31.4s
        FMIN v22.4s, v22.4s, v31.4s

        # Store full 1 x 12
        B.LO 7f                  // fewer than 12 columns left: partial store

        ST1 {v20.16b, v21.16b, v22.16b}, [x6], x14
        SUB x3, x3, x2           // a0 -= kc
        B.HI 0b                  // more columns remain: next tile
        RET

5:
        # Is there a remainder? - 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder - 2 floats of A (8 bytes)
        # Read first block of 1 A.
        LDR d0, [x3], 8          // a0
        LD1 {v2.16b, v3.16b, v4.16b}, [x5], 48
        LD1 {v5.16b, v6.16b, v7.16b}, [x5], 48

        # First block of 3 B
        FMLA v20.4s, v2.4s, v0.s[0]
        FMLA v21.4s, v3.4s, v0.s[0]
        FMLA v22.4s, v4.4s, v0.s[0]

        # Second block of 3 B
        FMLA v20.4s, v5.4s, v0.s[1]
        FMLA v21.4s, v6.4s, v0.s[1]
        FMLA v22.4s, v7.4s, v0.s[1]

        TBZ x0, 2, 4b            // no single trailing float: go clamp
6:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x3], 4          // a0
        LD1 {v2.16b, v3.16b, v4.16b}, [x5], 48

        FMLA v20.4s, v2.4s, v0.s[0]
        FMLA v21.4s, v3.4s, v0.s[0]
        FMLA v22.4s, v4.4s, v0.s[0]
        B 4b

7:
        ADD x1, x1, 12           // undo the SUBS: x1 = columns left (< 12)
        # Store odd channels
        // Each set bit of x1 stores 8/4/2/1 floats, then the unwritten
        // data is moved down into v20 for the next step.
        TBZ x1, 3, 8f
        STP q20, q21, [x6], 32
        MOV v20.16b, v22.16b

8:
        TBZ x1, 2, 9f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b

9:
        TBZ x1, 1, 10f
        STR d20, [x6], 8
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s20, [x6]
11:
        RET

END_FUNCTION xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53

// On ELF targets, emit an empty .note.GNU-stack section so the linker does
// not mark the stack executable because of this object.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif