// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

10#include <xnnpack/assembly.h>
11
# void xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                 (x0) - unused.  mr = 1
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t*restrict a,  x3
#     size_t a_stride,           (x4) - unused
#     const void*restrict w,     x5
#     uint8_t*restrict c,        x6
#     size_t cm_stride,          (x7) - unused
#     size_t cn_stride,          [sp] -> x14
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8

# Computes one row of C (12 floats per pass) as acc + A * B, clamps the
# result with the two values read from params, and stores it to c.
# kc is a byte count: the code consumes 16, 8 or 4 bytes of A per step.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is the AAPCS64 platform register and is reserved on some operating
# systems, so it is deliberately not used as a scratch register here.
# Only caller-saved registers are used, so nothing is spilled to the stack.

# A pointer
# x3  a0

# C pointer
# x6  c0

# Vector register usage and GPR shadows
# a0  v0           first set of A
# a0  v1           second set of A
# B   v2  v3  v4   x7  x10 x16  first set of B
# B   v5  v6  v7   x17 x11 x9
# B  v23 v24 v25   x7  x10 x16  second set of B (same x as first set)
# B  v17 v18 v19   x17 x11 x9
# C  v20 v21 v22
#
# Each B vector is loaded as two 64-bit halves: the low half directly into
# the d register and the high half into the GPR shadow, which is later
# merged with INS vN.d[1], xM.

BEGIN_FUNCTION xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53

        # Load cn_stride, acc
        LDP x14, x15, [sp]
        # Load params pointer
        LDR x8, [sp, 16]

        # Load clamping_params values
        # v30 = first float at params (upper bound, applied with FMIN)
        # v31 = second float at params (lower bound, applied with FMAX)
        LD2R {v30.4s, v31.4s}, [x8]

0:
        # Load initial accumulators (12 floats) and advance acc
        LD1 {v20.16b, v21.16b, v22.16b}, [x15], 48

        PRFM PLDL1KEEP, [x5]
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]

        # Are there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 3f

        # Prologue - loads for first group of 6 fma

        # Read first block of 1 A.
        LDR d0, [x3], 8    // a0

        LDR d2, [x5]       // vb0x0123
        LDR x7, [x5, 8]

        LDR d3, [x5, 16]   // vb0x4567
        LDR x10, [x5, 24]

        LDR d4, [x5, 32]   // vb0x89AB
        LDR x16, [x5, 40]

        LDR d5, [x5, 48]   // vb1x0123
        LDR x17, [x5, 56]

        LDR d6, [x5, 64]   // vb1x4567
        LDR x11, [x5, 72]

        LDR d7, [x5, 80]   // vb1x89AB
        LDR x9, [x5, 88]
        INS v2.d[1], x7
        ADD x5, x5, 96

        # Are there at least 4 more floats (16 bytes) for the main loop?
        SUBS x0, x0, 16
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
1:
        # First group of 6 fma.
        # A is loaded for 2nd group into v1

        # BLOCK 0
        LDR d1, [x3], 8          // a0
        INS v3.d[1], x10
        FMLA v20.4s, v2.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 96]

        # BLOCK 1
        INS v4.d[1], x16
        FMLA v21.4s, v3.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 128]

        # BLOCK 2
        LDR d23, [x5]            // vb0x0123
        INS v5.d[1], x17
        LDR x7, [x5, 8]
        FMLA v22.4s, v4.4s, v0.s[0]

        # BLOCK 3
        LDR d24, [x5, 16]        // vb0x4567
        INS v6.d[1], x11
        LDR x10, [x5, 24]

        # BLOCK 4
        LDR d25, [x5, 32]        // vb0x89AB
        INS v7.d[1], x9
        FMLA v20.4s, v5.4s, v0.s[1]
        LDR x16, [x5, 40]

        # BLOCK 5
        LDR d17, [x5, 48]        // vb1x0123
        LDR x17, [x5, 56]
        FMLA v21.4s, v6.4s, v0.s[1]

        # BLOCK 6
        LDR d18, [x5, 64]        // vb1x4567
        LDR x11, [x5, 72]
        FMLA v22.4s, v7.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]        // vb1x89AB
        INS v23.d[1], x7         // v23 was loaded in block 2
        LDR x9, [x5, 88]

        # Second group of 6 fma.
        # A is loaded for 1st group into v0

        # BLOCK 0
        LDR d0, [x3], 8          // a0
        INS v24.d[1], x10
        FMLA v20.4s, v23.4s, v1.s[0]

        # BLOCK 1
        INS v25.d[1], x16
        FMLA v21.4s, v24.4s, v1.s[0]

        # BLOCK 2
        LDR d2, [x5, 96]         // vb0x0123
        INS v17.d[1], x17
        LDR x7, [x5, 104]
        FMLA v22.4s, v25.4s, v1.s[0]

        # BLOCK 3
        LDR d3, [x5, 112]        // vb0x4567
        INS v18.d[1], x11
        LDR x10, [x5, 120]

        # BLOCK 4
        LDR d4, [x5, 128]        // vb0x89AB
        INS v19.d[1], x9
        FMLA v20.4s, v17.4s, v1.s[1]
        LDR x16, [x5, 136]

        # BLOCK 5
        LDR d5, [x5, 144]        // vb1x0123
        LDR x17, [x5, 152]
        FMLA v21.4s, v18.4s, v1.s[1]

        # BLOCK 6
        LDR d6, [x5, 160]        // vb1x4567
        LDR x11, [x5, 168]
        SUBS x0, x0, 16
        FMLA v22.4s, v19.4s, v1.s[1]

        # BLOCK 7
        LDR d7, [x5, 176]        // vb1x89AB
        INS v2.d[1], x7
        LDR x9, [x5, 184]
        ADD x5, x5, 192
        B.HS 1b

        # Epilogue
        # First block same as main loop.  Second block has no loads.
2:
        # BLOCK 0
        LDR d1, [x3], 8          // a0
        INS v3.d[1], x10
        FMLA v20.4s, v2.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 96]

        # BLOCK 1
        INS v4.d[1], x16
        FMLA v21.4s, v3.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 128]

        # BLOCK 2
        LDR d23, [x5]            // vb0x0123
        INS v5.d[1], x17
        LDR x7, [x5, 8]
        FMLA v22.4s, v4.4s, v0.s[0]

        # BLOCK 3
        LDR d24, [x5, 16]        // vb0x4567
        INS v6.d[1], x11
        LDR x10, [x5, 24]

        # BLOCK 4
        LDR d25, [x5, 32]        // vb0x89AB
        INS v7.d[1], x9
        FMLA v20.4s, v5.4s, v0.s[1]
        LDR x16, [x5, 40]

        # BLOCK 5
        LDR d17, [x5, 48]        // vb1x0123
        LDR x17, [x5, 56]
        FMLA v21.4s, v6.4s, v0.s[1]

        # BLOCK 6
        LDR d18, [x5, 64]        // vb1x4567
        LDR x11, [x5, 72]
        FMLA v22.4s, v7.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]        // vb1x89AB
        INS v23.d[1], x7         // v23 was loaded in block 2
        LDR x9, [x5, 88]
        ADD x5, x5, 96

        # Second group of 6 fma.  8 blocks of 4 cycles.
        # Epilogue version does no loads

        # BLOCK 0
        INS v24.d[1], x10
        FMLA v20.4s, v23.4s, v1.s[0]

        # BLOCK 1
        INS v25.d[1], x16
        FMLA v21.4s, v24.4s, v1.s[0]

        # BLOCK 2
        INS v17.d[1], x17
        FMLA v22.4s, v25.4s, v1.s[0]

        # BLOCK 3
        INS v18.d[1], x11

        # BLOCK 4
        INS v19.d[1], x9
        FMLA v20.4s, v17.4s, v1.s[1]

        # BLOCK 5
        FMLA v21.4s, v18.4s, v1.s[1]

        # BLOCK 6
        FMLA v22.4s, v19.4s, v1.s[1]

        # BLOCK 7
3:
        # x0 has only had multiples of 16 subtracted from kc, so bits 3 and 2
        # of x0 still equal bits 3 and 2 of kc.
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 5f
        # Is there a remainder?- 1 float of A (4 bytes)
        TBNZ x0, 2, 6f

4:
        # Clamp
        FMIN v20.4s, v20.4s, v30.4s
        # Flags set here are consumed by B.LO and B.HI below; none of the
        # instructions in between modify the flags.
        SUBS x1, x1, 12
        FMIN v21.4s, v21.4s, v30.4s
        FMIN v22.4s, v22.4s, v30.4s
        FMAX v20.4s, v20.4s, v31.4s
        FMAX v21.4s, v21.4s, v31.4s
        FMAX v22.4s, v22.4s, v31.4s

        # Store full 1 x 12
        B.LO 7f

        ST1 {v20.16b, v21.16b, v22.16b}, [x6], x14
        SUB x3, x3, x2  // a0 -= kc

        B.HI 0b

        RET

5:
        # Remainder - 2 floats of A (8 bytes)
        # Read first block of 1 A.
        LDR d0, [x3], 8   // a0
        LD1 {v2.16b, v3.16b, v4.16b}, [x5], 48
        LD1 {v5.16b, v6.16b, v7.16b}, [x5], 48

        # First block of 3 B
        FMLA v20.4s, v2.4s, v0.s[0]
        FMLA v21.4s, v3.4s, v0.s[0]
        FMLA v22.4s, v4.4s, v0.s[0]

        # Second block of 3 B
        FMLA v20.4s, v5.4s, v0.s[1]
        FMLA v21.4s, v6.4s, v0.s[1]
        FMLA v22.4s, v7.4s, v0.s[1]

        TBZ x0, 2, 4b
6:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x3], 4   // a0
        LD1 {v2.16b, v3.16b, v4.16b}, [x5], 48

        FMLA v20.4s, v2.4s, v0.s[0]
        FMLA v21.4s, v3.4s, v0.s[0]
        FMLA v22.4s, v4.4s, v0.s[0]
        B 4b

7:
        # Restore the remaining column count (fewer than 12) that the SUBS
        # at label 4 took below zero.
        ADD x1, x1, 12
        # Store odd channels
        # Bit 3 set: write 8 floats, then continue with v22.
        TBZ x1, 3, 8f
        STP q20, q21, [x6], 32
        MOV v20.16b, v22.16b

8:
        # Bit 2 set: write 4 floats, then continue with v21.
        TBZ x1, 2, 9f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b

9:
        # Bit 1 set: write 2 floats, then move the upper half down.
        TBZ x1, 1, 10f
        STR d20, [x6], 8
        DUP d20, v20.d[1]

10:
        # Bit 0 set: write the last float.
        TBZ x1, 0, 11f
        STR s20, [x6]
11:
        RET

END_FUNCTION xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53
343
// On ELF targets, emit an empty .note.GNU-stack section (no flags). By GNU
// toolchain convention this marks the object as not needing an executable
// stack.
#if defined(__ELF__)
.section ".note.GNU-stack", "", %progbits
#endif