// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                         (x0) - unused.  mr = 1
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  (x7) - unused
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointer
# x8  a0

# C pointer
# x6  c0

# Vector register usage and GPR shadows
# a0  v0              first set of A
# a0  v1              second set of A
# B   v2  v3  v4      x14 x15 x16  first set of B
# B   v5  v6  v7      x17 x13 x7
# B   v23 v24 v25     x14 x15 x16  second set of B (same x as first set)
# B   v17 v18 v19     x17 x13 x7
# C   v20 v21 v22

# Overview of the kernel body below.
#
# For each tile of up to 12 output columns (label 0):
#   - v20-v22 are initialised with 12 floats read from w (the bias).
#   - For each of the ks / 8 entries of the indirection array a (label 1):
#       a0 is the loaded pointer plus a_offset, unless the pointer equals
#       `zero`, in which case it is used unmodified.
#       kc bytes of a0 are consumed, multiplying each float of A by the
#       matching 12 floats of w and accumulating into v20-v22.
#   - The accumulators are clamped to [v30, v31] and stored to c.
#
# Register roles inside the body:
#   x0  remaining kc byte counter (k); the incoming mr argument is not read
#   x9  remaining ks byte counter (p)
#   x8  params pointer on entry, then reused as the a0 pointer
#   x7, x13-x17  GPR shadows holding the upper 64 bits of B vectors, so that
#                each 128-bit B vector is assembled from LDR d + LDR x + INS
#   v30 / v31    first / second float of params, broadcast to all lanes
#
# The instruction order inside the BLOCK groups is deliberate; do not reorder.

BEGIN_FUNCTION xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53

        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, clamping params pointer
        LDP x12, x8, [sp, 16]

        # Load min/max values (v30 is the FMAX operand, v31 the FMIN operand)
        LD2R {v30.4s, v31.4s}, [x8]

0:
        # Load initial bias from w into accumulators
        LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48

        PRFM PLDL1KEEP, [x5]
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]
        PRFM PLDL1KEEP, [x5, 256]
        PRFM PLDL1KEEP, [x5, 320]

        MOV x9, x3  // p = ks

1:
        # Load next A pointer
        LDR x8, [x4], 8

        CMP x8, x12           // if a0 == zero
        ADD x8, x8, x11       // a0 += a_offset
        CSEL x8, x12, x8, EQ  //   a0 = zero if a0 was zero, else a0 + a_offset

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # Prologue - loads for first group of 6 fma

        # Read first block of 1 A.
        LDR d0, [x8], 8     // a0

        LDR d2, [x5]        // vb0x0123
        LDR x14, [x5, 8]

        LDR d3, [x5, 16]    // vb0x4567
        LDR x15, [x5, 24]

        LDR d4, [x5, 32]    // vb0x89AB
        LDR x16, [x5, 40]

        LDR d5, [x5, 48]    // vb1x0123
        LDR x17, [x5, 56]

        LDR d6, [x5, 64]    // vb1x4567
        LDR x13, [x5, 72]

        LDR d7, [x5, 80]    // vb1x89AB
        LDR x7, [x5, 88]
        INS v2.d[1], x14
        ADD x5, x5, 96

        # Is there at least 4 floats (16 bytes) for main loop?
        SUBS x0, x0, 16  // 4 floats for main loop
        B.LO 3f

        # Main loop - 4 floats of A (16 bytes)
2:
        # First group of 6 fma.
        # A is loaded for 2nd group into v1

        # BLOCK 0
        LDR d1, [x8], 8          // a0
        INS v3.d[1], x15
        FMLA v20.4s, v2.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 192]

        # BLOCK 1
        INS v4.d[1], x16
        FMLA v21.4s, v3.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 256]

        # BLOCK 2
        LDR d23, [x5]            // vb0x0123
        INS v5.d[1], x17
        LDR x14, [x5, 8]
        PRFM PLDL1KEEP, [x5, 320]
        FMLA v22.4s, v4.4s, v0.s[0]

        # BLOCK 3
        LDR d24, [x5, 16]        // vb0x4567
        INS v6.d[1], x13
        LDR x15, [x5, 24]

        # BLOCK 4
        LDR d25, [x5, 32]        // vb0x89AB
        INS v7.d[1], x7
        FMLA v20.4s, v5.4s, v0.s[1]
        LDR x16, [x5, 40]

        # BLOCK 5
        LDR d17, [x5, 48]        // vb1x0123
        LDR x17, [x5, 56]
        FMLA v21.4s, v6.4s, v0.s[1]

        # BLOCK 6
        LDR d18, [x5, 64]        // vb1x4567
        LDR x13, [x5, 72]
        FMLA v22.4s, v7.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]        // vb1x89AB
        INS v23.d[1], x14        // v23 was loaded in block 2
        LDR x7, [x5, 88]

        # Second group of 6 fma.
        # A is loaded for 1st group into v0

        # BLOCK 0
        LDR d0, [x8], 8          // a0
        INS v24.d[1], x15
        FMLA v20.4s, v23.4s, v1.s[0]

        # BLOCK 1
        INS v25.d[1], x16
        FMLA v21.4s, v24.4s, v1.s[0]

        # BLOCK 2
        LDR d2, [x5, 96]         // vb0x0123
        INS v17.d[1], x17
        LDR x14, [x5, 104]
        FMLA v22.4s, v25.4s, v1.s[0]

        # BLOCK 3
        LDR d3, [x5, 112]        // vb0x4567
        INS v18.d[1], x13
        LDR x15, [x5, 120]

        # BLOCK 4
        LDR d4, [x5, 128]        // vb0x89AB
        INS v19.d[1], x7
        FMLA v20.4s, v17.4s, v1.s[1]
        LDR x16, [x5, 136]

        # BLOCK 5
        LDR d5, [x5, 144]        // vb1x0123
        LDR x17, [x5, 152]
        FMLA v21.4s, v18.4s, v1.s[1]

        # BLOCK 6
        LDR d6, [x5, 160]        // vb1x4567
        LDR x13, [x5, 168]
        SUBS x0, x0, 16          // k -= 16; flags are consumed by B.HS below
        FMLA v22.4s, v19.4s, v1.s[1]

        # BLOCK 7
        LDR d7, [x5, 176]        // vb1x89AB
        INS v2.d[1], x14
        LDR x7, [x5, 184]
        ADD x5, x5, 192
        B.HS 2b

        # Epilogue
        # First block same as main loop.  Second block has no loads.
3:
        # BLOCK 0
        LDR d1, [x8], 8          // a0
        INS v3.d[1], x15
        FMLA v20.4s, v2.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 192]

        # BLOCK 1
        INS v4.d[1], x16
        FMLA v21.4s, v3.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 256]

        # BLOCK 2
        LDR d23, [x5]            // vb0x0123
        INS v5.d[1], x17
        LDR x14, [x5, 8]
        PRFM PLDL1KEEP, [x5, 320]
        FMLA v22.4s, v4.4s, v0.s[0]

        # BLOCK 3
        LDR d24, [x5, 16]        // vb0x4567
        INS v6.d[1], x13
        LDR x15, [x5, 24]

        # BLOCK 4
        LDR d25, [x5, 32]        // vb0x89AB
        INS v7.d[1], x7
        FMLA v20.4s, v5.4s, v0.s[1]
        LDR x16, [x5, 40]

        # BLOCK 5
        LDR d17, [x5, 48]        // vb1x0123
        LDR x17, [x5, 56]
        FMLA v21.4s, v6.4s, v0.s[1]

        # BLOCK 6
        LDR d18, [x5, 64]        // vb1x4567
        LDR x13, [x5, 72]
        FMLA v22.4s, v7.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]        // vb1x89AB
        INS v23.d[1], x14        // v23 was loaded in block 2
        LDR x7, [x5, 88]
        ADD x5, x5, 96

        # Second group of 6 fma.  8 blocks of 4 cycles.
        # Epilogue version does no loads

        # BLOCK 0
        INS v24.d[1], x15
        FMLA v20.4s, v23.4s, v1.s[0]

        # BLOCK 1
        INS v25.d[1], x16
        FMLA v21.4s, v24.4s, v1.s[0]

        # BLOCK 2
        INS v17.d[1], x17
        FMLA v22.4s, v25.4s, v1.s[0]

        # BLOCK 3
        INS v18.d[1], x13

        # BLOCK 4
        INS v19.d[1], x7
        FMLA v20.4s, v17.4s, v1.s[1]
        TST x0, 15               // low 4 bits of k = leftover bytes of A (0, 4, 8 or 12)

        # BLOCK 5
        FMLA v21.4s, v18.4s, v1.s[1]

        # BLOCK 6
        FMLA v22.4s, v19.4s, v1.s[1]

        # BLOCK 7
        # Is there a remainder?- 2 floats of A (8 bytes) or less
        # (flags come from the TST in block 4; FMLA does not modify them)
        B.NE 5f

4:
        # ks loop
        SUBS x9, x9, 8  // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        FMAX v20.4s, v20.4s, v30.4s
        FMAX v21.4s, v21.4s, v30.4s
        FMAX v22.4s, v22.4s, v30.4s
        FMIN v20.4s, v20.4s, v31.4s
        FMIN v21.4s, v21.4s, v31.4s
        FMIN v22.4s, v22.4s, v31.4s

        # Store full 1 x 12
        SUBS x1, x1, 12
        B.LO 8f

        ST1 {v20.16b, v21.16b, v22.16b}, [x6], x10
        SUB x4, x4, x3  // a -= ks

        # nc loop
        # (flags still come from SUBS x1 above; ST1 and SUB leave them unchanged)
        B.HI 0b
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x8], 8     // a0
        LD1 {v2.16b, v3.16b, v4.16b}, [x5], 48
        LD1 {v5.16b, v6.16b, v7.16b}, [x5], 48

        # First block of 3 B
        FMLA v20.4s, v2.4s, v0.s[0]
        FMLA v21.4s, v3.4s, v0.s[0]
        FMLA v22.4s, v4.4s, v0.s[0]

        # Second block of 3 B
        FMLA v20.4s, v5.4s, v0.s[1]
        FMLA v21.4s, v6.4s, v0.s[1]
        FMLA v22.4s, v7.4s, v0.s[1]

        # Bit 2 of k clear means no single trailing float is left
        TBZ x0, 2, 4b
6:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x8], 4     // a0
        LD1 {v2.16b, v3.16b, v4.16b}, [x5], 48

        FMLA v20.4s, v2.4s, v0.s[0]
        FMLA v21.4s, v3.4s, v0.s[0]
        FMLA v22.4s, v4.4s, v0.s[0]
        B 4b

8:
        # Undo the SUBS above so that x1 is the number of columns left (under 12)
        ADD x1, x1, 12
        # Store odd channels
        # Bits 3, 2, 1, 0 of x1 select stores of 8, 4, 2 and 1 floats in turn.
        # After each store the next unstored floats are moved into v20.
        TBZ x1, 3, 9f
        STP q20, q21, [x6]
        ADD x6, x6, 32
        MOV v20.16b, v22.16b

9:
        TBZ x1, 2, 10f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b

10:
        TBZ x1, 1, 11f
        STR d20, [x6], 8
        DUP d20, v20.d[1]

11:
        TBZ x1, 0, 12f
        STR s20, [x6]
12:
        RET

END_FUNCTION xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53
// On ELF targets, emit an empty .note.GNU-stack section for this object file.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif