// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
#include <xnnpack/assembly.h>
7
// void xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53(
//     size_t mr,                         x0
//     size_t nc,                         x1
//     size_t kc,                         x2 / x0
//     size_t ks,                         x3 / x9
//     const float**restrict a,           x4
//     const void*restrict w,             x5
//     uint8_t*restrict c,                x6
//     size_t cm_stride,                  x7
//     size_t cn_stride,                  [sp] -> x10
//     size_t a_offset,                   [sp + 8] -> x11
//     const float* zero,                 [sp + 16] -> x12
//     const xnn_f32_minmax_params params [sp + 24] -> x8
//
// Produces up to 6 rows x 8 columns of f32 output per pass of the nc loop:
//   - the 12 accumulators are initialised from 8 bias floats read from w,
//   - every pass of the ks loop consumes 6 A pointers (48 bytes of the
//     indirection buffer); a pointer equal to `zero` is used as is, any other
//     pointer is advanced by a_offset,
//   - for each of those passes, kc bytes of every A row are multiplied with
//     the packed B values that follow in w and accumulated,
//   - results are clamped with FMAX against v6 and FMIN against v7 and stored
//     through c0..c5, which then advance by cn_stride.
// Rows beyond mr alias the previous row's C pointer, so their stores are
// overwritten by the valid row (rows are stored from c5 down to c0).
// The remainder code only inspects bits 3 and 2 of k, so kc is presumably a
// multiple of sizeof(float) - verify against the callers.

// d8-d15 need to be preserved if used.
// x19-30 need to be preserved if used.
// x18 is the AAPCS64 platform register (reserved on Apple and Windows
// targets) and is deliberately left untouched.

// A pointers
// x14 a0
// x15 a1
// x20 a2
// x21 a3
// x22 a4
// x23 a5

// C pointers
// x6  c0
// x16 c1
// x17 c2
// x8  c3  (x8 holds the params pointer only until min/max are loaded)
// x13 c4
// x7  c5

// x19 temporary vector shadow register

// Vector register usage
// A0   v0     v3
// A1   v0[1]  v3[1]
// A2   v1     v4
// A3   v1[1]  v4[1]
// A4   v2     v5
// A5   v2[1]  v5[1]
// B   v12 v13 v14 v15 second set of B
// B   v16 v17 v18 v19 first set
// C   v20 v21
// C   v22 v23
// C   v24 v25
// C   v26 v27
// C   v28 v29
// C   v30 v31
// Clamp v6 v7
// unused A   v8 v9 v10 v11

// Stack frame (80 bytes, allocated after the stack arguments are read):
//   [sp,  0] d12, d13
//   [sp, 16] d14, d15
//   [sp, 32] x19, x20
//   [sp, 48] x21, x22
//   [sp, 64] x23

BEGIN_FUNCTION xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53

        // Stack arguments are read before sp is moved, so the offsets below
        // are relative to the caller's sp.

        // Load cn_stride, a_offset
        LDP x10, x11, [sp]

        // Load zero, clamping params pointer
        LDP x12, x8, [sp, 16]

        // Load min/max values
        LD2R {v6.4s, v7.4s}, [x8]
        // The params pointer is dead from here on; x8 is reused as c3 below.

        // Clamp C pointers
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x8, x17, x7          // c3 = c2 + cm_stride
        CSEL x8, x17, x8, LO     //   c3 = c2

        ADD x13, x8, x7          // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x13, x8, x13, LS    //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x7, x13, x7, LO     //   c5 = c4

        // Save x19-x23, d12-d15 on stack
        STP d12, d13, [sp, -80]!
        STP d14, d15, [sp, 16]
        STP x19, x20, [sp, 32]
        STP x21, x22, [sp, 48]
        STR x23, [sp, 64]

0:
        // Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 64]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v27.16b, v21.16b
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b

        MOV x9, x3  // p = ks

1:
        // Load next 6 A pointers
        LDP x14, x15, [x4], 16
        LDP x20, x21, [x4], 16
        LDP x22, x23, [x4], 16

        CMP x14, x12            // if a0 == zero
        ADD x14, x14, x11       // a0 += a_offset
        CSEL x14, x12, x14, EQ  //   a0 = zero, else a0 + a_offset
        CMP x15, x12            // if a1 == zero
        ADD x15, x15, x11       // a1 += a_offset
        CSEL x15, x12, x15, EQ  //   a1 = zero, else a1 + a_offset
        CMP x20, x12            // if a2 == zero
        ADD x20, x20, x11       // a2 += a_offset
        CSEL x20, x12, x20, EQ  //   a2 = zero, else a2 + a_offset
        CMP x21, x12            // if a3 == zero
        ADD x21, x21, x11       // a3 += a_offset
        CSEL x21, x12, x21, EQ  //   a3 = zero, else a3 + a_offset
        CMP x22, x12            // if a4 == zero
        ADD x22, x22, x11       // a4 += a_offset
        CSEL x22, x12, x22, EQ  //   a4 = zero, else a4 + a_offset
        CMP x23, x12            // if a5 == zero
        ADD x23, x23, x11       // a5 += a_offset
        CSEL x23, x12, x23, EQ  //   a5 = zero, else a5 + a_offset

        // Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        // Prologue - First group loads, no FMA
        LDR d0, [x14], 8              // a0
        LDP q16, q17, [x5], 32        // b
        LDR d1, [x20], 8              // a2
        LDR d2, [x22], 8              // a4
        LD1 {v0.d}[1], [x15], 8       // a1
        LD1 {v1.d}[1], [x21], 8       // a3
        LD1 {v2.d}[1], [x23], 8       // a5
        SUBS x0, x0, 16
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x19, [x5], 8              // ins is in BLOCK 0

        // Is there at least 4 floats (16 bytes) for main loop?
        B.LO 3f

        // Main loop - 4 floats of A (16 bytes)
        // 48 FMA + 12 LD64 A + 8 LDR B
2:
        // First group of 24 FMA, Second group loads
        // BLOCK 0
        LDR d3, [x14], 8              // a0
        INS v19.d[1], x19             // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x19, [x15], 8             // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        // BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x19              // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x19, [x5, 8]              // b
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]

        // BLOCK 2
        LDR d4, [x20], 8              // a2
        INS v12.d[1], x19             // b ins
        FMLA v21.4s, v17.4s, v0.s[0]
        LDR x19, [x21], 8             // a3
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]

        // BLOCK 3
        LDR d5, [x22], 8              // a4
        INS v4.d[1], x19              // a3 ins
        FMLA v27.4s, v17.4s, v1.s[2]
        LDR x19, [x23], 8             // a5
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]

        // BLOCK 4
        LDR d13, [x5, 16]
        INS v5.d[1], x19              // a5 ins
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR x19, [x5, 24]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]

        // BLOCK 5
        LDR d14, [x5, 32]
        INS v13.d[1], x19             // b
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR x19, [x5, 40]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v2.s[3]

        // BLOCK 6
        LDR d15, [x5, 48]
        INS v14.d[1], x19             // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x19, [x5, 56]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        // BLOCK 7
        INS v15.d[1], x19
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v31.4s, v19.4s, v2.s[3]

        // Second group of 24 FMA, First group of loads
        // BLOCK 0
        LDR d0, [x14], 8              // a0
        FMLA v20.4s, v12.4s, v3.s[0]
        LDR x19, [x15], 8             // a1
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]
        PRFM PLDL1KEEP, [x14, 128]    // Prefetch A0

        // BLOCK 1
        LDR d16, [x5, 64]
        INS v0.d[1], x19              // a1 ins
        FMLA v26.4s, v12.4s, v4.s[2]
        LDR x19, [x5, 72]             // b
        FMLA v28.4s, v12.4s, v5.s[0]
        FMLA v30.4s, v12.4s, v5.s[2]
        PRFM PLDL1KEEP, [x15, 128]    // Prefetch A1

        // BLOCK 2
        LDR d1, [x20], 8              // a2
        INS v16.d[1], x19             // b
        FMLA v21.4s, v13.4s, v3.s[0]
        LDR x19, [x21], 8             // a3
        FMLA v23.4s, v13.4s, v3.s[2]
        FMLA v25.4s, v13.4s, v4.s[0]
        PRFM PLDL1KEEP, [x20, 128]    // Prefetch A2

        // BLOCK 3
        LDR d2, [x22], 8              // a4
        INS v1.d[1], x19              // a3 ins
        FMLA v27.4s, v13.4s, v4.s[2]
        LDR x19, [x23], 8             // a5
        FMLA v29.4s, v13.4s, v5.s[0]
        FMLA v31.4s, v13.4s, v5.s[2]
        PRFM PLDL1KEEP, [x21, 128]    // Prefetch A3

        // BLOCK 4
        LDR d17, [x5, 80]
        INS v2.d[1], x19              // a5 ins
        FMLA v20.4s, v14.4s, v3.s[1]
        LDR x19, [x5, 88]
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]
        PRFM PLDL1KEEP, [x22, 128]    // Prefetch A4

        // BLOCK 5
        LDR d18, [x5, 96]
        INS v17.d[1], x19             // b
        FMLA v26.4s, v14.4s, v4.s[3]
        LDR x19, [x5, 104]
        FMLA v28.4s, v14.4s, v5.s[1]
        FMLA v30.4s, v14.4s, v5.s[3]
        PRFM PLDL1KEEP, [x23, 128]    // Prefetch A5

        // BLOCK 6
        LDR d19, [x5, 112]
        INS v18.d[1], x19             // b
        FMLA v21.4s, v15.4s, v3.s[1]
        LDR x19, [x5, 120]
        FMLA v23.4s, v15.4s, v3.s[3]
        PRFM PLDL1KEEP, [x5, 192]     // Prefetch B
        FMLA v25.4s, v15.4s, v4.s[1]
        PRFM PLDL1KEEP, [x5, 256]     // Prefetch B

        // BLOCK 7
        SUBS x0, x0, 16               // LDR lands here
        FMLA v27.4s, v15.4s, v4.s[3]
        FMLA v29.4s, v15.4s, v5.s[1]
        ADD x5, x5, 128
        FMLA v31.4s, v15.4s, v5.s[3]
        B.HS 2b

        // Epilogue - 4 floats of A (16 bytes)
        // 48 FMA + 6 LD64 A + 4 LDR B
3:
        // First group of 24 FMA, Second group loads
        // BLOCK 0
        LDR d3, [x14], 8              // a0
        INS v19.d[1], x19             // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x19, [x15], 8             // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        PRFM PSTL1KEEP, [x6]          // Prefetch C0

        // BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x19              // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x19, [x5, 8]              // b
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]
        PRFM PSTL1KEEP, [x16]         // Prefetch C1

        // BLOCK 2
        LDR d4, [x20], 8              // a2
        INS v12.d[1], x19             // b ins
        FMLA v21.4s, v17.4s, v0.s[0]
        LDR x19, [x21], 8             // a3
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        PRFM PSTL1KEEP, [x17]         // Prefetch C2

        // BLOCK 3
        LDR d5, [x22], 8              // a4
        INS v4.d[1], x19              // a3 ins
        FMLA v27.4s, v17.4s, v1.s[2]
        LDR x19, [x23], 8             // a5
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]
        PRFM PSTL1KEEP, [x8]          // Prefetch C3

        // BLOCK 4
        LDR d13, [x5, 16]
        INS v5.d[1], x19              // a5 ins
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR x19, [x5, 24]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        PRFM PSTL1KEEP, [x13]         // Prefetch C4

        // BLOCK 5
        LDR d14, [x5, 32]
        INS v13.d[1], x19             // b
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR x19, [x5, 40]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v2.s[3]
        PRFM PSTL1KEEP, [x7]          // Prefetch C5

        // BLOCK 6
        LDR d15, [x5, 48]
        INS v14.d[1], x19             // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x19, [x5, 56]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        // BLOCK 7
        INS v15.d[1], x19             // b from previous
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v31.4s, v19.4s, v2.s[3]

        // Second group of 24 FMA, no loads
        // BLOCK 0
        FMLA v20.4s, v12.4s, v3.s[0]
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]

        // BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        FMLA v28.4s, v12.4s, v5.s[0]
        FMLA v30.4s, v12.4s, v5.s[2]

        // BLOCK 2
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]
        FMLA v25.4s, v13.4s, v4.s[0]

        // BLOCK 3
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v29.4s, v13.4s, v5.s[0]
        FMLA v31.4s, v13.4s, v5.s[2]

        // BLOCK 4
        FMLA v20.4s, v14.4s, v3.s[1]
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]

        // BLOCK 5
        FMLA v26.4s, v14.4s, v4.s[3]
        FMLA v28.4s, v14.4s, v5.s[1]
        FMLA v30.4s, v14.4s, v5.s[3]

        // BLOCK 6
        FMLA v21.4s, v15.4s, v3.s[1]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        TST x0, 15                    // flags consumed by B.NE below

        // BLOCK 7
        FMLA v27.4s, v15.4s, v4.s[3]
        FMLA v29.4s, v15.4s, v5.s[1]
        FMLA v31.4s, v15.4s, v5.s[3]
        ADD x5, x5, 64                // does not alter the TST flags

        // Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

4:
        // ks loop
        SUBS x9, x9, 48  // ks -= MR * sizeof(void*)
        B.HI 1b

        // Clamp
        FMAX v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8                // nc -= 8; flags used by B.LO / B.HI below
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        // Store full 6 x 8
        B.LO 8f

        STP q30, q31, [x7]
        ADD x7, x7, x10
        STP q28, q29, [x13]
        ADD x13, x13, x10
        STP q26, q27, [x8]
        ADD x8, x8, x10
        STP q24, q25, [x17]
        ADD x17, x17, x10
        STP q22, q23, [x16]
        ADD x16, x16, x10
        STP q20, q21, [x6]
        ADD x6, x6, x10

        SUB x4, x4, x3  // a -= ks

        // nc loop
        B.HI 0b

        // Restore x19-x23, d12-d15 from stack
        LDR x23, [sp, 64]
        LDP x21, x22, [sp, 48]
        LDP x19, x20, [sp, 32]
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 80
        RET

5:
        // Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        // Remainder- 2 floats of A (8 bytes)
        LDR d0, [x14], 8
        LDR q16, [x5], 16
        LD1 {v0.d}[1], [x15], 8
        LDR d1, [x20], 8
        LD1 {v1.d}[1], [x21], 8
        LDR d2, [x22], 8
        LD1 {v2.d}[1], [x23], 8
        LDR q17, [x5], 16
        LDR q18, [x5], 16
        LDR q19, [x5], 16
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v2.s[3]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v31.4s, v19.4s, v2.s[3]

        // Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 4b
6:
        // Remainder- 1 floats of A (4 bytes)
        LDR s0, [x14], 4
        LDR q16, [x5], 16
        LD1 {v0.s}[2], [x15], 4
        LDR s1, [x20], 4
        LD1 {v1.s}[2], [x21], 4
        LDR s2, [x22], 4
        LD1 {v2.s}[2], [x23], 4
        LDR q17, [x5], 16

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]
        B 4b

        // Store odd width
8:
        // 4 columns: store the low vector of each row, then shift the high
        // vector down so the narrower stores below see the next columns.
        TBZ x1, 2, 9f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x8], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
9:
        // 2 columns
        TBZ x1, 1, 10f
        STR d30, [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26, [x8], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

10:
        // 1 column
        TBZ x1, 0, 11f
        STR s30, [x7]
        STR s28, [x13]
        STR s26, [x8]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
11:
        // Restore x19-x23, d12-d15 from stack
        LDR x23, [sp, 64]
        LDP x21, x22, [sp, 48]
        LDP x19, x20, [sp, 32]
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 80
        RET

END_FUNCTION xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53
599
// On ELF targets emit an empty .note.GNU-stack section so the GNU toolchain
// does not mark the stack of objects linked with this file as executable.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif