// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_output_params params [sp + 24] -> x8

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
# x13 a0
# x14 a1
# x15 a2
#  x8 a3

# C pointers
#  x6 c0
# x16 c1
# x17 c2
#  x7 c3

# x19 temporary vector shadow register

# Vector register usage
# A0  v0     v3
# A1  v0[1]  v3[1]
# A2  v1     v4
# A3  v1[1]  v4[1]

# B   v12 v13 v14 v15  second set of B
# B   v16 v17 v18 v19  first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# Clamp v6 v7

# NOTE(review): the entries below do not match this 4-row kernel. Its code
# uses x12 as zero, x4 as a, x13 as a0 and x7 as c3, and never touches
# v2, v5, v8-v11 or v28-v31. They look like leftovers from a 6-row variant;
# confirm and remove.
# unused A   v8 v9 v10 v11
# x12 a4
#  x4 a5
# x13 c4
#  x7 c5
# A4  v2     v5
# A5  v2[1]  v5[1]
# C   v28 v29
# C   v30 v31

// f32 indirect-GEMM micro-kernel producing up to 4 rows x 8 columns of C per
// pass of the nc loop.
//
// For every group of 8 output columns the accumulators v20-v27 are seeded from
// the first 32 bytes at w, then for each set of 4 A pointers read from `a`
// (ks bytes of pointers in total) kc bytes of A are multiplied against the
// packed B data that follows in w.  Results are clamped with the two floats
// read from params and stored through c0..c3.
//
// Register roles inside the body:
//   x0  k  : remaining kc byte counter (mr is only needed for the C clamps)
//   x9  p  : remaining ks byte counter
//   x19    : 64-bit staging register; the upper half of a vector register is
//            loaded into x19 and merged with INS so every load is 64 bits wide
//   x8     : params pointer until LD2R, afterwards the a3 pointer
//
// Stack frame (48 bytes): [sp] d12,d13  [sp+16] d14,d15  [sp+32] x19 (stored
// twice by the STP).  The stack-passed arguments are read before the frame is
// pushed, so their offsets are relative to the entry sp.
BEGIN_FUNCTION xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53

        # Clamp C pointers: rows beyond mr alias the previous row
        CMP x0, 2                   // if mr < 2
        ADD x16, x6, x7             // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO       //   c1 = c0

        ADD x17, x16, x7            // c2 = c1 + cm_stride
                                    // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x17, x16, x17, LS      //   c2 = c1

        CMP x0, 4                   // if mr < 4
        ADD x7, x17, x7             // c3 = c2 + cm_stride
        CSEL x7, x17, x7, LO        //   c3 = c2

        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, clamping params pointer
        LDP x12, x8, [sp, 16]

        # Load clamping_params values
        # v6 = first float (upper bound, applied with FMIN)
        # v7 = second float (lower bound, applied with FMAX)
        LD2R {v6.4s, v7.4s}, [x8]

        // Save x19, d12-d15 on stack
        STP d12, d13, [sp, -48]!
        STP d14, d15, [sp, 16]
        STP x19, x19, [sp, 32]      // x19 written to both slots; only [sp, 32] is reloaded

0:
        # Load initial bias from w into accumulators
        # NOTE(review): x13-x15 are not loaded until label 1, and on the first
        # pass x8 still holds the params pointer, so the A prefetches below use
        # stale addresses.  PRFM is only a hint, so results should be
        # unaffected - confirm whether these are intended.
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x13, 0]    // Prefetch A
        PRFM PLDL1KEEP, [x13, 64]
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x14, 0]
        PRFM PLDL1KEEP, [x14, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x15, 0]
        PRFM PLDL1KEEP, [x15, 64]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x8, 0]
        PRFM PLDL1KEEP, [x8, 64]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 0]     // Prefetch B
        PRFM PLDL1KEEP, [x5, 64]
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        MOV x9, x3                  // p = ks

1:
        # Load next 4 A pointers
        LDP x13, x14, [x4], 16
        LDP x15, x8, [x4], 16

        # A pointer equal to `zero` is used as is; any other gets a_offset added
        CMP x13, x12                // if a0 == zero
        ADD x13, x13, x11           // a0 += a_offset
        CSEL x13, x12, x13, EQ      //   a0 = zero, else += a0 + a_offset
        CMP x14, x12                // if a1 == zero
        ADD x14, x14, x11           // a1 += a_offset
        CSEL x14, x12, x14, EQ      //   a1 = zero, else += a1 + a_offset
        CMP x15, x12                // if a2 == zero
        ADD x15, x15, x11           // a2 += a_offset
        CSEL x15, x12, x15, EQ      //   a2 = zero, else += a2 + a_offset
        CMP x8, x12                 // if a3 == zero
        ADD x8, x8, x11             // a3 += a_offset
        CSEL x8, x12, x8, EQ        //   a3 = zero, else += a3 + a_offset

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16             // k = kc - 16
        B.LO 4f

        # Prologue - First group loads, no FMA
        LDR d0, [x13], 8            // a0
        LDP q16, q17, [x5], 32      // b
        LDR d1, [x15], 8            // a2
        LD1 {v0.d}[1], [x14], 8     // a1
        LD1 {v1.d}[1], [x8], 8      // a3
        SUBS x0, x0, 16
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x19, [x5], 8            // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 3f

        # Main loop - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
2:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR d3, [x13], 8            // a0
        INS v19.d[1], x19           // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x19, [x14], 8           // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        // BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x19            // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x19, [x5, 8]            // b
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]

        // BLOCK 2
        LDR d4, [x15], 8            // a2
        INS v12.d[1], x19           // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x19, [x8], 8            // a3
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v20.4s, v18.4s, v0.s[1]

        // BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x19            // a3 ins
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR x19, [x5, 24]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]

        // BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x19           // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x19, [x5, 40]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x19           // b from previous
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR x19, [x5, 56]
        NOP
        NOP
        NOP
        NOP

        # Second group of 16 FMA, First group of loads
        // BLOCK 0
        LDR d0, [x13], 8            // a0
        INS v15.d[1], x19           // b from previous
        FMLA v20.4s, v12.4s, v3.s[0]
        LDR x19, [x14], 8           // a1
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]
        PRFM PLDL1KEEP, [x13, 128]  // Prefetch A0

        // BLOCK 1
        LDR d16, [x5, 64]
        INS v0.d[1], x19            // a1 ins
        FMLA v26.4s, v12.4s, v4.s[2]
        LDR x19, [x5, 72]           // b
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]
        PRFM PLDL1KEEP, [x14, 128]  // Prefetch A1

        // BLOCK 2
        LDR d1, [x15], 8            // a2
        INS v16.d[1], x19           // b
        FMLA v25.4s, v13.4s, v4.s[0]
        LDR x19, [x8], 8            // a3
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v20.4s, v14.4s, v3.s[1]
        PRFM PLDL1KEEP, [x15, 128]  // Prefetch A2

        // BLOCK 3
        LDR d17, [x5, 80]
        INS v1.d[1], x19            // a3 ins
        FMLA v22.4s, v14.4s, v3.s[3]
        LDR x19, [x5, 88]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v4.s[3]
        PRFM PLDL1KEEP, [x8, 128]   // Prefetch A3

        // BLOCK 4
        LDR d18, [x5, 96]
        INS v17.d[1], x19           // b
        FMLA v21.4s, v15.4s, v3.s[1]
        LDR x19, [x5, 104]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        PRFM PLDL1KEEP, [x5, 192]   // Prefetch B

        // BLOCK 5
        // NOTE that block needs to be 4 cycles for LDR not to stall
        LDR d19, [x5, 112]
        INS v18.d[1], x19
        FMLA v27.4s, v15.4s, v4.s[3]
        LDR x19, [x5, 120]          // upper half of v19, inserted in the next BLOCK 0
        SUBS x0, x0, 16
        PRFM PLDL1KEEP, [x5, 256]   // Prefetch B
        ADD x5, x5, 128             // 128 bytes of B consumed per iteration
        B.HS 2b

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 4 LD64 A + 8 LD64 B (only the second group is still loaded)
3:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR d3, [x13], 8            // a0
        INS v19.d[1], x19           // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x19, [x14], 8           // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        // BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x19            // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x19, [x5, 8]            // b
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]

        // BLOCK 2
        LDR d4, [x15], 8            // a2
        INS v12.d[1], x19           // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x19, [x8], 8            // a3
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v20.4s, v18.4s, v0.s[1]

        // BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x19            // a3 ins
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR x19, [x5, 24]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]

        // BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x19           // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x19, [x5, 40]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x19
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR x19, [x5, 56]
        NOP                         // fma
        NOP
        NOP                         // fma
        NOP

        # Second group of 16 FMA, no loads
        // BLOCK 0
        INS v15.d[1], x19           // b from previous
        FMLA v20.4s, v12.4s, v3.s[0]
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]

        // BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]

        // BLOCK 2
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v20.4s, v14.4s, v3.s[1]

        // BLOCK 3
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v4.s[3]

        // BLOCK 4
        FMLA v21.4s, v15.4s, v3.s[1]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        ADD x5, x5, 64              // skip the 64 bytes of B read via offsets in this epilogue

        // BLOCK 5
        FMLA v27.4s, v15.4s, v4.s[3]

4:
        # x0 is negative here; it has only had multiples of 16 subtracted from
        # kc, so bits 3 and 2 still equal those of kc.
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder?- 1 floats of A (4 bytes)
        TBNZ x0, 2, 7f
5:
        # ks loop
        SUBS x9, x9, 32             // ks -= MR * sizeof(void*)
        B.NE 1b

        # Clamp
        FMIN v20.4s, v20.4s, v6.4s
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s

        # Store full 4 x 8
        SUBS x1, x1, 8              // nc -= 8; flags also drive B.HI below
        B.LO 8f

        STP q26, q27, [x7]
        ADD x7, x7, x10             // c3 += cn_stride
        STP q24, q25, [x17]
        ADD x17, x17, x10           // c2 += cn_stride
        STP q22, q23, [x16]
        ADD x16, x16, x10           // c1 += cn_stride
        STP q20, q21, [x6]
        ADD x6, x6, x10             // c0 += cn_stride

        SUB x4, x4, x3              // a -= ks (does not alter the flags)

        # nc loop
        B.HI 0b

        // Restore x19, d12-d15 from stack
        LDR x19, [sp, 32]
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 48
        RET

        # Remainder - 2 floats of A (8 bytes)
        # 16 FMA + 4 LD64 A + 2 LDP B
6:
        LDR d0, [x13], 8
        LDP q16, q17, [x5], 32
        LD1 {v0.d}[1], [x14], 8
        LDR d1, [x15], 8
        LD1 {v1.d}[1], [x8], 8
        LDP q18, q19, [x5], 32
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[3]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 5b

7:
        # Remainder- 1 floats of A (4 bytes)
        # a1 and a3 go into lane 2 so the lane indices match the wider paths
        LDR s0, [x13], 4
        LDP q16, q17, [x5], 32
        LD1 {v0.s}[2], [x14], 4
        LDR s1, [x15], 4
        LD1 {v1.s}[2], [x8], 4

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        B 5b

        # Store odd width
        # x1 = nc - 8 is negative here; its low three bits equal those of nc
8:
        TBZ x1, 2, 9f               // 4 columns?
        STR q26, [x7], 16
        MOV v26.16b, v27.16b        // shift the remaining columns down
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
9:
        TBZ x1, 1, 10f              // 2 columns?
        STR d26, [x7], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f              // 1 column?
        STR s26, [x7]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
11:
        // Restore x19, d12-d15 from stack
        LDR x19, [sp, 32]
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 48
        RET

END_FUNCTION xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53

// On ELF targets, emit an empty .note.GNU-stack section so the linker knows
// this object does not require an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif