blob: 6c3934f49db3877963410f2c52a792df8bd5faf0 [file] [log] [blame]
// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
9
#include <xnnpack/assembly.h>

# void xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t*restrict a,  x3
#     size_t a_stride,           x4
#     const void*restrict w,     x5
#     uint8_t*restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 8] -> x8
#
# Computes up to 4 rows by 8 columns of C per pass over kc bytes of A, starting
# each accumulator from the 8 floats read at the current w position and clamping
# the result between the two floats read from params before storing.
#
# NOTE(review): the file header marks this file as generated from a template;
# the x18 -> x7 change below should be mirrored in that template.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3

# C pointers
#  x6 c0
# x16 c1
# x17 c2
#  x7 c3 / cm_stride
#
# c3 deliberately lives in x7 and not in x18: x18 is the platform register,
# reserved on Apple and Windows targets, so this function must not write it.
# cm_stride (x7) has no further use once c3 has been derived from it.

#  x0 mr on entry, then k (remaining bytes of A for the current column block)
#  x4 a_stride on entry, then temporary vector shadow register
#  x8 params pointer
# x14 cn_stride

# Vector register usage
# A0   v0     v3
# A1   v0[1]  v3[1]
# A2   v1     v4
# A3   v1[1]  v4[1]

# B    v12 v13 v14 v15  second set of B
# B    v16 v17 v18 v19  first set
# C    v20 v21
# C    v22 v23
# C    v24 v25
# C    v26 v27
# Clamp v6 v7           v6 is the lower bound (FMAX), v7 the upper bound (FMIN)

# Unused by this kernel: x12 x13 x15, v2 v5 v8 v9 v10 v11 v28 v29 v30 v31

BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53

        # Clamp A and C pointers
        # Rows beyond mr alias the previous row so they read and write valid memory.
        CMP     x0, 2                   // if mr < 2
        ADD     x9, x3, x4              // a1 = a0 + a_stride
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO          //   a1 = a0
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x10, x9, x4             // a2 = a1 + a_stride
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2)
        CSEL    x10, x9, x10, LS        //   a2 = a1
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x11, x10, x4            // a3 = a2 + a_stride
        ADD     x7, x17, x7             // c3 = c2 + cm_stride (last use of cm_stride)
        CSEL    x11, x10, x11, LO       //   a3 = a2
        CSEL    x7, x17, x7, LO         //   c3 = c2

        # Load params pointer
        LDR     x8, [sp, 8]

        # Load min/max values
        LD2R    {v6.4s, v7.4s}, [x8]

        # Load cn_stride
        LDR     x14, [sp]

        // Save d12-d15 on stack
        STP     d12, d13, [sp, -32]!
        STP     d14, d15, [sp, 16]

0:
        # Load initial bias from w into accumulators
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        PRFM    PLDL1KEEP, [x3, 0]      // Prefetch A
        PRFM    PLDL1KEEP, [x3, 64]
        MOV     v23.16b, v21.16b
        PRFM    PLDL1KEEP, [x9, 0]
        PRFM    PLDL1KEEP, [x9, 64]
        MOV     v24.16b, v20.16b
        PRFM    PLDL1KEEP, [x10, 0]
        PRFM    PLDL1KEEP, [x10, 64]
        MOV     v25.16b, v21.16b
        PRFM    PLDL1KEEP, [x11, 0]
        PRFM    PLDL1KEEP, [x11, 64]
        MOV     v26.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
        MOV     v27.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 64]
        PRFM    PLDL1KEEP, [x5, 128]
        PRFM    PLDL1KEEP, [x5, 192]

        # Are there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f

        # Prologue - First group loads, no FMA
        LDR     d0, [x3], 8             // a0
        LDP     q16, q17, [x5], 32      // b
        LDR     d1, [x10], 8            // a2
        LD1     {v0.d}[1], [x9], 8      // a1
        LD1     {v1.d}[1], [x11], 8     // a3
        SUBS    x0, x0, 16
        LDR     q18, [x5], 16
        LDR     d19, [x5], 8
        LDR     x4, [x5], 8             // ins is in BLOCK 0

        # Are there at least 4 floats (16 bytes) for main loop?
        B.LO    2f

        # Main loop - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
1:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR     d3, [x3], 8             // a0
        INS     v19.d[1], x4            // b from second group
        FMLA    v20.4s, v16.4s, v0.s[0]
        LDR     x4, [x9], 8             // a1
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]

        // BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x4             // a1 ins
        FMLA    v26.4s, v16.4s, v1.s[2]
        LDR     x4, [x5, 8]             // b
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]

        // BLOCK 2
        LDR     d4, [x10], 8            // a2
        INS     v12.d[1], x4            // b ins
        FMLA    v25.4s, v17.4s, v1.s[0]
        LDR     x4, [x11], 8            // a3
        FMLA    v27.4s, v17.4s, v1.s[2]
        FMLA    v20.4s, v18.4s, v0.s[1]

        // BLOCK 3
        LDR     d13, [x5, 16]
        INS     v4.d[1], x4             // a3 ins
        FMLA    v22.4s, v18.4s, v0.s[3]
        LDR     x4, [x5, 24]
        FMLA    v24.4s, v18.4s, v1.s[1]
        FMLA    v26.4s, v18.4s, v1.s[3]

        // BLOCK 4
        LDR     d14, [x5, 32]
        INS     v13.d[1], x4            // b
        FMLA    v21.4s, v19.4s, v0.s[1]
        LDR     x4, [x5, 40]
        FMLA    v23.4s, v19.4s, v0.s[3]
        FMLA    v25.4s, v19.4s, v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR     d15, [x5, 48]
        INS     v14.d[1], x4            // b from previous
        FMLA    v27.4s, v19.4s, v1.s[3]
        LDR     x4, [x5, 56]
        NOP
        NOP
        NOP
        NOP

        # Second group of 16 FMA, First group of loads
        // BLOCK 0
        LDR     d0, [x3], 8             // a0
        INS     v15.d[1], x4            // b from previous
        FMLA    v20.4s, v12.4s, v3.s[0]
        LDR     x4, [x9], 8             // a1
        FMLA    v22.4s, v12.4s, v3.s[2]
        FMLA    v24.4s, v12.4s, v4.s[0]
        PRFM    PLDL1KEEP, [x3, 128]    // Prefetch A0

        // BLOCK 1
        LDR     d16, [x5, 64]
        INS     v0.d[1], x4             // a1 ins
        FMLA    v26.4s, v12.4s, v4.s[2]
        LDR     x4, [x5, 72]            // b
        FMLA    v21.4s, v13.4s, v3.s[0]
        FMLA    v23.4s, v13.4s, v3.s[2]
        PRFM    PLDL1KEEP, [x9, 128]    // Prefetch A1

        // BLOCK 2
        LDR     d1, [x10], 8            // a2
        INS     v16.d[1], x4            // b
        FMLA    v25.4s, v13.4s, v4.s[0]
        LDR     x4, [x11], 8            // a3
        FMLA    v27.4s, v13.4s, v4.s[2]
        FMLA    v20.4s, v14.4s, v3.s[1]
        PRFM    PLDL1KEEP, [x10, 128]   // Prefetch A2

        // BLOCK 3
        LDR     d17, [x5, 80]
        INS     v1.d[1], x4             // a3 ins
        FMLA    v22.4s, v14.4s, v3.s[3]
        LDR     x4, [x5, 88]
        FMLA    v24.4s, v14.4s, v4.s[1]
        FMLA    v26.4s, v14.4s, v4.s[3]
        PRFM    PLDL1KEEP, [x11, 128]   // Prefetch A3

        // BLOCK 4
        LDR     d18, [x5, 96]
        INS     v17.d[1], x4            // b
        FMLA    v21.4s, v15.4s, v3.s[1]
        LDR     x4, [x5, 104]
        FMLA    v23.4s, v15.4s, v3.s[3]
        FMLA    v25.4s, v15.4s, v4.s[1]
        PRFM    PLDL1KEEP, [x5, 192]    // Prefetch B

        // BLOCK 5
        // NOTE that block needs to be 4 cycles for LDR not to stall
        LDR     d19, [x5, 112]
        INS     v18.d[1], x4
        FMLA    v27.4s, v15.4s, v4.s[3]
        LDR     x4, [x5, 120]
        SUBS    x0, x0, 16
        PRFM    PLDL1KEEP, [x5, 256]    // Prefetch B
        ADD     x5, x5, 128
        B.HS    1b

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
2:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR     d3, [x3], 8             // a0
        INS     v19.d[1], x4            // b from second group
        FMLA    v20.4s, v16.4s, v0.s[0]
        LDR     x4, [x9], 8             // a1
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]

        // BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x4             // a1 ins
        FMLA    v26.4s, v16.4s, v1.s[2]
        LDR     x4, [x5, 8]             // b
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]

        // BLOCK 2
        LDR     d4, [x10], 8            // a2
        INS     v12.d[1], x4            // b ins
        FMLA    v25.4s, v17.4s, v1.s[0]
        LDR     x4, [x11], 8            // a3
        FMLA    v27.4s, v17.4s, v1.s[2]
        FMLA    v20.4s, v18.4s, v0.s[1]

        // BLOCK 3
        LDR     d13, [x5, 16]
        INS     v4.d[1], x4             // a3 ins
        FMLA    v22.4s, v18.4s, v0.s[3]
        LDR     x4, [x5, 24]
        FMLA    v24.4s, v18.4s, v1.s[1]
        FMLA    v26.4s, v18.4s, v1.s[3]

        // BLOCK 4
        LDR     d14, [x5, 32]
        INS     v13.d[1], x4            // b
        FMLA    v21.4s, v19.4s, v0.s[1]
        LDR     x4, [x5, 40]
        FMLA    v23.4s, v19.4s, v0.s[3]
        FMLA    v25.4s, v19.4s, v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR     d15, [x5, 48]
        INS     v14.d[1], x4
        FMLA    v27.4s, v19.4s, v1.s[3]
        LDR     x4, [x5, 56]
        NOP                             // fma
        NOP
        NOP                             // fma
        NOP

        # Second group of 16 FMA, no loads
        // BLOCK 0
        INS     v15.d[1], x4            // b from previous
        FMLA    v20.4s, v12.4s, v3.s[0]
        FMLA    v22.4s, v12.4s, v3.s[2]
        FMLA    v24.4s, v12.4s, v4.s[0]

        // BLOCK 1
        FMLA    v26.4s, v12.4s, v4.s[2]
        FMLA    v21.4s, v13.4s, v3.s[0]
        FMLA    v23.4s, v13.4s, v3.s[2]

        // BLOCK 2
        FMLA    v25.4s, v13.4s, v4.s[0]
        FMLA    v27.4s, v13.4s, v4.s[2]
        FMLA    v20.4s, v14.4s, v3.s[1]

        // BLOCK 3
        FMLA    v22.4s, v14.4s, v3.s[3]
        FMLA    v24.4s, v14.4s, v4.s[1]
        FMLA    v26.4s, v14.4s, v4.s[3]
        TST     x0, 15                  // low 4 bits of k = leftover bytes of A

        // BLOCK 4
        FMLA    v21.4s, v15.4s, v3.s[1]
        FMLA    v23.4s, v15.4s, v3.s[3]
        FMLA    v25.4s, v15.4s, v4.s[1]
        ADD     x5, x5, 64

        // BLOCK 5
        FMLA    v27.4s, v15.4s, v4.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE    5f

4:
        # Clamp
        FMAX    v20.4s, v20.4s, v6.4s
        SUBS    x1, x1, 8               // nc -= 8; flags consumed by B.LO / B.HI below
        FMAX    v21.4s, v21.4s, v6.4s
        FMAX    v22.4s, v22.4s, v6.4s
        FMAX    v23.4s, v23.4s, v6.4s
        FMAX    v24.4s, v24.4s, v6.4s
        FMAX    v25.4s, v25.4s, v6.4s
        FMAX    v26.4s, v26.4s, v6.4s
        FMAX    v27.4s, v27.4s, v6.4s
        FMIN    v20.4s, v20.4s, v7.4s
        FMIN    v21.4s, v21.4s, v7.4s
        FMIN    v22.4s, v22.4s, v7.4s
        FMIN    v23.4s, v23.4s, v7.4s
        FMIN    v24.4s, v24.4s, v7.4s
        FMIN    v25.4s, v25.4s, v7.4s
        FMIN    v26.4s, v26.4s, v7.4s
        FMIN    v27.4s, v27.4s, v7.4s

        # Store full 4 x 8, unless fewer than 8 columns remain
        B.LO    8f

        ST1     {v20.16b, v21.16b}, [x6], x14
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v22.16b, v23.16b}, [x16], x14
        SUB     x9, x9, x2              // a1 -= kc
        ST1     {v24.16b, v25.16b}, [x17], x14
        SUB     x10, x10, x2            // a2 -= kc
        ST1     {v26.16b, v27.16b}, [x7], x14
        SUB     x11, x11, x2            // a3 -= kc

        B.HI    0b

        // Restore d12-d15 from stack
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 32
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ     x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR     d0, [x3], 8
        LDR     q16, [x5], 16
        LD1     {v0.d}[1], [x9], 8
        LDR     d1, [x10], 8
        LD1     {v1.d}[1], [x11], 8
        LDR     q17, [x5], 16
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16
        FMLA    v20.4s, v16.4s, v0.s[0]
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]
        FMLA    v26.4s, v16.4s, v1.s[2]
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[2]

        FMLA    v20.4s, v18.4s, v0.s[1]
        FMLA    v22.4s, v18.4s, v0.s[3]
        FMLA    v24.4s, v18.4s, v1.s[1]
        FMLA    v26.4s, v18.4s, v1.s[3]
        FMLA    v21.4s, v19.4s, v0.s[1]
        FMLA    v23.4s, v19.4s, v0.s[3]
        FMLA    v25.4s, v19.4s, v1.s[1]
        FMLA    v27.4s, v19.4s, v1.s[3]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ     x0, 2, 4b

6:
        # Remainder- 1 float of A (4 bytes)
        LDR     s0, [x3], 4
        LDR     q16, [x5], 16
        LD1     {v0.s}[2], [x9], 4
        LDR     s1, [x10], 4
        LD1     {v1.s}[2], [x11], 4
        LDR     q17, [x5], 16

        FMLA    v20.4s, v16.4s, v0.s[0]
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]
        FMLA    v26.4s, v16.4s, v1.s[2]
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[2]
        B       4b

        # Store odd width
        # The low 3 bits of x1 are unchanged by the subtraction of 8, so bits 2, 1
        # and 0 select a 4-, 2- and 1-column tail store in turn.
8:
        TBZ     x1, 2, 9f
        STR     q20, [x6], 16
        MOV     v20.16b, v21.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q26, [x7], 16
        MOV     v26.16b, v27.16b

9:
        TBZ     x1, 1, 10f
        STR     d20, [x6], 8
        DUP     d20, v20.d[1]
        STR     d22, [x16], 8
        DUP     d22, v22.d[1]
        STR     d24, [x17], 8
        DUP     d24, v24.d[1]
        STR     d26, [x7], 8
        DUP     d26, v26.d[1]

10:
        TBZ     x1, 0, 11f
        STR     s20, [x6]
        STR     s22, [x16]
        STR     s24, [x17]
        STR     s26, [x7]
11:
        // Restore d12-d15 from stack
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif