blob: 4504d2f8d370837521f002a9e19dd18eef20a1d4 [file] [log] [blame]
// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 8] -> x8

// Purpose
//   Computes an (up to) 6-row by 8-column tile of
//       C = clamp(bias + A * B, lower, upper)
//   in single precision, and repeats for successive 8-column tiles until
//   nc columns have been written.
//
// Inputs, as used by the code below
//   mr         rows of A / C to produce.  Rows at index >= mr reuse the
//              pointer of the previous row (see the CSEL chain), so they are
//              recomputed and re-stored on top of an already valid row.
//   nc         columns still to produce, counted in floats (8 per tile).
//   kc         reduction length in BYTES (16 bytes = 4 floats per main-loop
//              step).  NOTE(review): the remainder logic assumes kc is a
//              non-zero multiple of sizeof(float) - confirm against callers.
//   a          first row of A; row i starts at a + i * a_stride (bytes).
//   w          packed weights: for each 8-column tile, 8 bias floats followed
//              by 8 floats of B per k step.
//   c          first row of C; row i starts at c + i * cm_stride (bytes).
//   cn_stride  byte increment applied to every C row pointer after a full
//              8-column store.
//   params     two consecutive floats: the first is used as the lower bound
//              (FMAX operand, v6), the second as the upper bound (FMIN
//              operand, v7).
//
// Clobbers x0-x17, v0-v7, v16-v31 and the flags; d12-d15 are saved on the
// stack on entry and restored before each RET.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3
# x12 a4
# x4  a5

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x15 c3
# x13 c4
# x7  c5

// c3 is deliberately kept in x15 and not x18: x18 is the AAPCS64 platform
// register and must not be modified on platforms that reserve it.

# x8 temporary vector shadow register

# Vector register usage
# A0   v0     v3
# A1   v0[1]  v3[1]
# A2   v1     v4
# A3   v1[1]  v4[1]
# A4   v2     v5
# A5   v2[1]  v5[1]
# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11

BEGIN_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53

        # Clamp A and C pointers
        // Each CMP feeds two pairs of CSELs (LO, then LS); ADD and CSEL do
        // not modify the flags, so the second pair still sees the same CMP.
        CMP     x0, 2                   // if mr < 2
        ADD     x9, x3, x4              // a1 = a0 + a_stride
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO          //   a1 = a0
        CSEL    x16, x6, x16, LO        //   c1 = c0

        ADD     x10, x9, x4             // a2 = a1 + a_stride
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x10, x9, x10, LS        //   a2 = a1
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x11, x10, x4            // a3 = a2 + a_stride
        ADD     x15, x17, x7            // c3 = c2 + cm_stride
        CSEL    x11, x10, x11, LO       //   a3 = a2
        CSEL    x15, x17, x15, LO       //   c3 = c2

        ADD     x12, x11, x4            // a4 = a3 + a_stride
        ADD     x13, x15, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x12, x11, x12, LS       //   a4 = a3
        CSEL    x13, x15, x13, LS       //   c4 = c3

        # Load params pointer
        LDR     x8, [sp, 8]

        CMP     x0, 6                   // if mr < 6
        ADD     x4, x12, x4             // a5 = a4 + a_stride
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x4, x12, x4, LO         //   a5 = a4
        CSEL    x7, x13, x7, LO         //   c5 = c4

        # Load min/max values
        // v6 = broadcast of params[0] (lower bound, used by FMAX),
        // v7 = broadcast of params[1] (upper bound, used by FMIN).
        LD2R    {v6.4s, v7.4s}, [x8]

        # Load cn_stride
        // Must be read before sp is moved by the STP below.
        LDR     x14, [sp]

        // Save d12-d15 on stack
        STP     d12, d13, [sp, -32]!
        STP     d14, d15, [sp, 16]

        // Outer loop: one iteration per 8-column tile of C.
0:
        # Load initial bias from w into accumulators
        // Every row starts from the same 8 bias values (v20/v21 copied to the
        // other five accumulator pairs); prefetches are interleaved with the
        // register copies.
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        PRFM    PLDL1KEEP, [x3, 0]      // Prefetch A
        PRFM    PLDL1KEEP, [x3, 64]
        MOV     v23.16b, v21.16b
        PRFM    PLDL1KEEP, [x9, 0]
        PRFM    PLDL1KEEP, [x9, 64]
        MOV     v24.16b, v20.16b
        PRFM    PLDL1KEEP, [x10, 0]
        PRFM    PLDL1KEEP, [x10, 64]
        MOV     v25.16b, v21.16b
        PRFM    PLDL1KEEP, [x11, 0]
        PRFM    PLDL1KEEP, [x11, 64]
        MOV     v26.16b, v20.16b
        PRFM    PLDL1KEEP, [x12, 0]
        PRFM    PLDL1KEEP, [x12, 64]
        MOV     v27.16b, v21.16b
        PRFM    PLDL1KEEP, [x4, 0]
        PRFM    PLDL1KEEP, [x4, 64]
        MOV     v28.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
        MOV     v29.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 64]
        MOV     v30.16b, v20.16b
        PRFM    PLDL1KEEP, [x5, 128]
        MOV     v31.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 192]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f

        # Prologue - First group loads, no FMA
        // Two rows of A share one vector: even row in the low 64 bits, odd
        // row in the high 64 bits (lanes s[0..1] / s[2..3]).
        LDR     d0, [x3], 8             // a0
        LDP     q16, q17, [x5], 32      // b
        LDR     d1, [x10], 8            // a2
        LDR     d2, [x12], 8            // a4
        LD1     {v0.d}[1], [x9], 8      // a1
        LD1     {v1.d}[1], [x11], 8     // a3
        LD1     {v2.d}[1], [x4], 8      // a5
        SUBS    x0, x0, 16
        LDR     q18, [x5], 16
        LDR     d19, [x5], 8
        LDR     x8, [x5], 8             // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        // Tests the flags of the SUBS above; the intervening LDRs do not
        // modify them.
        B.LO    2f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
        // 128-bit operands are assembled from a 64-bit LDR into the low half
        // plus a 64-bit load into x8 followed by INS into the high half.
        // NOTE(review): presumably chosen so loads pair with FMLA on
        // Cortex-A53 - keep the instruction order as is.
1:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        LDR     d3, [x3], 8             // a0
        INS     v19.d[1], x8            // b from second group
        FMLA    v20.4s, v16.4s, v0.s[0]
        LDR     x8, [x9], 8             // a1
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]

        // BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x8             // a1 ins
        FMLA    v26.4s, v16.4s, v1.s[2]
        LDR     x8, [x5, 8]             // b
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v2.s[2]

        // BLOCK 2
        LDR     d4, [x10], 8            // a2
        INS     v12.d[1], x8            // b ins
        FMLA    v21.4s, v17.4s, v0.s[0]
        LDR     x8, [x11], 8            // a3
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]

        // BLOCK 3
        LDR     d5, [x12], 8            // a4
        INS     v4.d[1], x8             // a3 ins
        FMLA    v27.4s, v17.4s, v1.s[2]
        LDR     x8, [x4], 8             // a5
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v31.4s, v17.4s, v2.s[2]

        // BLOCK 4
        LDR     d13, [x5, 16]
        INS     v5.d[1], x8             // a5 ins
        FMLA    v20.4s, v18.4s, v0.s[1]
        LDR     x8, [x5, 24]
        FMLA    v22.4s, v18.4s, v0.s[3]
        FMLA    v24.4s, v18.4s, v1.s[1]

        // BLOCK 5
        LDR     d14, [x5, 32]
        INS     v13.d[1], x8            // b
        FMLA    v26.4s, v18.4s, v1.s[3]
        LDR     x8, [x5, 40]
        FMLA    v28.4s, v18.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v2.s[3]

        // BLOCK 6
        LDR     d15, [x5, 48]
        INS     v14.d[1], x8            // b
        FMLA    v21.4s, v19.4s, v0.s[1]
        LDR     x8, [x5, 56]
        FMLA    v23.4s, v19.4s, v0.s[3]
        FMLA    v25.4s, v19.4s, v1.s[1]

        // BLOCK 7
        INS     v15.d[1], x8            // b
        FMLA    v27.4s, v19.4s, v1.s[3]
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v31.4s, v19.4s, v2.s[3]

        # Second group of 24 FMA, First group of loads
        // BLOCK 0
        LDR     d0, [x3], 8             // a0
        FMLA    v20.4s, v12.4s, v3.s[0]
        LDR     x8, [x9], 8             // a1
        FMLA    v22.4s, v12.4s, v3.s[2]
        FMLA    v24.4s, v12.4s, v4.s[0]
        PRFM    PLDL1KEEP, [x3, 128]    // Prefetch A0

        // BLOCK 1
        LDR     d16, [x5, 64]
        INS     v0.d[1], x8             // a1 ins
        FMLA    v26.4s, v12.4s, v4.s[2]
        LDR     x8, [x5, 72]            // b
        FMLA    v28.4s, v12.4s, v5.s[0]
        FMLA    v30.4s, v12.4s, v5.s[2]
        PRFM    PLDL1KEEP, [x9, 128]    // Prefetch A1

        // BLOCK 2
        LDR     d1, [x10], 8            // a2
        INS     v16.d[1], x8            // b
        FMLA    v21.4s, v13.4s, v3.s[0]
        LDR     x8, [x11], 8            // a3
        FMLA    v23.4s, v13.4s, v3.s[2]
        FMLA    v25.4s, v13.4s, v4.s[0]
        PRFM    PLDL1KEEP, [x10, 128]   // Prefetch A2

        // BLOCK 3
        LDR     d2, [x12], 8            // a4
        INS     v1.d[1], x8             // a3 ins
        FMLA    v27.4s, v13.4s, v4.s[2]
        LDR     x8, [x4], 8             // a5
        FMLA    v29.4s, v13.4s, v5.s[0]
        FMLA    v31.4s, v13.4s, v5.s[2]
        PRFM    PLDL1KEEP, [x11, 128]   // Prefetch A3

        // BLOCK 4
        LDR     d17, [x5, 80]
        INS     v2.d[1], x8             // a5 ins
        FMLA    v20.4s, v14.4s, v3.s[1]
        LDR     x8, [x5, 88]
        FMLA    v22.4s, v14.4s, v3.s[3]
        FMLA    v24.4s, v14.4s, v4.s[1]
        PRFM    PLDL1KEEP, [x12, 128]   // Prefetch A4

        // BLOCK 5
        LDR     d18, [x5, 96]
        INS     v17.d[1], x8            // b
        FMLA    v26.4s, v14.4s, v4.s[3]
        LDR     x8, [x5, 104]
        FMLA    v28.4s, v14.4s, v5.s[1]
        FMLA    v30.4s, v14.4s, v5.s[3]
        PRFM    PLDL1KEEP, [x4, 128]    // Prefetch A5

        // BLOCK 6
        LDR     d19, [x5, 112]
        INS     v18.d[1], x8            // b
        FMLA    v21.4s, v15.4s, v3.s[1]
        LDR     x8, [x5, 120]           // high half of v19; INS is in BLOCK 0
        FMLA    v23.4s, v15.4s, v3.s[3]
        PRFM    PLDL1KEEP, [x5, 192]    // Prefetch B
        FMLA    v25.4s, v15.4s, v4.s[1]
        PRFM    PLDL1KEEP, [x5, 256]    // Prefetch B

        // BLOCK 7
        SUBS    x0, x0, 16              // LDR lands here
        FMLA    v27.4s, v15.4s, v4.s[3]
        FMLA    v29.4s, v15.4s, v5.s[1]
        ADD     x5, x5, 128             // w += 4 k steps * 8 floats
        FMLA    v31.4s, v15.4s, v5.s[3]
        B.HS    1b

        # Epilogue - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
        // Same as one main-loop iteration, but the second group issues no
        // further loads; the C rows are prefetched for the upcoming stores.
2:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        LDR     d3, [x3], 8             // a0
        INS     v19.d[1], x8            // b from second group
        FMLA    v20.4s, v16.4s, v0.s[0]
        LDR     x8, [x9], 8             // a1
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]
        PRFM    PSTL1KEEP, [x6]         // Prefetch C0

        // BLOCK 1
        LDR     d12, [x5]
        INS     v3.d[1], x8             // a1 ins
        FMLA    v26.4s, v16.4s, v1.s[2]
        LDR     x8, [x5, 8]             // b
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v2.s[2]
        PRFM    PSTL1KEEP, [x16]        // Prefetch C1

        // BLOCK 2
        LDR     d4, [x10], 8            // a2
        INS     v12.d[1], x8            // b ins
        FMLA    v21.4s, v17.4s, v0.s[0]
        LDR     x8, [x11], 8            // a3
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]
        PRFM    PSTL1KEEP, [x17]        // Prefetch C2

        // BLOCK 3
        LDR     d5, [x12], 8            // a4
        INS     v4.d[1], x8             // a3 ins
        FMLA    v27.4s, v17.4s, v1.s[2]
        LDR     x8, [x4], 8             // a5
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v31.4s, v17.4s, v2.s[2]
        PRFM    PSTL1KEEP, [x15]        // Prefetch C3

        // BLOCK 4
        LDR     d13, [x5, 16]
        INS     v5.d[1], x8             // a5 ins
        FMLA    v20.4s, v18.4s, v0.s[1]
        LDR     x8, [x5, 24]
        FMLA    v22.4s, v18.4s, v0.s[3]
        FMLA    v24.4s, v18.4s, v1.s[1]
        PRFM    PSTL1KEEP, [x13]        // Prefetch C4

        // BLOCK 5
        LDR     d14, [x5, 32]
        INS     v13.d[1], x8            // b
        FMLA    v26.4s, v18.4s, v1.s[3]
        LDR     x8, [x5, 40]
        FMLA    v28.4s, v18.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v2.s[3]
        PRFM    PSTL1KEEP, [x7]         // Prefetch C5

        // BLOCK 6
        LDR     d15, [x5, 48]
        INS     v14.d[1], x8            // b
        FMLA    v21.4s, v19.4s, v0.s[1]
        LDR     x8, [x5, 56]
        FMLA    v23.4s, v19.4s, v0.s[3]
        FMLA    v25.4s, v19.4s, v1.s[1]

        // BLOCK 7
        INS     v15.d[1], x8            // b
        FMLA    v27.4s, v19.4s, v1.s[3]
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v31.4s, v19.4s, v2.s[3]

        # Second group of 24 FMA, no loads
        // BLOCK 0
        FMLA    v20.4s, v12.4s, v3.s[0]
        FMLA    v22.4s, v12.4s, v3.s[2]
        FMLA    v24.4s, v12.4s, v4.s[0]

        // BLOCK 1
        FMLA    v26.4s, v12.4s, v4.s[2]
        FMLA    v28.4s, v12.4s, v5.s[0]
        FMLA    v30.4s, v12.4s, v5.s[2]

        // BLOCK 2
        FMLA    v21.4s, v13.4s, v3.s[0]
        FMLA    v23.4s, v13.4s, v3.s[2]
        FMLA    v25.4s, v13.4s, v4.s[0]

        // BLOCK 3
        FMLA    v27.4s, v13.4s, v4.s[2]
        FMLA    v29.4s, v13.4s, v5.s[0]
        FMLA    v31.4s, v13.4s, v5.s[2]

        // BLOCK 4
        FMLA    v20.4s, v14.4s, v3.s[1]
        FMLA    v22.4s, v14.4s, v3.s[3]
        FMLA    v24.4s, v14.4s, v4.s[1]

        // BLOCK 5
        FMLA    v26.4s, v14.4s, v4.s[3]
        FMLA    v28.4s, v14.4s, v5.s[1]
        FMLA    v30.4s, v14.4s, v5.s[3]

        // BLOCK 6
        FMLA    v21.4s, v15.4s, v3.s[1]
        FMLA    v23.4s, v15.4s, v3.s[3]
        FMLA    v25.4s, v15.4s, v4.s[1]
        TST     x0, 15                  // x0 only ever changed by multiples of 16, so this tests kc & 15

        // BLOCK 7
        FMLA    v27.4s, v15.4s, v4.s[3]
        FMLA    v29.4s, v15.4s, v5.s[1]
        FMLA    v31.4s, v15.4s, v5.s[3]
        ADD     x5, x5, 64              // w += 2 k steps * 8 floats (second-group B read via offsets)

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        // Uses the flags of the TST above; FMLA and ADD leave them intact.
        B.NE    5f
4:
        # Clamp
        FMAX    v20.4s, v20.4s, v6.4s
        // Flags set here are consumed by B.LO / B.HI further down; none of
        // the FMAX/FMIN/ST1/SUB instructions in between modifies them.
        SUBS    x1, x1, 8               // nc -= 8
        FMAX    v21.4s, v21.4s, v6.4s
        FMAX    v22.4s, v22.4s, v6.4s
        FMAX    v23.4s, v23.4s, v6.4s
        FMAX    v24.4s, v24.4s, v6.4s
        FMAX    v25.4s, v25.4s, v6.4s
        FMAX    v26.4s, v26.4s, v6.4s
        FMAX    v27.4s, v27.4s, v6.4s
        FMAX    v28.4s, v28.4s, v6.4s
        FMAX    v29.4s, v29.4s, v6.4s
        FMAX    v30.4s, v30.4s, v6.4s
        FMAX    v31.4s, v31.4s, v6.4s
        FMIN    v20.4s, v20.4s, v7.4s
        FMIN    v21.4s, v21.4s, v7.4s
        FMIN    v22.4s, v22.4s, v7.4s
        FMIN    v23.4s, v23.4s, v7.4s
        FMIN    v24.4s, v24.4s, v7.4s
        FMIN    v25.4s, v25.4s, v7.4s
        FMIN    v26.4s, v26.4s, v7.4s
        FMIN    v27.4s, v27.4s, v7.4s
        FMIN    v28.4s, v28.4s, v7.4s
        FMIN    v29.4s, v29.4s, v7.4s
        FMIN    v30.4s, v30.4s, v7.4s
        FMIN    v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO    8f                      // fewer than 8 columns left: partial store

        // Each store advances its C row by cn_stride; each A row is rewound
        // by kc so the next column tile re-reads the same rows of A.
        ST1     {v20.16b, v21.16b}, [x6], x14
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v22.16b, v23.16b}, [x16], x14
        SUB     x9, x9, x2              // a1 -= kc
        ST1     {v24.16b, v25.16b}, [x17], x14
        SUB     x10, x10, x2            // a2 -= kc
        ST1     {v26.16b, v27.16b}, [x15], x14
        SUB     x11, x11, x2            // a3 -= kc
        ST1     {v28.16b, v29.16b}, [x13], x14
        SUB     x12, x12, x2            // a4 -= kc
        ST1     {v30.16b, v31.16b}, [x7], x14
        SUB     x4, x4, x2              // a5 -= kc

        B.HI    0b                      // columns remain: next tile

        // Restore d12-d15 from stack
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 32
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        // Bits 3 and 2 of x0 equal bits 3 and 2 of kc, because x0 differs
        // from kc by a multiple of 16.
        TBZ     x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR     d0, [x3], 8
        LDR     q16, [x5], 16
        LD1     {v0.d}[1], [x9], 8
        LDR     d1, [x10], 8
        LD1     {v1.d}[1], [x11], 8
        LDR     d2, [x12], 8
        LD1     {v2.d}[1], [x4], 8
        LDR     q17, [x5], 16
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16

        FMLA    v20.4s, v16.4s, v0.s[0]
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]
        FMLA    v26.4s, v16.4s, v1.s[2]
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v2.s[2]
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[2]
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v31.4s, v17.4s, v2.s[2]

        FMLA    v20.4s, v18.4s, v0.s[1]
        FMLA    v22.4s, v18.4s, v0.s[3]
        FMLA    v24.4s, v18.4s, v1.s[1]
        FMLA    v26.4s, v18.4s, v1.s[3]
        FMLA    v28.4s, v18.4s, v2.s[1]
        FMLA    v30.4s, v18.4s, v2.s[3]
        FMLA    v21.4s, v19.4s, v0.s[1]
        FMLA    v23.4s, v19.4s, v0.s[3]
        FMLA    v25.4s, v19.4s, v1.s[1]
        FMLA    v27.4s, v19.4s, v1.s[3]
        FMLA    v29.4s, v19.4s, v2.s[1]
        FMLA    v31.4s, v19.4s, v2.s[3]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ     x0, 2, 4b
6:
        # Remainder- 1 floats of A (4 bytes)
        // Even rows land in lane s[0], odd rows in lane s[2], matching the
        // lane indices used by the FMLAs below.
        LDR     s0, [x3], 4
        LDR     q16, [x5], 16
        LD1     {v0.s}[2], [x9], 4
        LDR     s1, [x10], 4
        LD1     {v1.s}[2], [x11], 4
        LDR     s2, [x12], 4
        LD1     {v2.s}[2], [x4], 4
        LDR     q17, [x5], 16

        FMLA    v20.4s, v16.4s, v0.s[0]
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]
        FMLA    v26.4s, v16.4s, v1.s[2]
        FMLA    v28.4s, v16.4s, v2.s[0]
        FMLA    v30.4s, v16.4s, v2.s[2]
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[2]
        FMLA    v29.4s, v17.4s, v2.s[0]
        FMLA    v31.4s, v17.4s, v2.s[2]
        B       4b

        # Store odd width
        // x1 now holds nc - 8 (negative); its low three bits equal those of
        // the remaining column count, so bits 2/1/0 select 4/2/1 floats.
8:
        TBZ     x1, 2, 9f
        STR     q20, [x6], 16
        MOV     v20.16b, v21.16b        // shift the upper 4 columns down
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q26, [x15], 16
        MOV     v26.16b, v27.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b

9:
        TBZ     x1, 1, 10f
        STR     d20, [x6], 8
        DUP     d20, v20.d[1]           // shift the upper 2 columns down
        STR     d22, [x16], 8
        DUP     d22, v22.d[1]
        STR     d24, [x17], 8
        DUP     d24, v24.d[1]
        STR     d26, [x15], 8
        DUP     d26, v26.d[1]
        STR     d28, [x13], 8
        DUP     d28, v28.d[1]
        STR     d30, [x7], 8
        DUP     d30, v30.d[1]

10:
        TBZ     x1, 0, 11f
        STR     s20, [x6]
        STR     s22, [x16]
        STR     s24, [x17]
        STR     s26, [x15]
        STR     s28, [x13]
        STR     s30, [x7]
11:
        // Restore d12-d15 from stack
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53

// On ELF targets, emit an empty .note.GNU-stack section so the linker does
// not mark the stack executable because of this object file.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif