// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# Overview
# Each pass of the outer loop (label 0) produces one tile of up to 4 rows by
# 12 columns of C. The twelve accumulators v20-v31 start from 12 floats read
# at w (copied to every row) or, in the INC variant, from 48 floats read
# through acc. They are then updated with FMLA using A values from a0-a3 and
# B values read through w, clamped with the two floats at params, and stored
# through c0-c3. kc and nc are counted in bytes of float data (kc) and in
# columns (nc): kc is consumed 16 bytes per main loop step, nc 12 per tile.
# Rows at index mr and above reuse the pointers of the previous row.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# x8 temporary vector shadow register

# Vector register usage and GPR shadows
# (main loop layout; the remainder code at labels 5 and 6 instead keeps one
# row of A in each of v0, v1, v2, v3)
# a0  v0
# a1  v0[1]
# a2  v1
# a3  v1[1]
# a0  v2
# a1  v2[1]
# a2  v3
# a3  v3[1]
# B   v6  v7  v8
# B   v9 v10 v11
# B  v14 v15 v16
# B  v17 v18 v19
# C  v20 v21 v22
# C  v23 v24 v25
# C  v26 v27 v28
# C  v29 v30 v31
# Clamp v4 v5
# v12 to v13 unused.

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x12__aarch64_neonfma_cortex_a53

        # Stack arguments are read here, before sp is moved by the register save.
        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load clamping_params values
        # v4 = first float at params, v5 = second float, each broadcast to all
        # lanes. v4 is applied with FMIN and v5 with FMAX at label 4.
        LD2R {v4.4s, v5.4s}, [x8]

        # Save d8-d11,d14,d15 on stack
        # (v12 and v13 are never written, so d12/d13 need no save)
        STP d8, d9, [sp, -48]!
        STP d10, d11, [sp, 16]
        STP d14, d15, [sp, 32]

        # Clamp A and C pointers
        # ADD does not write flags, so each CSEL pair below still sees the
        # result of the preceding CMP.
        CMP x0, 2                // if mr < 2
        ADD x11, x3, x4          // a1 = a0 + a_stride
        ADD x9, x6, x7           // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO    //   a1 = a0
        CSEL x9, x6, x9, LO      //   c1 = c0
        ADD x12, x11, x4         // a2 = a1 + a_stride
        ADD x10, x9, x7          // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x12, x11, x12, LS   //   a2 = a1
        CSEL x10, x9, x10, LS    //   c2 = c1
        CMP x0, 4                // if mr < 4
        ADD x4, x12, x4          // a3 = a2 + a_stride
        ADD x7, x10, x7          // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO     //   a3 = a2
        CSEL x7, x10, x7, LO     //   c3 = c2

        # Outer loop: one pass per tile of up to 12 columns.
0:
        $if INC:
          # Load initial accumulators
          LD1 {v20.16b, v21.16b, v22.16b}, [x15], 48
          LD1 {v23.16b, v24.16b, v25.16b}, [x15], 48
          LD1 {v26.16b, v27.16b, v28.16b}, [x15], 48
          LD1 {v29.16b, v30.16b, v31.16b}, [x15], 48
          PRFM PLDL1KEEP, [x3, 0]     // Prefetch A
          PRFM PLDL1KEEP, [x3, 64]
          PRFM PLDL1KEEP, [x11, 0]
          PRFM PLDL1KEEP, [x11, 64]
          PRFM PLDL1KEEP, [x12, 0]
          PRFM PLDL1KEEP, [x12, 64]
          PRFM PLDL1KEEP, [x4, 0]
          PRFM PLDL1KEEP, [x4, 64]
          PRFM PLDL1KEEP, [x5, 0]     // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          PRFM PLDL1KEEP, [x5, 256]
          PRFM PLDL1KEEP, [x5, 320]
        $else:
          # Load initial bias from w into accumulators
          # Row 0 is loaded; rows 1-3 are copies, interleaved with prefetches.
          LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48
          MOV v23.16b, v20.16b
          PRFM PLDL1KEEP, [x3, 0]     // Prefetch A
          PRFM PLDL1KEEP, [x3, 64]
          MOV v24.16b, v21.16b
          PRFM PLDL1KEEP, [x11, 0]
          PRFM PLDL1KEEP, [x11, 64]
          MOV v25.16b, v22.16b
          PRFM PLDL1KEEP, [x12, 0]
          PRFM PLDL1KEEP, [x12, 64]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP, [x4, 0]
          PRFM PLDL1KEEP, [x4, 64]
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 0]     // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          MOV v28.16b, v22.16b
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          MOV v29.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 256]
          MOV v30.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 320]
          MOV v31.16b, v22.16b

        # Are there at least 4 floats (16 bytes)?
        # x0 is reused as the K counter from here on; mr is no longer needed.
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # k = kc - 32. The loads below do not write flags, so this result is
        # what the B.LO after the prologue tests.
        SUBS x0, x0, 16

        # Prologue - loads for first group of 24 FMA

        # Read first block of 4 A.
        LDR d0, [x3], 8             // a0
        LDR d1, [x12], 8            // a2
        LD1 {v0.d}[1], [x11], 8     // a1
        LD1 {v1.d}[1], [x4], 8      // a3

        # v11 is loaded as two 64-bit halves; the upper half waits in x8 and is
        # inserted by the first INS of the main loop or epilogue.
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LD1 {v9.16b, v10.16b}, [x5], 32
        LDR d11, [x5], 8
        LDR x8, [x5], 8

        # Are there at least 8 floats (32 bytes) of A? Otherwise skip the main
        # loop; the epilogue alone consumes 4 floats.
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
1:
        # First group of 24 fma. 8 blocks of 4 cycles. LDR + 3 FMA
        # A is loaded for 2nd group into v2/v3
        # Each 128-bit vector is filled as a 64-bit LDR into the low half plus a
        # 64-bit LDR into x8; the INS that moves x8 into the upper half is
        # issued one block after that load.

        # BLOCK 0
        LDR d2, [x3], 8             // a0
        INS v11.d[1], x8
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x8, [x11], 8            // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        FMLA v26.4s, v6.4s, v1.s[0]
        PRFM PLDL1KEEP, [x3, 128]   // Prefetch A0

        # BLOCK 1
        LDR d3, [x12], 8            // a2
        INS v2.d[1], x8             // a1 was loaded in block 0
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x4], 8             // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v0.s[2]
        PRFM PLDL1KEEP, [x11, 128]  // Prefetch A1

        # BLOCK 2
        LDR d14, [x5]               // vb0x0123
        INS v3.d[1], x8             // a3 was loaded in block 1
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x8, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        FMLA v22.4s, v8.4s, v0.s[0]
        PRFM PLDL1KEEP, [x12, 128]  // Prefetch A2

        # BLOCK 3
        LDR d15, [x5, 16]           // vb0x4567
        INS v14.d[1], x8            // v14 was loaded in block 2
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x8, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]
        PRFM PLDL1KEEP, [x4, 128]   // Prefetch A3

        # BLOCK 4
        LDR d16, [x5, 32]           // vb0x89AB
        INS v15.d[1], x8
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x8, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]
        PRFM PLDL1KEEP, [x5, 320]   // Prefetch B

        # BLOCK 5
        LDR d17, [x5, 48]           // vb1x0123
        INS v16.d[1], x8
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x8, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]
        PRFM PLDL1KEEP, [x5, 384]   // Prefetch B

        # BLOCK 6
        LDR d18, [x5, 64]           // vb1x4567
        INS v17.d[1], x8
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x8, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 448]   // Prefetch B

        # BLOCK 7
        LDR d19, [x5, 80]           // vb1x89AB
        INS v18.d[1], x8
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x8, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma. 8 blocks of 4 cycles. LDR + 3 FMA
        # A is loaded for 1st group into v0/v1
        # B for the next first group is loaded into v6-v11.

        # BLOCK 0
        LDR d0, [x3], 8             // a0
        INS v19.d[1], x8
        FMLA v20.4s, v14.4s, v2.s[0]
        LDR x8, [x11], 8            // a1
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        LDR d1, [x12], 8            // a2
        INS v0.d[1], x8             // a1
        FMLA v29.4s, v14.4s, v3.s[2]
        LDR x8, [x4], 8             // a3
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        LDR d6, [x5, 96]            // vb0x0123
        INS v1.d[1], x8             // a3
        FMLA v27.4s, v15.4s, v3.s[0]
        LDR x8, [x5, 104]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        LDR d7, [x5, 112]           // vb0x4567
        INS v6.d[1], x8
        FMLA v25.4s, v16.4s, v2.s[2]
        LDR x8, [x5, 120]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        LDR d8, [x5, 128]           // vb0x89AB
        INS v7.d[1], x8
        FMLA v20.4s, v17.4s, v2.s[1]
        LDR x8, [x5, 136]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        LDR d9, [x5, 144]           // vb1x0123
        INS v8.d[1], x8
        FMLA v29.4s, v17.4s, v3.s[3]
        LDR x8, [x5, 152]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        LDR d10, [x5, 160]          // vb1x4567
        INS v9.d[1], x8
        FMLA v27.4s, v18.4s, v3.s[1]
        LDR x8, [x5, 168]
        FMLA v30.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 16             // k -= 16; flags are consumed by B.HS below
        FMLA v22.4s, v19.4s, v2.s[1]

        # BLOCK 7
        LDR d11, [x5, 176]          // vb1x89AB
        INS v10.d[1], x8
        FMLA v25.4s, v19.4s, v2.s[3]
        LDR x8, [x5, 184]
        FMLA v28.4s, v19.4s, v3.s[1]
        ADD x5, x5, 192             // w += 192 bytes, the B data read this iteration
        FMLA v31.4s, v19.4s, v3.s[3]
        B.HS 1b

        # Epilogue
        # First block same as main loop. Second block has no loads.
2:
        # BLOCK 0
        LDR d2, [x3], 8             // a0
        INS v11.d[1], x8
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x8, [x11], 8            // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        FMLA v26.4s, v6.4s, v1.s[0]

        # BLOCK 1
        LDR d3, [x12], 8            // a2
        INS v2.d[1], x8             // a1 was loaded in block 0
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x4], 8             // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v0.s[2]

        # BLOCK 2
        LDR d14, [x5]               // vb0x0123
        INS v3.d[1], x8             // a3 was loaded in block 1
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x8, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        FMLA v22.4s, v8.4s, v0.s[0]

        # BLOCK 3
        LDR d15, [x5, 16]           // vb0x4567
        INS v14.d[1], x8            // v14 was loaded in block 2
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x8, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]

        # BLOCK 4
        LDR d16, [x5, 32]           // vb0x89AB
        INS v15.d[1], x8
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x8, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]

        # BLOCK 5
        LDR d17, [x5, 48]           // vb1x0123
        INS v16.d[1], x8
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x8, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]

        # BLOCK 6
        LDR d18, [x5, 64]           // vb1x4567
        INS v17.d[1], x8
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x8, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]           // vb1x89AB
        INS v18.d[1], x8
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x8, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma. 8 blocks of 3 FMA.
        # No further A or B loads; only the pending upper half of v19 is inserted.

        # BLOCK 0
        INS v19.d[1], x8
        FMLA v20.4s, v14.4s, v2.s[0]
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        FMLA v29.4s, v14.4s, v3.s[2]
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        FMLA v27.4s, v15.4s, v3.s[0]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        FMLA v25.4s, v16.4s, v2.s[2]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        FMLA v20.4s, v17.4s, v2.s[1]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        FMLA v29.4s, v17.4s, v3.s[3]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        FMLA v27.4s, v18.4s, v3.s[1]
        FMLA v30.4s, v18.4s, v3.s[3]
        FMLA v22.4s, v19.4s, v2.s[1]
        # x0 is negative here, but only multiples of 16 were subtracted from kc,
        # so its low 4 bits still equal those of kc.
        TST x0, 15

        # BLOCK 7
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v28.4s, v19.4s, v3.s[1]
        ADD x5, x5, 96              // w += 96 bytes, the B data read in this epilogue
        FMLA v31.4s, v19.4s, v3.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

4:
        # Clamp
        # The SUBS below (nc -= 12) sets the flags used by B.LO and B.HI after
        # the clamp; FMIN, FMAX, ST1 and SUB leave the flags untouched.
        FMIN v20.4s, v20.4s, v4.4s
        SUBS x1, x1, 12
        FMIN v21.4s, v21.4s, v4.4s
        FMIN v22.4s, v22.4s, v4.4s
        FMIN v23.4s, v23.4s, v4.4s
        FMIN v24.4s, v24.4s, v4.4s
        FMIN v25.4s, v25.4s, v4.4s
        FMIN v26.4s, v26.4s, v4.4s
        FMIN v27.4s, v27.4s, v4.4s
        FMIN v28.4s, v28.4s, v4.4s
        FMIN v29.4s, v29.4s, v4.4s
        FMIN v30.4s, v30.4s, v4.4s
        FMIN v31.4s, v31.4s, v4.4s
        FMAX v20.4s, v20.4s, v5.4s
        FMAX v21.4s, v21.4s, v5.4s
        FMAX v22.4s, v22.4s, v5.4s
        FMAX v23.4s, v23.4s, v5.4s
        FMAX v24.4s, v24.4s, v5.4s
        FMAX v25.4s, v25.4s, v5.4s
        FMAX v26.4s, v26.4s, v5.4s
        FMAX v27.4s, v27.4s, v5.4s
        FMAX v28.4s, v28.4s, v5.4s
        FMAX v29.4s, v29.4s, v5.4s
        FMAX v30.4s, v30.4s, v5.4s
        FMAX v31.4s, v31.4s, v5.4s

        # Store full 4 x 12
        # Fewer than 12 columns left: take the partial store path instead.
        B.LO 7f

        # Each ST1 advances its C pointer by cn_stride; each SUB rewinds an A
        # pointer by kc so the next tile rereads the same rows of A.
        # The INC variant stores the rows in reverse order (c3 first).
        $if INC:
          ST1 {v29.16b, v30.16b, v31.16b}, [x7], x14
          SUB x3, x3, x2              // a0 -= kc
          ST1 {v26.16b, v27.16b, v28.16b}, [x10], x14
          SUB x11, x11, x2            // a1 -= kc
          ST1 {v23.16b, v24.16b, v25.16b}, [x9], x14
          SUB x12, x12, x2            // a2 -= kc
          ST1 {v20.16b, v21.16b, v22.16b}, [x6], x14
          SUB x4, x4, x2              // a3 -= kc
        $else:
          ST1 {v20.16b, v21.16b, v22.16b}, [x6], x14
          SUB x3, x3, x2              // a0 -= kc
          ST1 {v23.16b, v24.16b, v25.16b}, [x9], x14
          SUB x11, x11, x2            // a1 -= kc
          ST1 {v26.16b, v27.16b, v28.16b}, [x10], x14
          SUB x12, x12, x2            // a2 -= kc
          ST1 {v29.16b, v30.16b, v31.16b}, [x7], x14
          SUB x4, x4, x2              // a3 -= kc

        # More columns remain (nc > 0 after the subtraction): next tile.
        B.HI 0b

        # Restore d8-d11,d14,d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        # Bit 3 of x0 matches bit 3 of kc (see the note at TST above; the same
        # holds when arriving from the first B.LO, where x0 = kc - 16).
        TBZ x0, 3, 6f

        # Remainder - 2 floats of A (8 bytes)
        # Read first block of 4 A.
        # Unlike the main loop, each row of A gets its own register (v0-v3).
        LDR d0, [x3], 8             // a0
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR d1, [x11], 8            // a1
        LDR d2, [x12], 8            // a2
        LDR d3, [x4], 8             // a3
        LD1 {v9.16b, v10.16b, v11.16b}, [x5], 48

        # First block of 3 B
        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]

        # Second block of 3 B
        FMLA v20.4s, v9.4s, v0.s[1]
        FMLA v23.4s, v9.4s, v1.s[1]
        FMLA v26.4s, v9.4s, v2.s[1]
        FMLA v29.4s, v9.4s, v3.s[1]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v1.s[1]
        FMLA v27.4s, v10.4s, v2.s[1]
        FMLA v30.4s, v10.4s, v3.s[1]
        FMLA v22.4s, v11.4s, v0.s[1]
        FMLA v25.4s, v11.4s, v1.s[1]
        FMLA v28.4s, v11.4s, v2.s[1]
        FMLA v31.4s, v11.4s, v3.s[1]

        # No single float left over (bit 2 clear): go clamp and store.
        TBZ x0, 2, 4b
6:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x3], 4             // a0
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR s1, [x11], 4            // a1
        LDR s2, [x12], 4            // a2
        LDR s3, [x4], 4             // a3

        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]
        B 4b

        # Store odd channels
        # Fewer than 12 columns remain. The count is stored as 8, 4, 2 and 1
        # column pieces selected by bits 3..0 of x1; after each piece the not
        # yet stored columns are moved down into the first register of the row.
7:
        ADD x1, x1, 12              // undo the SUBS at label 4: x1 = columns left
        TBZ x1, 3, 8f
        $if INC:
          STP q29, q30, [x7], 32
          MOV v29.16b, v31.16b
          STP q26, q27, [x10], 32
          MOV v26.16b, v28.16b
          STP q23, q24, [x9], 32
          MOV v23.16b, v25.16b
          STP q20, q21, [x6], 32
          MOV v20.16b, v22.16b
        $else:
          STP q20, q21, [x6], 32
          MOV v20.16b, v22.16b
          STP q23, q24, [x9], 32
          MOV v23.16b, v25.16b
          STP q26, q27, [x10], 32
          MOV v26.16b, v28.16b
          STP q29, q30, [x7], 32
          MOV v29.16b, v31.16b

8:
        TBZ x1, 2, 9f
        $if INC:
          STR q29, [x7], 16
          MOV v29.16b, v30.16b
          STR q26, [x10], 16
          MOV v26.16b, v27.16b
          STR q23, [x9], 16
          MOV v23.16b, v24.16b
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
          STR q23, [x9], 16
          MOV v23.16b, v24.16b
          STR q26, [x10], 16
          MOV v26.16b, v27.16b
          STR q29, [x7], 16
          MOV v29.16b, v30.16b

9:
        TBZ x1, 1, 10f
        $if INC:
          STR d29, [x7], 8
          DUP d29, v29.d[1]
          STR d26, [x10], 8
          DUP d26, v26.d[1]
          STR d23, [x9], 8
          DUP d23, v23.d[1]
          STR d20, [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20, [x6], 8
          DUP d20, v20.d[1]
          STR d23, [x9], 8
          DUP d23, v23.d[1]
          STR d26, [x10], 8
          DUP d26, v26.d[1]
          STR d29, [x7], 8
          DUP d29, v29.d[1]

10:
        TBZ x1, 0, 11f
        $if INC:
          STR s29, [x7]
          STR s26, [x10]
          STR s23, [x9]
          STR s20, [x6]
        $else:
          STR s20, [x6]
          STR s23, [x9]
          STR s26, [x10]
          STR s29, [x7]
11:
        # Restore d8-d11,d14,d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x12__aarch64_neonfma_cortex_a53

// On ELF targets, emit an empty .note.GNU-stack section (the GNU toolchain
// convention for declaring that this object does not need an executable stack).
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif