// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# x8 temporary vector shadow register
# (64-bit halves are loaded into x8 and then inserted into the upper half
#  of a vector register with INS)

# Vector register usage and GPR shadows
# (vN[1] denotes the upper 64-bit half of vN, i.e. vN.d[1])
# a0  v0        A values for the first group of FMA
# a1  v0[1]
# a2  v1
# a3  v1[1]
# a0  v2        A values for the second group of FMA
# a1  v2[1]
# a2  v3
# a3  v3[1]
# B   v6  v7  v8
# B   v9  v10 v11
# B   v14 v15 v16
# B   v17 v18 v19
# C   v20 v21 v22
# C   v23 v24 v25
# C   v26 v27 v28
# C   v29 v30 v31
# Clamp v4 v5
# v12 to v13 unused.
60
BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x12__aarch64_neonfma_cortex_a53

        # Overview of the code below:
        #   - Each pass of the outer loop (label 0) produces one tile of up to
        #     4 rows x 12 columns of C, accumulated in v20-v31.
        #   - Accumulators start from the bias at the head of w (GEMM) or from
        #     acc (GEMMINC).
        #   - The main loop (label 1) consumes 16 bytes of every A row and 192
        #     bytes of w per iteration; label 2 is the final, load-free-tail
        #     iteration; labels 5 and 6 handle 8-byte and 4-byte k remainders.
        #   - Results are bounded with FMIN against v4 and FMAX against v5, then
        #     stored as a full tile or as an 8/4/2/1-column tail (labels 7-10).
        # The instruction order inside the main loop and epilogue is scheduled
        # in 4-cycle blocks (one 64-bit load pair + INS + 3 FMLA); keep it as is.

        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load clamping_params values
        # Two consecutive floats are broadcast: first into v4 (FMIN operand),
        # second into v5 (FMAX operand).
        LD2R {v4.4s, v5.4s}, [x8]

        # Save d8-d11,d14,d15 on stack
        STP d8, d9, [sp, -48]!
        STP d10, d11, [sp, 16]
        STP d14, d15, [sp, 32]

        # Clamp A and C pointers
        # Rows at or beyond mr alias the previous row so that loads and stores
        # stay inside the caller buffers.
        CMP x0, 2                // if mr < 2
        ADD x11, x3, x4          // a1 = a0 + a_stride
        ADD x9, x6, x7           // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO    //   a1 = a0
        CSEL x9, x6, x9, LO      //   c1 = c0
        ADD x12, x11, x4         // a2 = a1 + a_stride
        ADD x10, x9, x7          // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x12, x11, x12, LS   //   a2 = a1
        CSEL x10, x9, x10, LS    //   c2 = c1
        CMP x0, 4                // if mr < 4
        ADD x4, x12, x4          // a3 = a2 + a_stride
        ADD x7, x10, x7          // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO     //   a3 = a2
        CSEL x7, x10, x7, LO     //   c3 = c2

0:
        $if INC:
          # Load initial accumulators
          LD1 {v20.16b, v21.16b, v22.16b}, [x15], 48
          LD1 {v23.16b, v24.16b, v25.16b}, [x15], 48
          LD1 {v26.16b, v27.16b, v28.16b}, [x15], 48
          LD1 {v29.16b, v30.16b, v31.16b}, [x15], 48
          PRFM PLDL1KEEP, [x3, 0]    // Prefetch A
          PRFM PLDL1KEEP, [x3, 64]
          PRFM PLDL1KEEP, [x11, 0]
          PRFM PLDL1KEEP, [x11, 64]
          PRFM PLDL1KEEP, [x12, 0]
          PRFM PLDL1KEEP, [x12, 64]
          PRFM PLDL1KEEP, [x4, 0]
          PRFM PLDL1KEEP, [x4, 64]
          PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          PRFM PLDL1KEEP, [x5, 256]
          PRFM PLDL1KEEP, [x5, 320]
        $else:
          # Load initial bias from w into accumulators
          # The 12 bias floats are loaded once for row 0 and copied to rows 1-3;
          # the copies are interleaved with prefetches.
          LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48
          MOV v23.16b, v20.16b
          PRFM PLDL1KEEP, [x3, 0]    // Prefetch A
          PRFM PLDL1KEEP, [x3, 64]
          MOV v24.16b, v21.16b
          PRFM PLDL1KEEP, [x11, 0]
          PRFM PLDL1KEEP, [x11, 64]
          MOV v25.16b, v22.16b
          PRFM PLDL1KEEP, [x12, 0]
          PRFM PLDL1KEEP, [x12, 64]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP, [x4, 0]
          PRFM PLDL1KEEP, [x4, 64]
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          MOV v28.16b, v22.16b
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          MOV v29.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 256]
          MOV v30.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 320]
          MOV v31.16b, v22.16b

        # Are there at least 4 floats (16 bytes)?
        SUBS x0, x2, 16          // k = kc - 16
        B.LO 3f

        # Flags set here are consumed by the B.LO 2f after the prologue loads
        # (the loads in between do not modify the flags).
        SUBS x0, x0, 16

        # Prologue - loads for first group of 24 FMA

        # Read first block of 4 A.
        LDR d0, [x3], 8              // a0
        LDR d1, [x12], 8             // a2
        LD1 {v0.d}[1], [x11], 8      // a1
        LD1 {v1.d}[1], [x4], 8       // a3

        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LD1 {v9.16b, v10.16b}, [x5], 32
        LDR d11, [x5], 8
        LDR x8, [x5], 8              // upper half of v11, inserted in BLOCK 0

        # Are there at least 4 more floats (16 bytes) for the main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
1:
        # First group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
        # A is loaded for 2nd group into v2/v3
        # INS is 4 blocks (16 cycles) after load

        # BLOCK 0
        LDR d2, [x3], 8              // a0
        INS v11.d[1], x8
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x8, [x11], 8             // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        FMLA v26.4s, v6.4s, v1.s[0]
        PRFM PLDL1KEEP, [x3, 128]    // Prefetch A0

        # BLOCK 1
        LDR d3, [x12], 8             // a2
        INS v2.d[1], x8              // a1 was loaded in block 0
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x4], 8              // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v0.s[2]
        PRFM PLDL1KEEP, [x11, 128]   // Prefetch A1

        # BLOCK 2
        LDR d14, [x5]                // vb0x0123
        INS v3.d[1], x8              // a3 was loaded in block 1
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x8, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        FMLA v22.4s, v8.4s, v0.s[0]
        PRFM PLDL1KEEP, [x12, 128]   // Prefetch A2

        # BLOCK 3
        LDR d15, [x5, 16]            // vb0x4567
        INS v14.d[1], x8             // v14 was loaded in block 2
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x8, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]
        PRFM PLDL1KEEP, [x4, 128]    // Prefetch A3

        # BLOCK 4
        LDR d16, [x5, 32]            // vb0x89AB
        INS v15.d[1], x8
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x8, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]
        PRFM PLDL1KEEP, [x5, 320]    // Prefetch B

        # BLOCK 5
        LDR d17, [x5, 48]            // vb1x0123
        INS v16.d[1], x8
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x8, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]
        PRFM PLDL1KEEP, [x5, 384]    // Prefetch B

        # BLOCK 6
        LDR d18, [x5, 64]            // vb1x4567
        INS v17.d[1], x8
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x8, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 448]    // Prefetch B

        # BLOCK 7
        LDR d19, [x5, 80]            // vb1x89AB
        INS v18.d[1], x8
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x8, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
        # A is loaded for 1st group into v0/v1

        # BLOCK 0
        LDR d0, [x3], 8              // a0
        INS v19.d[1], x8
        FMLA v20.4s, v14.4s, v2.s[0]
        LDR x8, [x11], 8             // a1
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        LDR d1, [x12], 8             // a2
        INS v0.d[1], x8              // a1
        FMLA v29.4s, v14.4s, v3.s[2]
        LDR x8, [x4], 8              // a3
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        LDR d6, [x5, 96]             // vb0x0123
        INS v1.d[1], x8              // a3
        FMLA v27.4s, v15.4s, v3.s[0]
        LDR x8, [x5, 104]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        LDR d7, [x5, 112]            // vb0x4567
        INS v6.d[1], x8
        FMLA v25.4s, v16.4s, v2.s[2]
        LDR x8, [x5, 120]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        LDR d8, [x5, 128]            // vb0x89AB
        INS v7.d[1], x8
        FMLA v20.4s, v17.4s, v2.s[1]
        LDR x8, [x5, 136]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        LDR d9, [x5, 144]            // vb1x0123
        INS v8.d[1], x8
        FMLA v29.4s, v17.4s, v3.s[3]
        LDR x8, [x5, 152]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        LDR d10, [x5, 160]           // vb1x4567
        INS v9.d[1], x8
        FMLA v27.4s, v18.4s, v3.s[1]
        LDR x8, [x5, 168]
        FMLA v30.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 16              // k -= 16; flags consumed by B.HS below
        FMLA v22.4s, v19.4s, v2.s[1]

        # BLOCK 7
        LDR d11, [x5, 176]           // vb1x89AB
        INS v10.d[1], x8
        FMLA v25.4s, v19.4s, v2.s[3]
        LDR x8, [x5, 184]
        FMLA v28.4s, v19.4s, v3.s[1]
        ADD x5, x5, 192              // advance w past the 192 bytes consumed
        FMLA v31.4s, v19.4s, v3.s[3]
        B.HS 1b

        # Epilogue
        # First block same as main loop.  Second block has no loads.
2:
        # BLOCK 0
        LDR d2, [x3], 8              // a0
        INS v11.d[1], x8
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x8, [x11], 8             // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        FMLA v26.4s, v6.4s, v1.s[0]

        # BLOCK 1
        LDR d3, [x12], 8             // a2
        INS v2.d[1], x8              // a1 was loaded in block 0
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x4], 8              // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v0.s[2]

        # BLOCK 2
        LDR d14, [x5]                // vb0x0123
        INS v3.d[1], x8              // a3 was loaded in block 1
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x8, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        FMLA v22.4s, v8.4s, v0.s[0]

        # BLOCK 3
        LDR d15, [x5, 16]            // vb0x4567
        INS v14.d[1], x8             // v14 was loaded in block 2
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x8, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]

        # BLOCK 4
        LDR d16, [x5, 32]            // vb0x89AB
        INS v15.d[1], x8
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x8, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]

        # BLOCK 5
        LDR d17, [x5, 48]            // vb1x0123
        INS v16.d[1], x8
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x8, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]

        # BLOCK 6
        LDR d18, [x5, 64]            // vb1x4567
        INS v17.d[1], x8
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x8, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]            // vb1x89AB
        INS v18.d[1], x8
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x8, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  8 blocks of 3 FMA.
        # No loads here: A was already placed in v2/v3 and B in v14-v19 by the
        # first group above.

        # BLOCK 0
        INS v19.d[1], x8
        FMLA v20.4s, v14.4s, v2.s[0]
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        FMLA v29.4s, v14.4s, v3.s[2]
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        FMLA v27.4s, v15.4s, v3.s[0]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        FMLA v25.4s, v16.4s, v2.s[2]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        FMLA v20.4s, v17.4s, v2.s[1]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        FMLA v29.4s, v17.4s, v3.s[3]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        FMLA v27.4s, v18.4s, v3.s[1]
        FMLA v30.4s, v18.4s, v3.s[3]
        FMLA v22.4s, v19.4s, v2.s[1]

        # BLOCK 7
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v28.4s, v19.4s, v3.s[1]
        ADD x5, x5, 96               // advance w past the 96 bytes read at label 2
        FMLA v31.4s, v19.4s, v3.s[3]

3:
        # The low bits of x0 still equal the low bits of kc, because only
        # multiples of 16 were subtracted from it.
        # Is there a remainder? - 2 floats of A (8 bytes)
        TBNZ x0, 3, 5f
        # Is there a remainder? - 1 float of A (4 bytes)
        TBNZ x0, 2, 6f

4:
        # Clamp
        FMIN v20.4s, v20.4s, v4.4s
        SUBS x1, x1, 12              // nc -= 12; flags used by B.LO 7f and B.HI 0b
        FMIN v21.4s, v21.4s, v4.4s
        FMIN v22.4s, v22.4s, v4.4s
        FMIN v23.4s, v23.4s, v4.4s
        FMIN v24.4s, v24.4s, v4.4s
        FMIN v25.4s, v25.4s, v4.4s
        FMIN v26.4s, v26.4s, v4.4s
        FMIN v27.4s, v27.4s, v4.4s
        FMIN v28.4s, v28.4s, v4.4s
        FMIN v29.4s, v29.4s, v4.4s
        FMIN v30.4s, v30.4s, v4.4s
        FMIN v31.4s, v31.4s, v4.4s
        FMAX v20.4s, v20.4s, v5.4s
        FMAX v21.4s, v21.4s, v5.4s
        FMAX v22.4s, v22.4s, v5.4s
        FMAX v23.4s, v23.4s, v5.4s
        FMAX v24.4s, v24.4s, v5.4s
        FMAX v25.4s, v25.4s, v5.4s
        FMAX v26.4s, v26.4s, v5.4s
        FMAX v27.4s, v27.4s, v5.4s
        FMAX v28.4s, v28.4s, v5.4s
        FMAX v29.4s, v29.4s, v5.4s
        FMAX v30.4s, v30.4s, v5.4s
        FMAX v31.4s, v31.4s, v5.4s

        # Store full 4 x 12
        # Fewer than 12 columns left: take the partial-store path instead.
        B.LO 7f

        # Each store advances its C pointer by cn_stride; each A pointer is
        # rewound by kc so the next tile re-reads the same rows.
        $if INC:
          ST1 {v29.16b, v30.16b, v31.16b}, [x7], x14
          SUB x3, x3, x2             // a0 -= kc
          ST1 {v26.16b, v27.16b, v28.16b}, [x10], x14
          SUB x11, x11, x2           // a1 -= kc
          ST1 {v23.16b, v24.16b, v25.16b}, [x9], x14
          SUB x12, x12, x2           // a2 -= kc
          ST1 {v20.16b, v21.16b, v22.16b}, [x6], x14
          SUB x4, x4, x2             // a3 -= kc
        $else:
          ST1 {v20.16b, v21.16b, v22.16b}, [x6], x14
          SUB x3, x3, x2             // a0 -= kc
          ST1 {v23.16b, v24.16b, v25.16b}, [x9], x14
          SUB x11, x11, x2           // a1 -= kc
          ST1 {v26.16b, v27.16b, v28.16b}, [x10], x14
          SUB x12, x12, x2           // a2 -= kc
          ST1 {v29.16b, v30.16b, v31.16b}, [x7], x14
          SUB x4, x4, x2             // a3 -= kc

        # Loop while columns remain (flags still from SUBS x1, x1, 12).
        B.HI 0b

        # Restore d8-d11,d14,d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

5:
        # Remainder - 2 floats of A (8 bytes)
        # Read first block of 4 A.
        LDR d0, [x3], 8              // a0
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR d1, [x11], 8             // a1
        LDR d2, [x12], 8             // a2
        LDR d3, [x4], 8              // a3
        LD1 {v9.16b, v10.16b, v11.16b}, [x5], 48

        # First block of 3 B
        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]

        # Second block of 3 B
        FMLA v20.4s, v9.4s, v0.s[1]
        FMLA v23.4s, v9.4s, v1.s[1]
        FMLA v26.4s, v9.4s, v2.s[1]
        FMLA v29.4s, v9.4s, v3.s[1]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v1.s[1]
        FMLA v27.4s, v10.4s, v2.s[1]
        FMLA v30.4s, v10.4s, v3.s[1]
        FMLA v22.4s, v11.4s, v0.s[1]
        FMLA v25.4s, v11.4s, v1.s[1]
        FMLA v28.4s, v11.4s, v2.s[1]
        FMLA v31.4s, v11.4s, v3.s[1]

        # No 4-byte remainder: go clamp and store; otherwise fall through.
        TBZ x0, 2, 4b
6:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x3], 4              // a0
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR s1, [x11], 4             // a1
        LDR s2, [x12], 4             // a2
        LDR s3, [x4], 4              // a3

        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]
        B 4b

7:
        # Partial store: fewer than 12 columns remain.
        # Undo the SUBS so x1 is the remaining column count again, then store
        # 8, 4, 2 and 1 columns according to bits 3..0 of x1.  After each
        # partial store the not yet stored values are moved down into the
        # lowest register of the row.
        ADD x1, x1, 12
        # Store 8 columns
        TBZ x1, 3, 8f
        $if INC:
          STP q29, q30, [x7], 32
          MOV v29.16b, v31.16b
          STP q26, q27, [x10], 32
          MOV v26.16b, v28.16b
          STP q23, q24, [x9], 32
          MOV v23.16b, v25.16b
          STP q20, q21, [x6], 32
          MOV v20.16b, v22.16b
        $else:
          STP q20, q21, [x6], 32
          MOV v20.16b, v22.16b
          STP q23, q24, [x9], 32
          MOV v23.16b, v25.16b
          STP q26, q27, [x10], 32
          MOV v26.16b, v28.16b
          STP q29, q30, [x7], 32
          MOV v29.16b, v31.16b

8:
        # Store 4 columns
        TBZ x1, 2, 9f
        $if INC:
          STR q29, [x7], 16
          MOV v29.16b, v30.16b
          STR q26, [x10], 16
          MOV v26.16b, v27.16b
          STR q23, [x9], 16
          MOV v23.16b, v24.16b
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
          STR q23, [x9], 16
          MOV v23.16b, v24.16b
          STR q26, [x10], 16
          MOV v26.16b, v27.16b
          STR q29, [x7], 16
          MOV v29.16b, v30.16b

9:
        # Store 2 columns
        TBZ x1, 1, 10f
        $if INC:
          STR d29, [x7], 8
          DUP d29, v29.d[1]
          STR d26, [x10], 8
          DUP d26, v26.d[1]
          STR d23, [x9], 8
          DUP d23, v23.d[1]
          STR d20, [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20, [x6], 8
          DUP d20, v20.d[1]
          STR d23, [x9], 8
          DUP d23, v23.d[1]
          STR d26, [x10], 8
          DUP d26, v26.d[1]
          STR d29, [x7], 8
          DUP d29, v29.d[1]

10:
        # Store 1 column
        TBZ x1, 0, 11f
        $if INC:
          STR s29, [x7]
          STR s26, [x10]
          STR s23, [x9]
          STR s20, [x6]
        $else:
          STR s20, [x6]
          STR s23, [x9]
          STR s26, [x10]
          STR s29, [x7]
11:
        # Restore d8-d11,d14,d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x12__aarch64_neonfma_cortex_a53
636
// On ELF targets emit an empty .note.GNU-stack section (no flags), the GNU
// toolchain convention for an object that does not need an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif