// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

// BEGIN_FUNCTION / END_FUNCTION are not defined in this file; they are
// expected to come from this header.
#include <xnnpack/assembly.h>

# void xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 8] -> x8
#
# Summary: for every group of 12 output columns the 4x12 accumulator tile
# (v20-v31) is seeded from the 12 floats at w, FMLA-accumulated over kc bytes
# of each A row against the B data that follows in w, clamped with v4 (FMAX)
# and v5 (FMIN) and stored through c0-c3.  The c pointers then advance by
# cn_stride, the a pointers are rewound by kc and nc drops by 12.
#
# kc is counted in bytes (16 bytes = 4 floats).  mr is only used to clamp the
# row pointers; afterwards x0 is reused as the remaining-k counter.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# x8 temporary vector shadow register
#    (holds the params pointer only until the LD2R at function entry)

# Vector register usage and GPR shadows
# a0  v0
# a1  v0[1]
# a2  v1
# a3  v1[1]
# a0  v2
# a1  v2[1]
# a2  v3
# a3  v3[1]
# B   v6  v7  v8
# B   v9 v10 v11
# B  v14 v15 v16
# B  v17 v18 v19
# C  v20 v21 v22
# C  v23 v24 v25
# C  v26 v27 v28
# C  v29 v30 v31
# Clamp v4 v5
# v12 to v13 unused.

BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53

        # Load cn_stride, params pointer
        LDP x14, x8, [sp]

        # Load min/max values
        # v4 is the FMAX operand (lower bound), v5 the FMIN operand (upper bound)
        LD2R {v4.4s, v5.4s}, [x8]

        # Save d8-d11,d14,d15 on stack
        # (v8-v11, v14, v15 are used as B registers; v12/v13 are never touched)
        STP d8, d9, [sp, -48]!
        STP d10, d11, [sp, 16]
        STP d14, d15, [sp, 32]

        # Clamp A and C pointers
        # Rows at or beyond mr alias the previous row.  ADD does not touch the
        # flags, so the LS selects below still test the CMP x0, 2 result.
        CMP x0, 2                 // if mr < 2
        ADD x11, x3, x4           // a1 = a0 + a_stride
        ADD x9, x6, x7            // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO     //   a1 = a0
        CSEL x9, x6, x9, LO       //   c1 = c0
        ADD x12, x11, x4          // a2 = a1 + a_stride
        ADD x10, x9, x7           // c2 = c1 + cm_stride
                                  // if mr <= 2
        CSEL x12, x11, x12, LS    //   a2 = a1
        CSEL x10, x9, x10, LS     //   c2 = c1
        CMP x0, 4                 // if mr < 4
        ADD x4, x12, x4           // a3 = a2 + a_stride
        ADD x7, x10, x7           // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO      //   a3 = a2
        CSEL x7, x10, x7, LO      //   c3 = c2

0:
        # Load initial bias from w into accumulators
        # The same 12 bias floats seed all four rows; prefetches are interleaved.
        LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48
        MOV v23.16b, v20.16b
        PRFM PLDL1KEEP, [x3, 0]   // Prefetch A
        PRFM PLDL1KEEP, [x3, 64]
        MOV v24.16b, v21.16b
        PRFM PLDL1KEEP, [x11, 0]
        PRFM PLDL1KEEP, [x11, 64]
        MOV v25.16b, v22.16b
        PRFM PLDL1KEEP, [x12, 0]
        PRFM PLDL1KEEP, [x12, 64]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x4, 0]
        PRFM PLDL1KEEP, [x4, 64]
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 0]   // Prefetch B
        PRFM PLDL1KEEP, [x5, 64]
        MOV v28.16b, v22.16b
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]
        MOV v29.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 256]
        MOV v30.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 320]
        MOV v31.16b, v22.16b

        # Is there at least 4 floats (16 bytes)?
        SUBS x0, x2, 16           // k = kc - 16
        B.LO 5f

        # Flags from this SUBS are consumed by the B.LO 2f after the prologue;
        # the loads in between do not modify them.
        SUBS x0, x0, 16

        # Prologue - loads for first group of 24 FMA

        # Read first block of 4 A.
        LDR d0, [x3], 8           // a0
        LDR d1, [x12], 8          // a2
        LD1 {v0.d}[1], [x11], 8   // a1
        LD1 {v1.d}[1], [x4], 8    // a3

        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LD1 {v9.16b, v10.16b}, [x5], 32
        LDR d11, [x5], 8
        LDR x8, [x5], 8           // upper half of v11, inserted in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
1:
        # First group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
        # A is loaded for 2nd group into v2/v3
        # INS is 4 blocks (16 cycles) after load

        # BLOCK 0
        LDR d2, [x3], 8           // a0
        INS v11.d[1], x8
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x8, [x11], 8          // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        FMLA v26.4s, v6.4s, v1.s[0]
        PRFM PLDL1KEEP, [x3, 128]   // Prefetch A0

        # BLOCK 1
        LDR d3, [x12], 8          // a2
        INS v2.d[1], x8           // a1 was loaded in block 0
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x4], 8           // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v0.s[2]
        PRFM PLDL1KEEP, [x11, 128]  // Prefetch A1

        # BLOCK 2
        LDR d14, [x5]             // vb0x0123
        INS v3.d[1], x8           // a3 was loaded in block 1
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x8, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        FMLA v22.4s, v8.4s, v0.s[0]
        PRFM PLDL1KEEP, [x12, 128]  // Prefetch A2

        # BLOCK 3
        LDR d15, [x5, 16]         // vb0x4567
        INS v14.d[1], x8          // v14 was loaded in block 2
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x8, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]
        PRFM PLDL1KEEP, [x4, 128]   // Prefetch A3

        # BLOCK 4
        LDR d16, [x5, 32]         // vb0x89AB
        INS v15.d[1], x8
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x8, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]
        PRFM PLDL1KEEP, [x5, 320]   // Prefetch B

        # BLOCK 5
        LDR d17, [x5, 48]         // vb1x0123
        INS v16.d[1], x8
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x8, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]
        PRFM PLDL1KEEP, [x5, 384]   // Prefetch B

        # BLOCK 6
        LDR d18, [x5, 64]         // vb1x4567
        INS v17.d[1], x8
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x8, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 448]   // Prefetch B

        # BLOCK 7
        LDR d19, [x5, 80]         // vb1x89AB
        INS v18.d[1], x8
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x8, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
        # A is loaded for 1st group into v0/v1

        # BLOCK 0
        LDR d0, [x3], 8           // a0
        INS v19.d[1], x8
        FMLA v20.4s, v14.4s, v2.s[0]
        LDR x8, [x11], 8          // a1
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        LDR d1, [x12], 8          // a2
        INS v0.d[1], x8           // a1
        FMLA v29.4s, v14.4s, v3.s[2]
        LDR x8, [x4], 8           // a3
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        LDR d6, [x5, 96]          // vb0x0123
        INS v1.d[1], x8           // a3
        FMLA v27.4s, v15.4s, v3.s[0]
        LDR x8, [x5, 104]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        LDR d7, [x5, 112]         // vb0x4567
        INS v6.d[1], x8
        FMLA v25.4s, v16.4s, v2.s[2]
        LDR x8, [x5, 120]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        LDR d8, [x5, 128]         // vb0x89AB
        INS v7.d[1], x8
        FMLA v20.4s, v17.4s, v2.s[1]
        LDR x8, [x5, 136]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        LDR d9, [x5, 144]         // vb1x0123
        INS v8.d[1], x8
        FMLA v29.4s, v17.4s, v3.s[3]
        LDR x8, [x5, 152]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        LDR d10, [x5, 160]        // vb1x4567
        INS v9.d[1], x8
        FMLA v27.4s, v18.4s, v3.s[1]
        LDR x8, [x5, 168]
        FMLA v30.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 16           // k -= 16; flags feed the B.HS below
        FMLA v22.4s, v19.4s, v2.s[1]

        # BLOCK 7
        LDR d11, [x5, 176]        // vb1x89AB
        INS v10.d[1], x8
        FMLA v25.4s, v19.4s, v2.s[3]
        LDR x8, [x5, 184]         // upper half of v11, inserted in next BLOCK 0
        FMLA v28.4s, v19.4s, v3.s[1]
        ADD x5, x5, 192           // step over the B data read at offsets 0..191
        FMLA v31.4s, v19.4s, v3.s[3]
        B.HS 1b

        # Epilogue
        # First group matches the main loop minus the prefetches.
        # Second group has no loads.
2:
        # BLOCK 0
        LDR d2, [x3], 8           // a0
        INS v11.d[1], x8
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x8, [x11], 8          // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        FMLA v26.4s, v6.4s, v1.s[0]

        # BLOCK 1
        LDR d3, [x12], 8          // a2
        INS v2.d[1], x8           // a1 was loaded in block 0
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x4], 8           // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v0.s[2]

        # BLOCK 2
        LDR d14, [x5]             // vb0x0123
        INS v3.d[1], x8           // a3 was loaded in block 1
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x8, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        FMLA v22.4s, v8.4s, v0.s[0]

        # BLOCK 3
        LDR d15, [x5, 16]         // vb0x4567
        INS v14.d[1], x8          // v14 was loaded in block 2
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x8, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]

        # BLOCK 4
        LDR d16, [x5, 32]         // vb0x89AB
        INS v15.d[1], x8
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x8, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]

        # BLOCK 5
        LDR d17, [x5, 48]         // vb1x0123
        INS v16.d[1], x8
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x8, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]

        # BLOCK 6
        LDR d18, [x5, 64]         // vb1x4567
        INS v17.d[1], x8
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x8, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]         // vb1x89AB
        INS v18.d[1], x8
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x8, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  8 blocks of 3 FMA, no loads.
        # Consumes the A values already held in v2/v3.

        # BLOCK 0
        INS v19.d[1], x8
        FMLA v20.4s, v14.4s, v2.s[0]
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        FMLA v29.4s, v14.4s, v3.s[2]
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        FMLA v27.4s, v15.4s, v3.s[0]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        FMLA v25.4s, v16.4s, v2.s[2]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        FMLA v20.4s, v17.4s, v2.s[1]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        FMLA v29.4s, v17.4s, v3.s[3]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        FMLA v27.4s, v18.4s, v3.s[1]
        FMLA v30.4s, v18.4s, v3.s[3]
        FMLA v22.4s, v19.4s, v2.s[1]
        TST x0, 15                // low 4 bits of k = leftover bytes (< 16)

        # BLOCK 7
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v28.4s, v19.4s, v3.s[1]
        ADD x5, x5, 96            // step over the B data read at offsets 0..95
        FMLA v31.4s, v19.4s, v3.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

4:
        # Clamp
        # SUBS sets the flags used by B.LO 7f and B.HI 0b further down; the
        # vector FMAX/FMIN, ST1 and SUB in between leave them untouched.
        FMAX v20.4s, v20.4s, v4.4s
        SUBS x1, x1, 12           // nc -= 12
        FMAX v21.4s, v21.4s, v4.4s
        FMAX v22.4s, v22.4s, v4.4s
        FMAX v23.4s, v23.4s, v4.4s
        FMAX v24.4s, v24.4s, v4.4s
        FMAX v25.4s, v25.4s, v4.4s
        FMAX v26.4s, v26.4s, v4.4s
        FMAX v27.4s, v27.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v20.4s, v20.4s, v5.4s
        FMIN v21.4s, v21.4s, v5.4s
        FMIN v22.4s, v22.4s, v5.4s
        FMIN v23.4s, v23.4s, v5.4s
        FMIN v24.4s, v24.4s, v5.4s
        FMIN v25.4s, v25.4s, v5.4s
        FMIN v26.4s, v26.4s, v5.4s
        FMIN v27.4s, v27.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 12
        B.LO 7f                   // fewer than 12 columns left: partial store

        ST1 {v20.16b, v21.16b, v22.16b}, [x6], x14
        SUB x3, x3, x2            // a0 -= kc
        ST1 {v23.16b, v24.16b, v25.16b}, [x9], x14
        SUB x11, x11, x2          // a1 -= kc
        ST1 {v26.16b, v27.16b, v28.16b}, [x10], x14
        SUB x12, x12, x2          // a2 -= kc
        ST1 {v29.16b, v30.16b, v31.16b}, [x7], x14
        SUB x4, x4, x2            // a3 -= kc

        B.HI 0b                   // more columns remain

        # Restore d8-d11,d14,d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        # Only multiples of 16 have been subtracted from x0, so bits 3 and 2
        # are still those of kc.
        TBZ x0, 3, 6f

        # Remainder - 2 floats of A (8 bytes)
        # Read first block of 4 A.
        LDR d0, [x3], 8           // a0
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR d1, [x11], 8          // a1
        LDR d2, [x12], 8          // a2
        LDR d3, [x4], 8           // a3
        LD1 {v9.16b, v10.16b, v11.16b}, [x5], 48

        # First block of 3 B
        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]

        # Second block of 3 B
        FMLA v20.4s, v9.4s, v0.s[1]
        FMLA v23.4s, v9.4s, v1.s[1]
        FMLA v26.4s, v9.4s, v2.s[1]
        FMLA v29.4s, v9.4s, v3.s[1]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v1.s[1]
        FMLA v27.4s, v10.4s, v2.s[1]
        FMLA v30.4s, v10.4s, v3.s[1]
        FMLA v22.4s, v11.4s, v0.s[1]
        FMLA v25.4s, v11.4s, v1.s[1]
        FMLA v28.4s, v11.4s, v2.s[1]
        FMLA v31.4s, v11.4s, v3.s[1]

        TBZ x0, 2, 4b             // no single trailing float: go clamp
6:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x3], 4           // a0
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR s1, [x11], 4          // a1
        LDR s2, [x12], 4          // a2
        LDR s3, [x4], 4           // a3

        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]
        B 4b

7:
        ADD x1, x1, 12            // undo the SUBS at 4: x1 = columns left (< 12)
        # Store odd channels
        # Bits 3..0 of x1 select stores of 8, 4, 2 and 1 floats per row; after
        # each store the next unsaved values are moved down into v20/v23/v26/v29.
        TBZ x1, 3, 8f
        STP q20, q21, [x6], 32
        MOV v20.16b, v22.16b
        STP q23, q24, [x9], 32
        MOV v23.16b, v25.16b
        STP q26, q27, [x10], 32
        MOV v26.16b, v28.16b
        STP q29, q30, [x7], 32
        MOV v29.16b, v31.16b

8:
        TBZ x1, 2, 9f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
        STR q23, [x9], 16
        MOV v23.16b, v24.16b
        STR q26, [x10], 16
        MOV v26.16b, v27.16b
        STR q29, [x7], 16
        MOV v29.16b, v30.16b

9:
        TBZ x1, 1, 10f
        STR d20, [x6], 8
        DUP d20, v20.d[1]
        STR d23, [x9], 8
        DUP d23, v23.d[1]
        STR d26, [x10], 8
        DUP d26, v26.d[1]
        STR d29, [x7], 8
        DUP d29, v29.d[1]

10:
        TBZ x1, 0, 11f
        STR s20, [x6]
        STR s23, [x9]
        STR s26, [x10]
        STR s29, [x7]
11:
        # Restore d8-d11,d14,d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

END_FUNCTION xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53
// On ELF targets, emit an empty .note.GNU-stack section (the GNU toolchain
// convention for marking an object as not needing an executable stack).
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif