// Auto-generated file. Do not edit!
// Template: src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in
// Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_minmax_ukernel_6x8__aarch64_neonfma_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> (x8)
#
# Each pass of the outer loop produces one 6x8 tile of C: the 12 accumulator
# vectors are loaded from acc, updated with FMLA from 6 rows of A and 8 floats
# of w per k step, clamped with the two floats loaded from params (the first
# is applied with FMAX, the second with FMIN), and stored through c0..c5.
#
# Register reuse: x0 holds mr only while the row pointers are set up, then
# serves as the k byte counter, then as cn_stride for the stores.
# x4 and x7 are overwritten with the a5 and c5 row pointers.
# The acc (x15) and w (x5) pointers only ever advance; the A row pointers are
# rewound by kc after each full tile.
#
# NOTE(review): kc is presumably a non-zero multiple of sizeof(float); only
# bit 2 of the residual byte count is tested for the remainder - confirm
# against the caller.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel touches only x0-x17, v0-v7 and v16-v31, so nothing is saved.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3
# x12 a4
# x4  a5

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
# x7  c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f32_gemminc_minmax_ukernel_6x8__aarch64_neonfma_ld64

        # Load acc, params pointer
        LDP     x15, x8, [sp, 8]

        # Clamp A and C pointers
        # Rows at index >= mr alias the previous row, so they read valid A
        # data and rewrite the same C row instead of going out of bounds.
        # Each CMP feeds two CSEL pairs (LO, then LS); ADD leaves flags intact.
        CMP     x0, 2                   // if mr < 2
        ADD     x9, x3, x4              // a1 = a0 + a_stride
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO          //   a1 = a0
        CSEL    x16, x6, x16, LO        //   c1 = c0

        # Load min/max values
        LD2R    {v6.4s, v7.4s}, [x8]    // v6 = params[0] splat, v7 = params[1] splat

        ADD     x10, x9, x4             // a2 = a1 + a_stride
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x10, x9, x10, LS        //   a2 = a1
        CSEL    x17, x16, x17, LS       //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x11, x10, x4            // a3 = a2 + a_stride
        ADD     x14, x17, x7            // c3 = c2 + cm_stride
        CSEL    x11, x10, x11, LO       //   a3 = a2
        CSEL    x14, x17, x14, LO       //   c3 = c2

        ADD     x12, x11, x4            // a4 = a3 + a_stride
        ADD     x13, x14, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x12, x11, x12, LS       //   a4 = a3
        CSEL    x13, x14, x13, LS       //   c4 = c3

        CMP     x0, 6                   // if mr < 6
        ADD     x4, x12, x4             // a5 = a4 + a_stride
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x4, x12, x4, LO         //   a5 = a4
        CSEL    x7, x13, x7, LO         //   c5 = c4

        # Outer loop: one 6x8 output tile per iteration
0:
        # Load initial accumulators
        # 12 vectors = 192 bytes are consumed from acc per tile.
        LDP     q20, q21, [x15], 32
        LDP     q22, q23, [x15], 32
        LDP     q24, q25, [x15], 32
        LDP     q26, q27, [x15], 32
        LDP     q28, q29, [x15], 32
        LDP     q30, q31, [x15], 32
        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
        PRFM    PLDL1KEEP, [x5, 64]
        PRFM    PLDL1KEEP, [x5, 128]
        PRFM    PLDL1KEEP, [x5, 192]
        PRFM    PLDL1KEEP, [x3]         // Prefetch A
        PRFM    PLDL1KEEP, [x9]
        PRFM    PLDL1KEEP, [x10]
        PRFM    PLDL1KEEP, [x11]
        PRFM    PLDL1KEEP, [x12]
        PRFM    PLDL1KEEP, [x4]

        # Are there at least 2 floats (8 bytes) for the main loop?
        SUBS    x0, x2, 8               // k = kc - 8
        B.LO    3f

        # Main loop - 2 floats of A (8 bytes)
        # 24 FMA + 6 LD64 A + 2 LDP B
1:
        LDR     d0, [x3], 8
        LDP     q16, q17, [x5], 32
        LDR     d1, [x9], 8
        LDR     d2, [x10], 8
        LDR     d3, [x11], 8
        LDR     d4, [x12], 8
        LDR     d5, [x4], 8
        FMLA    v20.4s, v16.4s, v0.s[0]
        FMLA    v22.4s, v16.4s, v1.s[0]
        FMLA    v24.4s, v16.4s, v2.s[0]
        FMLA    v26.4s, v16.4s, v3.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v28.4s, v16.4s, v4.s[0]
        FMLA    v30.4s, v16.4s, v5.s[0]
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v1.s[0]
        FMLA    v25.4s, v17.4s, v2.s[0]
        FMLA    v27.4s, v17.4s, v3.s[0]
        FMLA    v29.4s, v17.4s, v4.s[0]
        FMLA    v31.4s, v17.4s, v5.s[0]

        FMLA    v20.4s, v18.4s, v0.s[1]
        FMLA    v22.4s, v18.4s, v1.s[1]
        FMLA    v24.4s, v18.4s, v2.s[1]
        FMLA    v26.4s, v18.4s, v3.s[1]
        FMLA    v28.4s, v18.4s, v4.s[1]
        FMLA    v30.4s, v18.4s, v5.s[1]
        FMLA    v21.4s, v19.4s, v0.s[1]
        FMLA    v23.4s, v19.4s, v1.s[1]
        FMLA    v25.4s, v19.4s, v2.s[1]
        FMLA    v27.4s, v19.4s, v3.s[1]
        SUBS    x0, x0, 8               // k -= 8; flags consumed by B.HS below
        FMLA    v29.4s, v19.4s, v4.s[1]
        FMLA    v31.4s, v19.4s, v5.s[1]
        B.HS    1b

        # Is there a remainder? - 1 float of A (4 bytes)
        # x0 is now (bytes left - 8); bit 2 is set when 4 bytes are left.
        TBNZ    x0, 2, 3f
2:
        # Clamp
        FMAX    v20.4s, v20.4s, v6.4s
        # Load cn_stride
        LDR     x0, [sp, 0]
        FMAX    v21.4s, v21.4s, v6.4s
        FMAX    v22.4s, v22.4s, v6.4s
        FMAX    v23.4s, v23.4s, v6.4s
        FMAX    v24.4s, v24.4s, v6.4s
        FMAX    v25.4s, v25.4s, v6.4s
        FMAX    v26.4s, v26.4s, v6.4s
        FMAX    v27.4s, v27.4s, v6.4s
        FMAX    v28.4s, v28.4s, v6.4s
        FMAX    v29.4s, v29.4s, v6.4s
        FMAX    v30.4s, v30.4s, v6.4s
        FMAX    v31.4s, v31.4s, v6.4s
        SUBS    x1, x1, 8               // nc -= 8; flags used by B.LO and B.HI below
        FMIN    v20.4s, v20.4s, v7.4s
        FMIN    v21.4s, v21.4s, v7.4s
        FMIN    v22.4s, v22.4s, v7.4s
        FMIN    v23.4s, v23.4s, v7.4s
        FMIN    v24.4s, v24.4s, v7.4s
        FMIN    v25.4s, v25.4s, v7.4s
        FMIN    v26.4s, v26.4s, v7.4s
        FMIN    v27.4s, v27.4s, v7.4s
        FMIN    v28.4s, v28.4s, v7.4s
        FMIN    v29.4s, v29.4s, v7.4s
        FMIN    v30.4s, v30.4s, v7.4s
        FMIN    v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO    4f                      // fewer than 8 columns left

        # Rows are stored last-to-first (c5 down to c0); each C pointer is
        # advanced by cn_stride and each A pointer is rewound by kc.
        ST1     {v30.16b, v31.16b}, [x7], x0
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v28.16b, v29.16b}, [x13], x0
        SUB     x9, x9, x2              // a1 -= kc
        ST1     {v26.16b, v27.16b}, [x14], x0
        SUB     x10, x10, x2            // a2 -= kc
        ST1     {v24.16b, v25.16b}, [x17], x0
        SUB     x11, x11, x2            // a3 -= kc
        ST1     {v22.16b, v23.16b}, [x16], x0
        SUB     x12, x12, x2            // a4 -= kc
        ST1     {v20.16b, v21.16b}, [x6], x0
        SUB     x4, x4, x2              // a5 -= kc

        B.HI    0b                      // more columns remain (nc > 0)
        RET

3:
        # Remainder - 1 float of A (4 bytes)
        LDR     s0, [x3], 4
        LDP     q16, q17, [x5], 32
        LDR     s1, [x9], 4
        LDR     s2, [x10], 4
        LDR     s3, [x11], 4
        LDR     s4, [x12], 4
        LDR     s5, [x4], 4
        FMLA    v20.4s, v16.4s, v0.s[0]
        FMLA    v22.4s, v16.4s, v1.s[0]
        FMLA    v24.4s, v16.4s, v2.s[0]
        FMLA    v26.4s, v16.4s, v3.s[0]
        FMLA    v28.4s, v16.4s, v4.s[0]
        FMLA    v30.4s, v16.4s, v5.s[0]
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v1.s[0]
        FMLA    v25.4s, v17.4s, v2.s[0]
        FMLA    v27.4s, v17.4s, v3.s[0]
        FMLA    v29.4s, v17.4s, v4.s[0]
        FMLA    v31.4s, v17.4s, v5.s[0]
        B       2b

        # Store odd width
        # x1 is (columns left - 8) here, so its low three bits equal those of
        # the column count: bit 2 = 4 columns, bit 1 = 2 columns, bit 0 = 1.
        # After each partial store the unstored lanes are moved down to lane 0.
4:
        TBZ     x1, 2, 5f
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x14], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20, [x6], 16
        MOV     v20.16b, v21.16b

5:
        TBZ     x1, 1, 6f
        STR     d30, [x7], 8
        DUP     d30, v30.d[1]
        STR     d28, [x13], 8
        DUP     d28, v28.d[1]
        STR     d26, [x14], 8
        DUP     d26, v26.d[1]
        STR     d24, [x17], 8
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        DUP     d22, v22.d[1]
        STR     d20, [x6], 8
        DUP     d20, v20.d[1]

6:
        TBZ     x1, 0, 7f
        STR     s30, [x7]
        STR     s28, [x13]
        STR     s26, [x14]
        STR     s24, [x17]
        STR     s22, [x16]
        STR     s20, [x6]
7:
        RET

END_FUNCTION xnn_f32_gemminc_minmax_ukernel_6x8__aarch64_neonfma_ld64
// On ELF targets, emit an empty .note.GNU-stack section (no flags).
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif