// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch64-neonfma-ld128.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

// Project header; presumably the source of the BEGIN_FUNCTION / END_FUNCTION
// macros used by this file (they are not defined here).
#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const float*restrict acc, [sp + 8] -> x15
#     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 16] -> x8

// Summary
//   For every group of 8 output columns this kernel loads a 4x8 tile of
//   initial accumulators from acc, adds the product of 4 rows of A with the
//   weights streamed from w, clamps the result to [v4, v5] and stores it to C.
//
// Units and assumptions
//   - kc is a byte count: the A pointers advance in bytes and are rewound by
//     kc after each tile.
//   - The k remainder handling only tests bits 3 and 2 of kc, so kc is assumed
//     to be a non-zero multiple of 4 bytes (one float) -- TODO confirm with
//     the caller's contract.
//   - w is read strictly sequentially, 8 floats (32 bytes) per k step.
//   - acc is read strictly sequentially, 32 floats (128 bytes) per tile.
//   - The first two floats at params are used as the lower (v4) and upper (v5)
//     clamp bounds, in that order.
//
// Register use
//   No stack space is allocated and no callee-saved register is written.
//   Written registers: x0, x1, x3-x12, x14, x15, v0-v5, v16-v31, NZCV.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
#  x3 a0
# x11 a1
# x12 a2
#  x4 a3 / a_stride

# C pointers
#  x6 c0
#  x9 c1
# x10 c2
#  x7 c3 / cm_stride

# Accumulators (two q registers = 8 floats per row)
# v16 v17 row 0
# v18 v19 row 1
# v28 v29 row 2
# v30 v31 row 3

BEGIN_FUNCTION xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128

        # Load cn_stride, acc
        LDP x14, x15, [sp]
        # Load params pointer
        LDR x8, [sp, 16]

        # Load min/max values
        # LD2R broadcasts the two consecutive floats at [x8]: v4 = first, v5 = second.
        LD2R {v4.4s, v5.4s}, [x8]

        # Clamp A and C pointers
        # Rows at or beyond mr reuse the previous row's pointers, so they read
        # and write the same memory as the last valid row.
        CMP x0, 2                // if mr < 2
        ADD x11, x3, x4          // a1 = a0 + a_stride
        ADD x9, x6, x7           // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO    //   a1 = a0
        CSEL x9, x6, x9, LO      //   c1 = c0

        ADD x12, x11, x4         // a2 = a1 + a_stride
        ADD x10, x9, x7          // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x12, x11, x12, LS   //   a2 = a1
        CSEL x10, x9, x10, LS    //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x4, x12, x4          // a3 = a2 + a_stride
        ADD x7, x10, x7          // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO     //   a3 = a2
        CSEL x7, x10, x7, LO     //   c3 = c2

        # Outer loop: one iteration per group of 8 output columns.
0:
        # Load initial accumulators
        LDP q16, q17, [x15], 32
        LDP q18, q19, [x15], 32
        LDP q28, q29, [x15], 32
        LDP q30, q31, [x15], 32

        # Are there at least 4 floats (16 bytes)?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # Main loop - 4 floats of A (16 bytes)
        # Each iteration consumes 4 x 8 floats of w (128 bytes).
1:
        LDR q0, [x3], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3, [x4], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q22, q23, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        LDP q24, q25, [x5], 32
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        LDP q26, q27, [x5], 32
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        SUBS x0, x0, 16          // k -= 16; flags consumed by B.HS below
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]
        B.HS 1b

        # x0 is now in [-16, -1]; its low 4 bits equal kc's low 4 bits.
        TST x0, 15
        B.NE 5f

4:
        # Clamp
        FMAX v16.4s, v16.4s, v4.4s
        # nc -= 8. The flags set here stay live until B.LO 7f and B.HI 0b:
        # none of the instructions in between writes NZCV.
        SUBS x1, x1, 8
        FMAX v17.4s, v17.4s, v4.4s
        FMAX v18.4s, v18.4s, v4.4s
        FMAX v19.4s, v19.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s
        FMIN v18.4s, v18.4s, v5.4s
        FMIN v19.4s, v19.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        B.LO 7f                  // fewer than 8 columns were left

        # Rows are stored from row 3 down to row 0; each C pointer then
        # advances by cn_stride and each A pointer is rewound by kc.
        ST1 {v30.16b, v31.16b},  [x7], x14
        SUB  x3,  x3, x2         // a0 -= kc
        ST1 {v28.16b, v29.16b}, [x10], x14
        SUB x11, x11, x2         // a1 -= kc
        ST1 {v18.16b, v19.16b},  [x9], x14
        SUB x12, x12, x2         // a2 -= kc
        ST1 {v16.16b, v17.16b},  [x6], x14
        SUB  x4,  x4, x2         // a3 -= kc

        B.HI 0b                  // more columns remain (nc was > 8)
        RET

        # Remainder - 2 floats of A (8 bytes)
5:
        # Is there a remainder? - 2 floats of A (8 bytes)
        # Bit 3 clear means the tail is a single float, handled at 6.
        TBZ x0, 3, 6f

        # Remainder - 2 floats of A (8 bytes)
        LDR d0, [x3], 8
        LDP q20, q21, [x5], 32
        LDR d1, [x11], 8
        LDR d2, [x12], 8
        LDR d3, [x4], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q22, q23, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]

        # Is there a remainder? - 1 float of A (4 bytes)
        TBZ x0, 2, 4b

        # Remainder - 1 float of A (4 bytes)
6:
        LDR s0, [x3], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x11], 4
        LDR s2, [x12], 4
        LDR s3, [x4], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        B 4b


        # Store odd width
        # x1 = nc - 8 is negative here; its low 3 bits equal the number of
        # columns left (1..7), tested as 4 + 2 + 1.
7:
        TBZ x1, 2, 8f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b     // shift the remaining columns down
        STR q28, [x10], 16
        MOV v28.16b, v29.16b
        STR q18, [x9], 16
        MOV v18.16b, v19.16b
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

8:
        TBZ x1, 1, 9f
        STR d30, [x7], 8
        DUP d30, v30.d[1]        // move the upper two floats into the low half
        STR d28, [x10], 8
        DUP d28, v28.d[1]
        STR d18, [x9], 8
        DUP d18, v18.d[1]
        STR d16, [x6], 8
        DUP d16, v16.d[1]

9:
        TBZ x1, 0, 10f
        STR s30, [x7]
        STR s28, [x10]
        STR s18, [x9]
        STR s16, [x6]

10:
        RET

END_FUNCTION xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128

// On ELF targets, emit an empty .note.GNU-stack section: the GNU toolchain
// convention for declaring that this object does not need an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif