// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch64-neonfma-ld128.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// void xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128(
//     size_t mr,                                                    x0
//     size_t nc,                                                    x1
//     size_t kc,                                                    x2 / x0
//     const uint8_t*restrict a,                                     x3
//     size_t a_stride,                                              x4
//     const void*restrict w,                                        x5
//     uint8_t*restrict c,                                           x6
//     size_t cm_stride,                                             x7
//     size_t cn_stride,                                             [sp] -> x14
//     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 8] -> x8
//
// Produces up to 4 rows of C, 8 float columns per outer iteration.  Each
// accumulator starts from a value loaded from w, is updated with
// acc += w_value * a_value (FMLA) for every float of A, and is clamped to
// [v4, v5] before being stored.
//
// Units, as used by the code below:
//   kc                            bytes of A per row (consumed 16 / 8 / 4 at a time)
//   nc                            float columns (decremented by 8 per panel)
//   a_stride, cm_stride, cn_stride  bytes (added directly to pointers)
//
// Layout of w for one 8-column panel, as read by the post-incremented loads:
//   8 floats of initial accumulator values, then 8 floats per float of A.
//
// params is read with LD2R: its first float is broadcast to v4 (lower bound,
// applied with FMAX) and its second float to v5 (upper bound, applied with
// FMIN).
//
// NOTE(review): the remainder handling only inspects bits 3 and 2 of kc, and
// the 1-float tail is entered without testing bit 2 when bit 3 is clear.  This
// presumably relies on kc being a non-zero multiple of 4 bytes -- confirm
// against the callers.

// d8-d15 need to be preserved if used.
// x19-30 need to be preserved if used.
// This kernel touches only x0-x12, x14, v0-v5 and v16-v31, and never writes to
// the stack, so nothing is saved or restored.

// A pointers
//   x3   a0
//   x11  a1
//   x12  a2
//   x4   a3 / a_stride

// C pointers
//   x6   c0
//   x9   c1
//   x10  c2
//   x7   c3 / cm_stride

// Vector registers
//   v0-v3               A values for rows 0-3
//   v4, v5              clamp bounds (lower, upper)
//   v16/v17, v18/v19    accumulators for rows 0 and 1 (columns 0-3 / 4-7)
//   v28/v29, v30/v31    accumulators for rows 2 and 3 (columns 0-3 / 4-7)
//   v20-v27             values streamed from w

BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128

        // Load cn_stride and the params pointer from the stack arguments
        LDP     x14, x8, [sp]

        // Broadcast the two clamp bounds: v4 = params[0], v5 = params[1]
        LD2R    {v4.4s, v5.4s}, [x8]

        // Clamp A and C pointers: rows at index >= mr alias the previous row,
        // so they only ever touch memory belonging to a valid row.
        CMP     x0, 2                   // if mr < 2
        ADD     x11, x3, x4             // a1 = a0 + a_stride
        ADD     x9, x6, x7              // c1 = c0 + cm_stride
        CSEL    x11, x3, x11, LO        //   a1 = a0
        CSEL    x9, x6, x9, LO          //   c1 = c0

        ADD     x12, x11, x4            // a2 = a1 + a_stride
        ADD     x10, x9, x7             // c2 = c1 + cm_stride
                                        // if mr <= 2 (flags still from CMP x0, 2;
                                        // ADD does not modify them)
        CSEL    x12, x11, x12, LS       //   a2 = a1
        CSEL    x10, x9, x10, LS        //   c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x12, x4             // a3 = a2 + a_stride (x4 stops being a_stride)
        ADD     x7, x10, x7             // c3 = c2 + cm_stride (x7 stops being cm_stride)
        CSEL    x4, x12, x4, LO         //   a3 = a2
        CSEL    x7, x10, x7, LO         //   c3 = c2

        // Outer loop: one 8-column panel of C per iteration
0:
        // Load the 8 initial accumulator values from w; every row starts
        // from the same values.
        LDP     q16, q17, [x5], 32
        MOV     v18.16b, v16.16b
        MOV     v19.16b, v17.16b
        MOV     v28.16b, v16.16b
        MOV     v29.16b, v17.16b
        MOV     v30.16b, v16.16b
        MOV     v31.16b, v17.16b

        // Is there at least 4 floats (16 bytes) of A?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    5f

        // Main loop - 4 floats of A (16 bytes) per row, 32 floats of w
1:
        LDR     q0, [x3], 16
        LDP     q20, q21, [x5], 32
        LDR     q1, [x11], 16
        LDR     q2, [x12], 16
        LDR     q3, [x4], 16
        FMLA    v16.4s, v20.4s, v0.s[0]
        FMLA    v17.4s, v21.4s, v0.s[0]
        FMLA    v18.4s, v20.4s, v1.s[0]
        FMLA    v19.4s, v21.4s, v1.s[0]
        LDP     q22, q23, [x5], 32
        FMLA    v28.4s, v20.4s, v2.s[0]
        FMLA    v29.4s, v21.4s, v2.s[0]
        FMLA    v30.4s, v20.4s, v3.s[0]
        FMLA    v31.4s, v21.4s, v3.s[0]
        LDP     q24, q25, [x5], 32
        FMLA    v16.4s, v22.4s, v0.s[1]
        FMLA    v17.4s, v23.4s, v0.s[1]
        FMLA    v18.4s, v22.4s, v1.s[1]
        FMLA    v19.4s, v23.4s, v1.s[1]
        LDP     q26, q27, [x5], 32
        FMLA    v28.4s, v22.4s, v2.s[1]
        FMLA    v29.4s, v23.4s, v2.s[1]
        FMLA    v30.4s, v22.4s, v3.s[1]
        FMLA    v31.4s, v23.4s, v3.s[1]
        FMLA    v16.4s, v24.4s, v0.s[2]
        FMLA    v17.4s, v25.4s, v0.s[2]
        FMLA    v18.4s, v24.4s, v1.s[2]
        FMLA    v19.4s, v25.4s, v1.s[2]
        FMLA    v28.4s, v24.4s, v2.s[2]
        FMLA    v29.4s, v25.4s, v2.s[2]
        FMLA    v30.4s, v24.4s, v3.s[2]
        FMLA    v31.4s, v25.4s, v3.s[2]
        FMLA    v16.4s, v26.4s, v0.s[3]
        FMLA    v17.4s, v27.4s, v0.s[3]
        FMLA    v18.4s, v26.4s, v1.s[3]
        FMLA    v19.4s, v27.4s, v1.s[3]
        FMLA    v28.4s, v26.4s, v2.s[3]
        FMLA    v29.4s, v27.4s, v2.s[3]
        SUBS    x0, x0, 16              // k -= 16; sets the flags for B.HS below
        FMLA    v30.4s, v26.4s, v3.s[3]
        FMLA    v31.4s, v27.4s, v3.s[3]
        B.HS    1b                      // loop while another 16 bytes remain

        // x0 has gone negative, but its low 4 bits still equal kc & 15:
        // a non-zero value means 1-3 floats of A are left over.
        TST     x0, 15
        B.NE    5f

4:
        // Clamp.  The nc decrement is interleaved here; FMAX/FMIN leave the
        // condition flags alone, so B.LO below still sees the SUBS result.
        FMAX    v16.4s, v16.4s, v4.4s
        SUBS    x1, x1, 8               // nc -= 8
        FMAX    v17.4s, v17.4s, v4.4s
        FMAX    v18.4s, v18.4s, v4.4s
        FMAX    v19.4s, v19.4s, v4.4s
        FMAX    v28.4s, v28.4s, v4.4s
        FMAX    v29.4s, v29.4s, v4.4s
        FMAX    v30.4s, v30.4s, v4.4s
        FMAX    v31.4s, v31.4s, v4.4s
        FMIN    v16.4s, v16.4s, v5.4s
        FMIN    v17.4s, v17.4s, v5.4s
        FMIN    v18.4s, v18.4s, v5.4s
        FMIN    v19.4s, v19.4s, v5.4s
        FMIN    v28.4s, v28.4s, v5.4s
        FMIN    v29.4s, v29.4s, v5.4s
        FMIN    v30.4s, v30.4s, v5.4s
        FMIN    v31.4s, v31.4s, v5.4s

        // Fewer than 8 columns were left: take the partial-store path
        B.LO    7f

        // Store full 4 x 8.  Each C pointer advances by cn_stride and each
        // A pointer is rewound by kc, ready for the next column panel.
        ST1     {v16.16b, v17.16b}, [x6], x14
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v18.16b, v19.16b}, [x9], x14
        SUB     x11, x11, x2            // a1 -= kc
        ST1     {v28.16b, v29.16b}, [x10], x14
        SUB     x12, x12, x2            // a2 -= kc
        ST1     {v30.16b, v31.16b}, [x7], x14
        SUB     x4, x4, x2              // a3 -= kc

        // Flags are still those of SUBS x1, x1, 8 (ST1 and SUB do not set
        // them): continue while columns remain.
        B.HI    0b
        RET

        // Remainder of A: 2 floats (8 bytes) and/or 1 float (4 bytes)
5:
        // Bit 3 of k clear: no 2-float chunk, go straight to the 1-float tail
        TBZ     x0, 3, 6f

        // Remainder - 2 floats of A (8 bytes)
        LDR     d0, [x3], 8
        LDP     q20, q21, [x5], 32
        LDR     d1, [x11], 8
        LDR     d2, [x12], 8
        LDR     d3, [x4], 8
        FMLA    v16.4s, v20.4s, v0.s[0]
        FMLA    v17.4s, v21.4s, v0.s[0]
        FMLA    v18.4s, v20.4s, v1.s[0]
        FMLA    v19.4s, v21.4s, v1.s[0]
        LDP     q22, q23, [x5], 32
        FMLA    v28.4s, v20.4s, v2.s[0]
        FMLA    v29.4s, v21.4s, v2.s[0]
        FMLA    v30.4s, v20.4s, v3.s[0]
        FMLA    v31.4s, v21.4s, v3.s[0]
        FMLA    v16.4s, v22.4s, v0.s[1]
        FMLA    v17.4s, v23.4s, v0.s[1]
        FMLA    v18.4s, v22.4s, v1.s[1]
        FMLA    v19.4s, v23.4s, v1.s[1]
        FMLA    v28.4s, v22.4s, v2.s[1]
        FMLA    v29.4s, v23.4s, v2.s[1]
        FMLA    v30.4s, v22.4s, v3.s[1]
        FMLA    v31.4s, v23.4s, v3.s[1]

        // Bit 2 of k clear: no single float left, go clamp and store
        TBZ     x0, 2, 4b

        // Remainder - 1 float of A (4 bytes)
6:
        LDR     s0, [x3], 4
        LDP     q20, q21, [x5], 32
        LDR     s1, [x11], 4
        LDR     s2, [x12], 4
        LDR     s3, [x4], 4
        FMLA    v16.4s, v20.4s, v0.s[0]
        FMLA    v17.4s, v21.4s, v0.s[0]
        FMLA    v18.4s, v20.4s, v1.s[0]
        FMLA    v19.4s, v21.4s, v1.s[0]
        FMLA    v28.4s, v20.4s, v2.s[0]
        FMLA    v29.4s, v21.4s, v2.s[0]
        FMLA    v30.4s, v20.4s, v3.s[0]
        FMLA    v31.4s, v21.4s, v3.s[0]
        B       4b

        // Store odd width.  x1 went negative in SUBS x1, x1, 8, but its low
        // 3 bits still equal the number of columns left; store 4, 2 and 1
        // floats according to bits 2, 1 and 0, shifting the not-yet-stored
        // lanes down after each step.
7:
        TBZ     x1, 2, 8f
        STR     q16, [x6], 16
        MOV     v16.16b, v17.16b
        STR     q18, [x9], 16
        MOV     v18.16b, v19.16b
        STR     q28, [x10], 16
        MOV     v28.16b, v29.16b
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b

8:
        TBZ     x1, 1, 9f
        STR     d16, [x6], 8
        DUP     d16, v16.d[1]
        STR     d18, [x9], 8
        DUP     d18, v18.d[1]
        STR     d28, [x10], 8
        DUP     d28, v28.d[1]
        STR     d30, [x7], 8
        DUP     d30, v30.d[1]

9:
        TBZ     x1, 0, 10f
        STR     s16, [x6]
        STR     s18, [x9]
        STR     s28, [x10]
        STR     s30, [x7]

10:
        RET

END_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128

// On ELF targets, emit an empty .note.GNU-stack section (no flags), which GNU
// toolchains read as "this object does not need an executable stack".
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif