// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_ld128(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> (x8)
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# Micro-kernel body. Each pass of the outer loop (label 0) produces one output
# tile of up to 4 rows by 8 columns of f32 values:
#   - accumulators: row 0 = v16/v17, row 1 = v18/v19, row 2 = v28/v29,
#     row 3 = v30/v31 (two q registers, i.e. 8 floats, per row);
#   - the K loop (label 1) consumes 4 floats from each A row and 32 floats
#     from w per iteration; labels 3 and 4 handle a 2-float and a 1-float tail;
#   - results are clamped to [v4, v5] (min, max read from params) and stored.
# Register reuse after the setup code: x0 (mr) becomes the remaining-k byte
# counter, x4 (a_stride) becomes a3 and x7 (cm_stride) becomes c3.
# Only v0-v5 and v16-v31 are written, so no callee-saved SIMD registers
# (d8-d15) need to be spilled.
BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_ld128

        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load min/max values: v4 = broadcast of params[0], v5 = broadcast of params[1]
        LD2R {v4.4s, v5.4s}, [x8]

        # Clamp A and C pointers
        # Rows at or beyond mr alias the previous row, so the kernel can always
        # process 4 rows without touching memory outside the valid rows.
        CMP x0, 2                // if mr < 2
        ADD x11, x3, x4          // a1 = a0 + a_stride
        ADD x9, x6, x7           // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO    //   a1 = a0
        CSEL x9, x6, x9, LO      //   c1 = c0

        ADD x12, x11, x4         // a2 = a1 + a_stride
        ADD x10, x9, x7          // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x12, x11, x12, LS   //   a2 = a1
        CSEL x10, x9, x10, LS    //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x4, x12, x4          // a3 = a2 + a_stride
        ADD x7, x10, x7          // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO     //   a3 = a2
        CSEL x7, x10, x7, LO     //   c3 = c2

        # Outer loop: one iteration per tile of 8 output columns
0:
        $if INC:
          # Load initial accumulators
          LDP q16, q17, [x15], 32
          LDP q18, q19, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          # The same 8 bias values seed all 4 rows.
          LDP q16, q17, [x5], 32
          MOV v18.16b, v16.16b
          MOV v19.16b, v17.16b
          MOV v28.16b, v16.16b
          MOV v29.16b, v17.16b
          MOV v30.16b, v16.16b
          MOV v31.16b, v17.16b

        # Are there at least 4 floats (16 bytes)?
        SUBS x0, x2, 16          // k = kc - 16
        B.LO 3f

        # Main loop - 4 floats of A (16 bytes)
        # Loads of w (x5) are interleaved with the FMLAs that use earlier loads.
1:
        LDR q0, [x3], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3, [x4], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q22, q23, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        LDP q24, q25, [x5], 32
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        LDP q26, q27, [x5], 32
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        SUBS x0, x0, 16          // k -= 16; flags consumed by B.HS below
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]
        B.HS 1b

        # x0 has gone below zero here; its low 4 bits still equal the number
        # of leftover bytes of k (0, 4, 8 or 12 for whole floats).
        TST x0, 15
        B.NE 3f

2:
        # Clamp
        FMAX v16.4s, v16.4s, v4.4s
        SUBS x1, x1, 8           // nc -= 8; flags consumed by B.LO / B.HI below
        FMAX v17.4s, v17.4s, v4.4s
        FMAX v18.4s, v18.4s, v4.4s
        FMAX v19.4s, v19.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s
        FMIN v18.4s, v18.4s, v5.4s
        FMIN v19.4s, v19.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        # Fewer than 8 columns left: take the partial-width store path.
        B.LO 5f

        # Each store advances its C pointer by cn_stride (x14); each A pointer
        # is rewound by kc (x2) so the next column tile rereads the same rows.
        # The w pointer (x5) is not rewound.
        $if INC:
          # NOTE(review): rows are stored in reverse order (c3 first) in this
          # variant, presumably so that when clamped rows alias a lower row the
          # lower row is written last - confirm.
          ST1 {v30.16b, v31.16b}, [x7], x14
          SUB x3, x3, x2           // a0 -= kc
          ST1 {v28.16b, v29.16b}, [x10], x14
          SUB x11, x11, x2         // a1 -= kc
          ST1 {v18.16b, v19.16b}, [x9], x14
          SUB x12, x12, x2         // a2 -= kc
          ST1 {v16.16b, v17.16b}, [x6], x14
          SUB x4, x4, x2           // a3 -= kc
        $else:
          ST1 {v16.16b, v17.16b}, [x6], x14
          SUB x3, x3, x2           // a0 -= kc
          ST1 {v18.16b, v19.16b}, [x9], x14
          SUB x11, x11, x2         // a1 -= kc
          ST1 {v28.16b, v29.16b}, [x10], x14
          SUB x12, x12, x2         // a2 -= kc
          ST1 {v30.16b, v31.16b}, [x7], x14
          SUB x4, x4, x2           // a3 -= kc

        # Flags are still those of SUBS x1, x1, 8: loop while columns remain.
        B.HI 0b
        RET

        # Remainder - 2 floats of A (8 bytes)
3:
        # Is there a remainder of 2 floats of A (8 bytes)?
        TBZ x0, 3, 4f

        # Remainder - 2 floats of A (8 bytes)
        LDR d0, [x3], 8
        LDP q20, q21, [x5], 32
        LDR d1, [x11], 8
        LDR d2, [x12], 8
        LDR d3, [x4], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q22, q23, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]

        # Is there a remainder of 1 float of A (4 bytes)?
        TBZ x0, 2, 2b

        # Remainder - 1 float of A (4 bytes)
4:
        LDR s0, [x3], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x11], 4
        LDR s2, [x12], 4
        LDR s3, [x4], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        B 2b


        # Store odd width
        # x1 is nc - 8 here, so its low 3 bits equal those of the remaining
        # column count: bit 2 = 4 columns, bit 1 = 2 columns, bit 0 = 1 column.
        # After each partial store the not-yet-stored lanes are moved down to
        # the low lanes of the even-numbered accumulator of each row.
5:
        TBZ x1, 2, 6f
        $if INC:
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x10], 16
          MOV v28.16b, v29.16b
          STR q18, [x9], 16
          MOV v18.16b, v19.16b
          STR q16, [x6], 16
          MOV v16.16b, v17.16b
        $else:
          STR q16, [x6], 16
          MOV v16.16b, v17.16b
          STR q18, [x9], 16
          MOV v18.16b, v19.16b
          STR q28, [x10], 16
          MOV v28.16b, v29.16b
          STR q30, [x7], 16
          MOV v30.16b, v31.16b

6:
        TBZ x1, 1, 7f
        $if INC:
          STR d30, [x7], 8
          STR d28, [x10], 8
          DUP d30, v30.d[1]
          DUP d28, v28.d[1]
          STR d18, [x9], 8
          STR d16, [x6], 8
          DUP d18, v18.d[1]
          DUP d16, v16.d[1]
        $else:
          STR d16, [x6], 8
          STR d18, [x9], 8
          DUP d16, v16.d[1]
          DUP d18, v18.d[1]
          STR d28, [x10], 8
          STR d30, [x7], 8
          DUP d28, v28.d[1]
          DUP d30, v30.d[1]

7:
        TBZ x1, 0, 8f
        $if INC:
          STR s30, [x7]
          STR s28, [x10]
          STR s18, [x9]
          STR s16, [x6]
        $else:
          STR s16, [x6]
          STR s18, [x9]
          STR s28, [x10]
          STR s30, [x7]

8:
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_ld128
XNNPACK Teamb455b122019-09-27 18:10:33 -0700295
// On ELF targets, emit an empty .note.GNU-stack section so the linker does
// not mark the stack executable because of this object file.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif