blob: f5de0149c8b1c449cc45badb20a1202706f2b807 [file] [log] [blame]
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_1x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 8] -> x8

# Computes a 1 x 8 tile of C per pass of the outer loop (label 0):
#   C[0][0..7] = clamp(init + sum over k of A[0][k] * B[k][0..7])
# where init is read from acc (INC variant) or from the first 8 floats at w.
# kc is consumed as a byte count of A (4 bytes per float); nc counts output
# columns.  B is read sequentially from w, 8 floats (32 bytes) per k step.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# This kernel touches only x0-x8, x14, x15, v0, v1, v4, v5 and v16-v27, so
# nothing is saved or restored and the stack is only read for arguments.

# A pointer
# x3 a0

# C pointer
# x6 c0

# K counter
# x0 k = kc - 32, decremented by 32 per main loop pass; bits 4..2 keep the
#    remainder of kc (16, 8 and 4 bytes) because only multiples of 32 are
#    subtracted.

# Vector registers
# A    v0  v1
# B    v20 v21 v22 v23 v24 v25 v26 v27
# C    v16 v17        primary accumulators
#      v18 v19        second set of accumulators, summed into v16 v17 at the end
# Clamp v4 (min) v5 (max)

# A53 based on A57/A75 but with LD64

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_1x8__aarch64_neonfma_cortex_a53

        $if INC:
          # Load cn_stride, acc
          LDP  x14, x15, [sp]
          # Load params pointer
          LDR  x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP  x14, x8, [sp]

        # Load min/max values.  LD2R broadcasts the first float at params
        # into v4 (lower bound) and the second into v5 (upper bound).
        LD2R {v4.4s, v5.4s}, [x8]

        # Outer loop: one pass per 8 output columns.
0:
        $if INC:
          # Load initial accumulators
          LDP  q16, q17, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          LDP  q16, q17, [x5], 32

        MOVI v18.4s, 0                  // second set of C for pipelining FMLA
        PRFM PLDL1KEEP, [x5]
        MOVI v19.4s, 0
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        # Are there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32                 // k = kc - 32

        B.LO 3f                         // kc < 32: remainder handling only

        # Prologue
        # Preload the first block: 4 floats of A (q0) and the matching
        # 4 x 8 floats of B (q20-q27).
        LDP  q20, q21, [x5], 32
        LDP  q22, q23, [x5], 32
        LDP  q24, q25, [x5], 32
        LDP  q26, q27, [x5], 32
        LDR  q0, [x3], 16

        # Are there at least 8 more floats (32 bytes)?  If so, run the main
        # loop; otherwise go straight to the epilogue.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # Each B register is reloaded only after the FMLA that consumed it.
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR  q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR  q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        LDR  q21, [x5], 16
        FMLA v19.4s, v23.4s, v0.s[1]
        LDR  q22, [x5], 16
        FMLA v16.4s, v24.4s, v0.s[2]
        LDR  q23, [x5], 16
        FMLA v17.4s, v25.4s, v0.s[2]
        LDR  q24, [x5], 16
        FMLA v18.4s, v26.4s, v0.s[3]
        LDR  q25, [x5], 16
        FMLA v19.4s, v27.4s, v0.s[3]
        LDR  q26, [x5], 16
        LDR  q27, [x5], 16

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR  q0, [x3], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDR  q20, [x5], 16
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR  q21, [x5], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        LDR  q22, [x5], 16
        FMLA v16.4s, v24.4s, v1.s[2]
        LDR  q23, [x5], 16
        FMLA v17.4s, v25.4s, v1.s[2]
        LDR  q24, [x5], 16
        FMLA v18.4s, v26.4s, v1.s[3]
        LDR  q25, [x5], 16
        FMLA v19.4s, v27.4s, v1.s[3]
        LDR  q26, [x5], 16
        SUBS x0, x0, 32
        LDR  q27, [x5], 16
        B.HS 1b

2:
        # Epilogue - the last 8 floats of A (32 bytes) covered by full blocks

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR  q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR  q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        LDR  q21, [x5], 16
        FMLA v19.4s, v23.4s, v0.s[1]
        LDR  q22, [x5], 16
        FMLA v16.4s, v24.4s, v0.s[2]
        LDR  q23, [x5], 16
        FMLA v17.4s, v25.4s, v0.s[2]
        LDR  q24, [x5], 16
        FMLA v18.4s, v26.4s, v0.s[3]
        LDR  q25, [x5], 16
        FMLA v19.4s, v27.4s, v0.s[3]
        LDR  q26, [x5], 16

        # Second block of 4.  Only the final B load (q27) is left; nothing
        # is preloaded for a following block.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR  q27, [x5], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]

3:
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBNZ x0, 4, 5f
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder?- 1 float of A (4 bytes)
        TBNZ x0, 2, 8f

4:
        # Fold the second set of accumulators into the first.
        FADD v16.4s, v16.4s, v18.4s
        FADD v17.4s, v17.4s, v19.4s

        # Clamp
        FMAX v16.4s, v16.4s, v4.4s
        SUBS x1, x1, 8                  // nc -= 8; flags drive both branches below
        FMAX v17.4s, v17.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s

        # Fewer than 8 columns left: store a partial tile and return.
        B.LO 9f

        # Store full 1 x 8 and advance c0 by cn_stride
        ST1  {v16.16b, v17.16b}, [x6], x14
        SUB  x3, x3, x2                 // a0 -= kc

        # More columns remain (nc was greater than 8): next tile.
        B.HI 0b

        RET

5:
        # Remainder- 4 floats of A (16 bytes)
        LDR  q20, [x5], 16
        LDR  q21, [x5], 16
        LDR  q0, [x3], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR  q22, [x5], 16
        LDR  q23, [x5], 16
        LDR  q24, [x5], 16
        LDR  q25, [x5], 16
        LDR  q26, [x5], 16
        LDR  q27, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]

        # Skip the 2 float step when bit 3 of k is clear.
        TBZ  x0, 3, 7f
6:
        # Remainder- 2 floats of A (8 bytes)
        LDR  q20, [x5], 16
        LDR  q21, [x5], 16
        LDR  d0, [x3], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR  q22, [x5], 16
        LDR  q23, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
7:
        # No single float left: go reduce, clamp and store.
        TBZ  x0, 2, 4b
8:
        # Remainder- 1 float of A (4 bytes)
        LDR  q20, [x5], 16
        LDR  q21, [x5], 16
        LDR  s0, [x3], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        B    4b

        # Store odd channels
        # x1 went negative in the SUBS above; its low 3 bits still hold the
        # number of columns left (1..7), tested as 4 + 2 + 1.
9:
        TBZ  x1, 2, 10f
        STR  q16, [x6], 16              // 4 floats
        MOV  v16.16b, v17.16b           // remaining columns move down into v16

10:
        TBZ  x1, 1, 11f
        STR  d16, [x6], 8               // 2 floats
        DUP  d16, v16.d[1]              // upper half of v16 moves down

11:
        TBZ  x1, 0, 12f
        STR  s16, [x6]                  // 1 float
12:
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_1x8__aarch64_neonfma_cortex_a53
246
// On ELF targets, emit an empty .note.GNU-stack section.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif