// Source blob: b6f868f25c75371526ee730dac8c44ab5c8faa39
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                         (x0) - unused.  mr = 1
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  (x7) - unused
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# This kernel touches neither group, so nothing is saved or restored.

# A pointer
# x8  a0   (x8 first holds the params pointer; it is reused for a0 once
#           min/max have been loaded into v30/v31)

# C pointer
# x6  c0

# Vector register usage
# v0, v1    A values (4 floats each)
# v16, v17  accumulators for the 8 output channels (initialized from bias)
# v18, v19  second set of accumulators, summed into v16/v17 before clamping
# v20-v27   B (weights)
# v30, v31  clamping bounds: lower bound (min), upper bound (max)

# A53 based on a53/75 but with LD64

BEGIN_FUNCTION xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53

        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, clamping params pointer
        LDP x12, x8, [sp, 16]

        # Load min/max values (v30 = min, v31 = max, each broadcast to 4 lanes)
        LD2R {v30.4s, v31.4s}, [x8]

0:
        # nc loop entry.
        # Load initial bias from w into accumulators
        LDP q16, q17, [x5], 32
        MOVI v18.4s, 0  // second set of C for pipelining FMLA
        PRFM PLDL1KEEP, [x5]
        MOVI v19.4s, 0
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        MOV x9, x3  // p = ks

1:
        # ks loop entry.
        # Load next A pointer
        LDR x8, [x4], 8

        # The compare is done before the add so the flags reflect the
        # original pointer value.
        CMP x8, x12            // if a0 == zero
        ADD x8, x8, x11        // a0 += a_offset
        CSEL x8, x12, x8, EQ   // a0 = (a0 == zero) ? zero : a0 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 5f

        # Prologue
        # Read first block of B (8 vectors, 128 bytes) and A (4 floats, 16 bytes).
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDR q0, [x8], 16

        # Is there at least another 8 floats (32 bytes)?  If so, do main loop.
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A (32 bytes)
2:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x8], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        LDR q21, [x5], 16
        FMLA v19.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA v16.4s, v24.4s, v0.s[2]
        LDR q23, [x5], 16
        FMLA v17.4s, v25.4s, v0.s[2]
        LDR q24, [x5], 16
        FMLA v18.4s, v26.4s, v0.s[3]
        LDR q25, [x5], 16
        FMLA v19.4s, v27.4s, v0.s[3]
        LDR q26, [x5], 16
        LDR q27, [x5], 16

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q0, [x8], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q21, [x5], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        LDR q22, [x5], 16
        FMLA v16.4s, v24.4s, v1.s[2]
        LDR q23, [x5], 16
        FMLA v17.4s, v25.4s, v1.s[2]
        LDR q24, [x5], 16
        FMLA v18.4s, v26.4s, v1.s[3]
        LDR q25, [x5], 16
        FMLA v19.4s, v27.4s, v1.s[3]
        LDR q26, [x5], 16
        SUBS x0, x0, 32
        LDR q27, [x5], 16
        B.HS 2b

3:
        # Epilogue - final 8 floats of A, no loads for a following iteration

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x8], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        LDR q21, [x5], 16
        FMLA v19.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA v16.4s, v24.4s, v0.s[2]
        LDR q23, [x5], 16
        FMLA v17.4s, v25.4s, v0.s[2]
        LDR q24, [x5], 16
        FMLA v18.4s, v26.4s, v0.s[3]
        LDR q25, [x5], 16
        FMLA v19.4s, v27.4s, v0.s[3]
        LDR q26, [x5], 16

        # Second block of 4.  Only the last B load (q27) remains.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q27, [x5], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        # Flags set here are consumed by the B.NE below; the FMLAs in between
        # do not modify them.
        TST x0, 31
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        B.NE 5f

4:
        # ks loop
        SUBS x9, x9, 8  // ks -= MR * sizeof(void*)
        B.HI 1b

        # Fold the second set of accumulators into the first
        FADD v16.4s, v16.4s, v18.4s
        FADD v17.4s, v17.4s, v19.4s

        # Clamp
        FMAX v16.4s, v16.4s, v30.4s
        FMAX v17.4s, v17.4s, v30.4s
        FMIN v16.4s, v16.4s, v31.4s
        FMIN v17.4s, v17.4s, v31.4s

        # Store full 1 x 8
        SUBS x1, x1, 8
        B.LO 10f

        ST1 {v16.16b, v17.16b}, [x6], x10  // c0 += cn_stride
        SUB x4, x4, x3  // a -= ks

        # nc loop (flags still from SUBS x1 above; ST1/SUB leave them intact)
        B.HI 0b

        RET

5:
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 6f

        # Remainder- 4 floats of A (16 bytes)
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q0, [x8], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q22, [x5], 16
        LDR q23, [x5], 16
        LDR q24, [x5], 16
        LDR q25, [x5], 16
        LDR q26, [x5], 16
        LDR q27, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]

6:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 7f
        # Remainder- 2 floats of A (8 bytes)
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR d0, [x8], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q22, [x5], 16
        LDR q23, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
7:
        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 4b
        # Remainder- 1 float of A (4 bytes)
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR s0, [x8], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        B 4b

10:
        # Store partial row: fewer than 8 channels remain.  The low 3 bits
        # of x1 (nc - 8) select stores of 4, 2 and 1 floats.
        TBZ x1, 2, 11f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b  // shift upper 4 channels down

11:
        TBZ x1, 1, 12f
        STR d16, [x6], 8
        DUP d16, v16.d[1]  // shift upper 2 channels down

12:
        TBZ x1, 0, 13f
        STR s16, [x6], 4
13:
        RET

END_FUNCTION xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53

// On ELF targets, emit an empty .note.GNU-stack section (by convention this
// marks the object as not requiring an executable stack).
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif