// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> (x8)
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointer
# x3  a0

# C pointer
# x6  c0

# Clamp v4 v5

# A53 based on A57/A75 but with LD64

# F32 GEMM${"INC" if INC else ""} micro-kernel with min/max clamping: 1 row of A times 8 columns
# of packed weights per pass of the outer loop.
#
# Register roles inside the function:
#   x0        k counter in bytes (kc - 32, then decremented by 32 per main
#             loop iteration).  Its low bits 2..4 always match those of kc.
#   x1        nc, columns remaining
#   x2        kc in bytes
#   x3        a0, advances through the row of A and is rewound by kc per pass
#   x5        w, packed weights, advances monotonically
#   x6        c0, output pointer
#   x8        params pointer (only read once for the clamp values)
#   x14       cn_stride
#   x15       acc pointer (GEMMINC variant only)
#   v0  v1    A values (4 floats each)
#   v4  v5    clamp min, clamp max broadcast to all lanes
#   v16 v17   accumulators for columns 0-3 and 4-7
#   v18 v19   second accumulator set, summed into v16 v17 before the clamp
#   v20-v27   B values
#
# Only caller-saved registers are used, so nothing is spilled to the stack.

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53

        $if INC:
          # Load cn_stride, acc
          LDP     x14, x15, [sp]
          # Load params pointer
          LDR     x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP     x14, x8, [sp]

        # Load min/max values (v4 = min, v5 = max)
        LD2R    {v4.4s, v5.4s}, [x8]

        # Outer loop - one pass per group of up to 8 output columns
0:
        $if INC:
          # Load initial accumulators
          LDP     q16, q17, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          LDP     q16, q17, [x5], 32

        MOVI    v18.4s, 0               // second set of C for pipelining FMLA
        PRFM    PLDL1KEEP, [x5]
        MOVI    v19.4s, 0
        PRFM    PLDL1KEEP, [x5, 64]
        PRFM    PLDL1KEEP, [x5, 128]
        PRFM    PLDL1KEEP, [x5, 192]

        # Are there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS    x0, x2, 32              // k = kc - 32

        B.LO    3f                      // kc < 32: remainder handling only

        # Prologue - load the first 4 floats of A (16 bytes) and the
        # matching 4 x 8 block of B.
        LDP     q20, q21, [x5], 32
        LDP     q22, q23, [x5], 32
        LDP     q24, q25, [x5], 32
        LDP     q26, q27, [x5], 32
        LDR     q0, [x3], 16

        # Are there at least 32 more bytes of A?  yes - run the main loop,
        # no - go straight to the epilogue.
        SUBS    x0, x0, 32
        B.LO    2f

        # Main loop - 8 floats of A (32 bytes)
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        # Each B register is reloaded only after the FMLA that reads it.
        FMLA    v16.4s, v20.4s, v0.s[0]
        LDR     q1, [x3], 16
        FMLA    v17.4s, v21.4s, v0.s[0]
        LDR     q20, [x5], 16
        FMLA    v18.4s, v22.4s, v0.s[1]
        LDR     q21, [x5], 16
        FMLA    v19.4s, v23.4s, v0.s[1]
        LDR     q22, [x5], 16
        FMLA    v16.4s, v24.4s, v0.s[2]
        LDR     q23, [x5], 16
        FMLA    v17.4s, v25.4s, v0.s[2]
        LDR     q24, [x5], 16
        FMLA    v18.4s, v26.4s, v0.s[3]
        LDR     q25, [x5], 16
        FMLA    v19.4s, v27.4s, v0.s[3]
        LDR     q26, [x5], 16
        LDR     q27, [x5], 16

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA    v16.4s, v20.4s, v1.s[0]
        LDR     q0, [x3], 16
        FMLA    v17.4s, v21.4s, v1.s[0]
        LDR     q20, [x5], 16
        FMLA    v18.4s, v22.4s, v1.s[1]
        LDR     q21, [x5], 16
        FMLA    v19.4s, v23.4s, v1.s[1]
        LDR     q22, [x5], 16
        FMLA    v16.4s, v24.4s, v1.s[2]
        LDR     q23, [x5], 16
        FMLA    v17.4s, v25.4s, v1.s[2]
        LDR     q24, [x5], 16
        FMLA    v18.4s, v26.4s, v1.s[3]
        LDR     q25, [x5], 16
        FMLA    v19.4s, v27.4s, v1.s[3]
        LDR     q26, [x5], 16
        SUBS    x0, x0, 32              // k -= 32; LDR below leaves flags intact
        LDR     q27, [x5], 16
        B.HS    1b

2:
        # Epilogue - same as one main loop iteration, but without loading
        # A and B for a following iteration.

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA    v16.4s, v20.4s, v0.s[0]
        LDR     q1, [x3], 16
        FMLA    v17.4s, v21.4s, v0.s[0]
        LDR     q20, [x5], 16
        FMLA    v18.4s, v22.4s, v0.s[1]
        LDR     q21, [x5], 16
        FMLA    v19.4s, v23.4s, v0.s[1]
        LDR     q22, [x5], 16
        FMLA    v16.4s, v24.4s, v0.s[2]
        LDR     q23, [x5], 16
        FMLA    v17.4s, v25.4s, v0.s[2]
        LDR     q24, [x5], 16
        FMLA    v18.4s, v26.4s, v0.s[3]
        LDR     q25, [x5], 16
        FMLA    v19.4s, v27.4s, v0.s[3]
        LDR     q26, [x5], 16

        # Second block of 4.  Only the last B register (q27) is still loaded.
        FMLA    v16.4s, v20.4s, v1.s[0]
        LDR     q27, [x5], 16
        FMLA    v17.4s, v21.4s, v1.s[0]
        FMLA    v18.4s, v22.4s, v1.s[1]
        FMLA    v19.4s, v23.4s, v1.s[1]
        FMLA    v16.4s, v24.4s, v1.s[2]
        FMLA    v17.4s, v25.4s, v1.s[2]
        FMLA    v18.4s, v26.4s, v1.s[3]
        FMLA    v19.4s, v27.4s, v1.s[3]

3:
        # x0 only ever had multiples of 32 subtracted from kc, so bits 4, 3
        # and 2 still flag a leftover of 16, 8 and 4 bytes of A.

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBNZ    x0, 4, 5f
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ    x0, 3, 6f
        # Is there a remainder?- 1 float of A (4 bytes)
        TBNZ    x0, 2, 8f

4:
        # Merge the second accumulator set into the first
        FADD    v16.4s, v16.4s, v18.4s
        FADD    v17.4s, v17.4s, v19.4s

        # Clamp
        FMAX    v16.4s, v16.4s, v4.4s
        SUBS    x1, x1, 8               // nc -= 8; flags consumed by B.LO and B.HI below
        FMAX    v17.4s, v17.4s, v4.4s
        FMIN    v16.4s, v16.4s, v5.4s
        FMIN    v17.4s, v17.4s, v5.4s

        # Store full 1 x 8, unless fewer than 8 columns were left
        B.LO    9f

        ST1     {v16.16b, v17.16b}, [x6], x14
        SUB     x3, x3, x2              // a0 -= kc

        B.HI    0b                      // more columns remain

        RET

5:
        # Remainder- 4 floats of A (16 bytes)
        LDR     q20, [x5], 16
        LDR     q21, [x5], 16
        LDR     q0, [x3], 16
        FMLA    v16.4s, v20.4s, v0.s[0]
        FMLA    v17.4s, v21.4s, v0.s[0]
        LDR     q22, [x5], 16
        LDR     q23, [x5], 16
        LDR     q24, [x5], 16
        LDR     q25, [x5], 16
        LDR     q26, [x5], 16
        LDR     q27, [x5], 16
        FMLA    v18.4s, v22.4s, v0.s[1]
        FMLA    v19.4s, v23.4s, v0.s[1]
        FMLA    v16.4s, v24.4s, v0.s[2]
        FMLA    v17.4s, v25.4s, v0.s[2]
        FMLA    v18.4s, v26.4s, v0.s[3]
        FMLA    v19.4s, v27.4s, v0.s[3]

        TBZ     x0, 3, 7f               // no 2 float remainder: check for 1 float
6:
        # Remainder- 2 floats of A (8 bytes)
        LDR     q20, [x5], 16
        LDR     q21, [x5], 16
        LDR     d0, [x3], 8
        FMLA    v16.4s, v20.4s, v0.s[0]
        FMLA    v17.4s, v21.4s, v0.s[0]
        LDR     q22, [x5], 16
        LDR     q23, [x5], 16
        FMLA    v18.4s, v22.4s, v0.s[1]
        FMLA    v19.4s, v23.4s, v0.s[1]
7:
        TBZ     x0, 2, 4b               // no 1 float remainder: finish this pass
8:
        # Remainder- 1 float of A (4 bytes)
        LDR     q20, [x5], 16
        LDR     q21, [x5], 16
        LDR     s0, [x3], 4
        FMLA    v16.4s, v20.4s, v0.s[0]
        FMLA    v17.4s, v21.4s, v0.s[0]
        B       4b

        # Store odd channels.  x1 is nc - 8 here; subtracting 8 left bits
        # 2, 1 and 0 unchanged, so they select a 4, 2 and 1 float store.
9:
        TBZ     x1, 2, 10f
        STR     q16, [x6], 16
        MOV     v16.16b, v17.16b        // bring columns 4-7 into v16

10:
        TBZ     x1, 1, 11f
        STR     d16, [x6], 8
        DUP     d16, v16.d[1]           // bring the upper 2 floats down

11:
        TBZ     x1, 0, 12f
        STR     s16, [x6]
12:
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif