blob: 6f9149e2a237806c60ba63e40c79a0b2e28ba9ac [file] [log] [blame]
// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/1x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
XNNPACK Teamb455b122019-09-27 18:10:33 -07009
10#include <xnnpack/assembly.h>
11
# void xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                         (x0) - unused.  mr = 1
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  (x7) - unused
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_output_params params [sp + 24] -> x8

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# This kernel touches only x0-x12, v0-v1, v16-v27 and v30-v31, so nothing
# is saved or restored and the stack is only read, never written.

# A pointer
# x8  a0     (x8 first holds the params pointer, then is reused as a0)

# C pointer
# x6  c0

# Vector register usage
# A0     v0  v1
# B      v20 v21 v22 v23 v24 v25 v26 v27
# C      v16 v17         (accumulators seeded with the bias from w)
# C      v18 v19         (second accumulator set, seeded with 0)
# Clamp  v30 v31         (1st and 2nd float of params)

BEGIN_FUNCTION xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75

        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, clamping params pointer
        LDP x12, x8, [sp, 16]

        # Load clamping_params values
        # v30 = 1st float replicated (upper bound, used by FMIN below)
        # v31 = 2nd float replicated (lower bound, used by FMAX below)
        LD2R {v30.4s, v31.4s}, [x8]

0:
        # Load initial bias from w into accumulators
        LDP q16, q17, [x5], 32
        MOVI v18.4s, 0  // second set of C for pipelining FMLA
        PRFM PLDL1KEEP, [x5]
        MOVI v19.4s, 0
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        MOV x9, x3  // p = ks

1:
        # Load next A pointer
        LDR x8, [x4], 8

        # The comparison uses the raw pointer, so a_offset is never applied
        # to the zero buffer.
        CMP x8, x12            // if a0 == zero
        ADD x8, x8, x11        // a0 += a_offset
        CSEL x8, x12, x8, EQ   //   a0 = (a0 == zero) ? zero : a0 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f

        # Prologue
        # Read first block of A (4 floats) and B (4 k-steps x 8 floats).
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDR q0, [x8], 16

        # Is there at least another 8 floats (32 bytes)?  If yes, run the
        # main loop, otherwise go straight to the epilogue.
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A (32 bytes)
2:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x8], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        LDP q24, q25, [x5], 32
        PRFM PLDL1KEEP, [x5, 128]
        FMLA v18.4s, v26.4s, v0.s[3]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v19.4s, v27.4s, v0.s[3]
        LDP q26, q27, [x5], 32

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q0, [x8], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        LDP q24, q25, [x5], 32
        PRFM PLDL1KEEP, [x5, 128]
        FMLA v18.4s, v26.4s, v1.s[3]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v19.4s, v27.4s, v1.s[3]
        SUBS x0, x0, 32
        LDP q26, q27, [x5], 32
        B.HS 2b

3:
        # Epilogue - consumes the block preloaded by the prologue or by the
        # last main loop iteration, plus one more block of 4.

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x8], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        LDP q24, q25, [x5], 32
        PRFM PLDL1KEEP, [x5, 128]
        FMLA v18.4s, v26.4s, v0.s[3]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v19.4s, v27.4s, v0.s[3]
        LDP q26, q27, [x5], 32

        # Second block of 4.  no loads
        FMLA v16.4s, v20.4s, v1.s[0]
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]

4:
        # x0 has only had multiples of 32 subtracted from kc, so bits 4, 3
        # and 2 of x0 still equal those bits of kc.
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBNZ x0, 4, 6f
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 7f
        # Is there a remainder?- 1 floats of A (4 bytes)
        TBNZ x0, 2, 9f

5:
        # ks loop
        SUBS x9, x9, 8  // ks -= MR * sizeof(void*)
        B.NE 1b

        # Fold the second accumulator set into the first
        FADD v16.4s, v16.4s, v18.4s
        FADD v17.4s, v17.4s, v19.4s

        # Clamp
        FMIN v16.4s, v16.4s, v30.4s
        FMIN v17.4s, v17.4s, v30.4s
        FMAX v16.4s, v16.4s, v31.4s
        FMAX v17.4s, v17.4s, v31.4s

        # Store full 1 x 8
        SUBS x1, x1, 8
        B.LO 10f

        STP q16, q17, [x6]
        ADD x6, x6, x10

        SUB x4, x4, x3  // a -= ks

        # nc loop
        # Flags are still those of SUBS x1 above (ADD/SUB do not set flags).
        B.HI 0b

        RET

6:
        # Remainder- 4 floats of A (16 bytes)
        LDP q20, q21, [x5], 32
        LDR q0, [x8], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]

        TBZ x0, 3, 8f
7:
        # Remainder- 2 floats of A (8 bytes)
        LDP q20, q21, [x5], 32
        LDR d0, [x8], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
8:
        TBZ x0, 2, 5b
9:
        # Remainder- 1 float of A (4 bytes)
        LDP q20, q21, [x5], 32
        LDR s0, [x8], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        B 5b

10:
        # Store odd channels
        # Reached with fewer than 8 outputs left.  Subtracting 8 left bits
        # 2..0 of x1 unchanged, so they select the 4, 2 and 1 float stores.
        TBZ x1, 2, 11f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

11:
        TBZ x1, 1, 12f
        STR d16, [x6], 8
        DUP d16, v16.d[1]

12:
        TBZ x1, 0, 13f
        STR s16, [x6], 4
13:
        RET

END_FUNCTION xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75
240
// On ELF targets emit an empty .note.GNU-stack section (GNU toolchain
// convention indicating this object does not need an executable stack).
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif