blob: 02e2c5a67c10e199e6b827767226093ff5a63634 [file] [log] [blame]
// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch64-neonfma-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const float*restrict acc, [sp + 8] -> x15
#     const union xnn_f32_minmax_params params[restrict static 1])  [sp + 16] -> x8
#
# For each tile of up to 8 output columns the kernel loads a 4x8 block of
# initial accumulators from acc, adds the products of the A rows with the
# weights read from w, clamps the result to [min, max] and stores it to C.
# kc is consumed as a byte count (2 floats = 8 bytes per main-loop pass);
# nc is consumed as a count of output floats (8 per full tile).
# The w and acc pointers only ever advance; the A pointers are rewound by kc
# after every full tile.
# NOTE(review): the code assumes 1 <= mr <= 4 and that kc is a non-zero
# multiple of 4 bytes; neither is checked here - confirm against callers.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# This kernel only touches registers within x0-x15, v0-v5, v16-v23 and
# v28-v31, so nothing is saved or restored.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# Vector registers
# v0-v3    A values for rows 0-3
# v4, v5   clamp min, clamp max
# v20-v23  B (weights)
# v16 v17  C row 0
# v18 v19  C row 1
# v28 v29  C row 2
# v30 v31  C row 3

BEGIN_FUNCTION xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64

        # Load cn_stride, acc
        LDP x14, x15, [sp]
        # Load params pointer
        LDR x8, [sp, 16]

        # Load min/max values, each replicated to all lanes: v4 = min, v5 = max
        LD2R {v4.4s, v5.4s}, [x8]

        # Clamp A and C pointers: a row whose index is not below mr reuses the
        # previous row pointer rather than stepping by the stride.
        CMP x0, 2                       // if mr < 2
        ADD x11, x3, x4                 // a1 = a0 + a_stride
        ADD x9, x6, x7                  // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO           //   a1 = a0
        CSEL x9, x6, x9, LO             //   c1 = c0

        ADD x12, x11, x4                // a2 = a1 + a_stride
        ADD x10, x9, x7                 // c2 = c1 + cm_stride
                                        // if mr <= 2 (ADD leaves the flags of CMP x0, 2 intact)
        CSEL x12, x11, x12, LS          //   a2 = a1
        CSEL x10, x9, x10, LS           //   c2 = c1

        CMP x0, 4                       // if mr < 4
        ADD x4, x12, x4                 // a3 = a2 + a_stride
        ADD x7, x10, x7                 // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO            //   a3 = a2
        CSEL x7, x10, x7, LO            //   c3 = c2

        # Start of one tile of 8 output columns
0:
        # Load initial accumulators (4 rows x 8 floats); acc advances by 128 bytes
        LDP q16, q17, [x15], 32
        LDP q18, q19, [x15], 32
        LDP q28, q29, [x15], 32
        LDP q30, q31, [x15], 32

        # Are there at least 2 floats (8 bytes)?
        SUBS x0, x2, 8                  // k = kc - 8
        B.LO 5f

        # Main loop - 2 floats of A (8 bytes)
1:
        LDR d0, [x3], 8
        LDP q20, q21, [x5], 32
        LDR d1, [x11], 8
        LDR d2, [x12], 8
        LDR d3, [x4], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q22, q23, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        SUBS x0, x0, 8                  // k -= 8; FMLA does not alter the flags used by B.HS
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        B.HS 1b

        # Is there a remainder? - 1 float of A (4 bytes)
        # x0 is negative here; bit 2 is set when 4 bytes are left over.
        TBNZ x0, 2, 5f

4:
        # Clamp
        FMAX v16.4s, v16.4s, v4.4s
        SUBS x1, x1, 8                  // nc -= 8; flags are consumed by B.LO and B.HI below
        FMAX v17.4s, v17.4s, v4.4s
        FMAX v18.4s, v18.4s, v4.4s
        FMAX v19.4s, v19.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s
        FMIN v18.4s, v18.4s, v5.4s
        FMIN v19.4s, v19.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        B.LO 7f                         // fewer than 8 columns were left: partial store

        ST1 {v30.16b, v31.16b}, [x7], x14       // store row 3, c3 += cn_stride
        SUB x3, x3, x2                  // a0 -= kc
        ST1 {v28.16b, v29.16b}, [x10], x14      // store row 2, c2 += cn_stride
        SUB x11, x11, x2                // a1 -= kc
        ST1 {v18.16b, v19.16b}, [x9], x14       // store row 1, c1 += cn_stride
        SUB x12, x12, x2                // a2 -= kc
        ST1 {v16.16b, v17.16b}, [x6], x14       // store row 0, c0 += cn_stride
        SUB x4, x4, x2                  // a3 -= kc

        B.HI 0b                         // columns remain (flags from SUBS x1): next tile

        RET

        # Remainder - 1 float of A (4 bytes)
5:
        LDR s0, [x3], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x11], 4
        LDR s2, [x12], 4
        LDR s3, [x4], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        B 4b

        # Store odd width
        # x1 holds nc - 8 here, so its low 3 bits equal those of the number of
        # columns still to be written; each bit selects a 4, 2 or 1 float store.
7:
        TBZ x1, 2, 8f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b            // bring columns 4-7 down for the next steps
        STR q28, [x10], 16
        MOV v28.16b, v29.16b
        STR q18, [x9], 16
        MOV v18.16b, v19.16b
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

8:
        TBZ x1, 1, 9f
        STR d30, [x7], 8
        DUP d30, v30.d[1]               // move the upper 2 floats into the low half
        STR d28, [x10], 8
        DUP d28, v28.d[1]
        STR d18, [x9], 8
        DUP d18, v18.d[1]
        STR d16, [x6], 8
        DUP d16, v16.d[1]

9:
        TBZ x1, 0, 10f
        STR s30, [x7]
        STR s28, [x10]
        STR s18, [x9]
        STR s16, [x6]
10:
        RET

END_FUNCTION xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64

// ELF targets only: emit an empty .note.GNU-stack section (by GNU toolchain
// convention this marks the object as not requiring an executable stack).
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif