blob: 1c9bf9ffe3d96099c2c2b61184b90c99f789e490 [file] [log] [blame]
// Auto-generated file. Do not edit!
//   Template: src/f32-dwconv/up-psimd.c.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
9
10#include <assert.h>
11
12#include <psimd.h>
13
14#include <xnnpack/dwconv.h>
15
16
17void xnn_f32_dwconv_ukernel_up8x25__psimd_acc2(
18 size_t channels,
19 size_t output_width,
20 const float** input,
21 const float* weights,
22 float* output,
23 size_t input_stride,
24 size_t output_increment,
Marat Dukhaneb09a6b2020-04-08 17:34:32 -070025 const union xnn_f32_minmax_params params[restrict static 1])
Marat Dukhan5098c3e2019-11-07 12:01:19 -080026{
27 assert(channels != 0);
28 assert(output_width != 0);
29
30 const psimd_f32 vmax = psimd_load_splat_f32(&params->scalar.max);
31 const psimd_f32 vmin = psimd_load_splat_f32(&params->scalar.min);
32 do {
33 const float* i0 = input[0];
Marat Dukhan68660992020-02-03 13:31:12 -080034 assert(i0 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080035 const float* i1 = input[1];
Marat Dukhan68660992020-02-03 13:31:12 -080036 assert(i1 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080037 const float* i2 = input[2];
Marat Dukhan68660992020-02-03 13:31:12 -080038 assert(i2 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080039 const float* i3 = input[3];
Marat Dukhan68660992020-02-03 13:31:12 -080040 assert(i3 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080041 const float* i4 = input[4];
Marat Dukhan68660992020-02-03 13:31:12 -080042 assert(i4 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080043 const float* i5 = input[5];
Marat Dukhan68660992020-02-03 13:31:12 -080044 assert(i5 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080045 const float* i6 = input[6];
Marat Dukhan68660992020-02-03 13:31:12 -080046 assert(i6 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080047 const float* i7 = input[7];
Marat Dukhan68660992020-02-03 13:31:12 -080048 assert(i7 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080049 const float* i8 = input[8];
Marat Dukhan68660992020-02-03 13:31:12 -080050 assert(i8 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080051 const float* i9 = input[9];
Marat Dukhan68660992020-02-03 13:31:12 -080052 assert(i9 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080053 const float* i10 = input[10];
Marat Dukhan68660992020-02-03 13:31:12 -080054 assert(i10 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080055 const float* i11 = input[11];
Marat Dukhan68660992020-02-03 13:31:12 -080056 assert(i11 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080057 const float* i12 = input[12];
Marat Dukhan68660992020-02-03 13:31:12 -080058 assert(i12 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080059 const float* i13 = input[13];
Marat Dukhan68660992020-02-03 13:31:12 -080060 assert(i13 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080061 const float* i14 = input[14];
Marat Dukhan68660992020-02-03 13:31:12 -080062 assert(i14 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080063 const float* i15 = input[15];
Marat Dukhan68660992020-02-03 13:31:12 -080064 assert(i15 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080065 const float* i16 = input[16];
Marat Dukhan68660992020-02-03 13:31:12 -080066 assert(i16 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080067 const float* i17 = input[17];
Marat Dukhan68660992020-02-03 13:31:12 -080068 assert(i17 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080069 const float* i18 = input[18];
Marat Dukhan68660992020-02-03 13:31:12 -080070 assert(i18 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080071 const float* i19 = input[19];
Marat Dukhan68660992020-02-03 13:31:12 -080072 assert(i19 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080073 const float* i20 = input[20];
Marat Dukhan68660992020-02-03 13:31:12 -080074 assert(i20 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080075 const float* i21 = input[21];
Marat Dukhan68660992020-02-03 13:31:12 -080076 assert(i21 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080077 const float* i22 = input[22];
Marat Dukhan68660992020-02-03 13:31:12 -080078 assert(i22 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080079 const float* i23 = input[23];
Marat Dukhan68660992020-02-03 13:31:12 -080080 assert(i23 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080081 const float* i24 = input[24];
Marat Dukhan68660992020-02-03 13:31:12 -080082 assert(i24 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080083 input = (const float**) ((uintptr_t) input + input_stride);
84
85 size_t c = channels;
86 const float* w = weights;
87 for (; c >= 8; c -= 8) {
88 psimd_f32 vacc0123p0 = psimd_load_f32(w);
89 psimd_f32 vacc4567p0 = psimd_load_f32(w + 4);
90
91
92 const psimd_f32 vi0x0123 = psimd_load_f32(i0);
93 const psimd_f32 vi0x4567 = psimd_load_f32(i0 + 4);
94 i0 += 8;
95
96 const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
97 const psimd_f32 vk0x4567 = psimd_load_f32(w + 12);
98 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
99 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi0x4567, vk0x4567);
100
101 const psimd_f32 vi1x0123 = psimd_load_f32(i1);
102 const psimd_f32 vi1x4567 = psimd_load_f32(i1 + 4);
103 i1 += 8;
104
105 const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
106 const psimd_f32 vk1x4567 = psimd_load_f32(w + 20);
107 psimd_f32 vacc0123p1 = psimd_mul_f32(vi1x0123, vk1x0123);
108 psimd_f32 vacc4567p1 = psimd_mul_f32(vi1x4567, vk1x4567);
109
110 const psimd_f32 vi2x0123 = psimd_load_f32(i2);
111 const psimd_f32 vi2x4567 = psimd_load_f32(i2 + 4);
112 i2 += 8;
113
114 const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
115 const psimd_f32 vk2x4567 = psimd_load_f32(w + 28);
116 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
117 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi2x4567, vk2x4567);
118
119 const psimd_f32 vi3x0123 = psimd_load_f32(i3);
120 const psimd_f32 vi3x4567 = psimd_load_f32(i3 + 4);
121 i3 += 8;
122
123 const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
124 const psimd_f32 vk3x4567 = psimd_load_f32(w + 36);
125 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi3x0123, vk3x0123);
126 vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi3x4567, vk3x4567);
127
128 const psimd_f32 vi4x0123 = psimd_load_f32(i4);
129 const psimd_f32 vi4x4567 = psimd_load_f32(i4 + 4);
130 i4 += 8;
131
132 const psimd_f32 vk4x0123 = psimd_load_f32(w + 40);
133 const psimd_f32 vk4x4567 = psimd_load_f32(w + 44);
134 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
135 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi4x4567, vk4x4567);
136
137 const psimd_f32 vi5x0123 = psimd_load_f32(i5);
138 const psimd_f32 vi5x4567 = psimd_load_f32(i5 + 4);
139 i5 += 8;
140
141 const psimd_f32 vk5x0123 = psimd_load_f32(w + 48);
142 const psimd_f32 vk5x4567 = psimd_load_f32(w + 52);
143 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi5x0123, vk5x0123);
144 vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi5x4567, vk5x4567);
145
146 const psimd_f32 vi6x0123 = psimd_load_f32(i6);
147 const psimd_f32 vi6x4567 = psimd_load_f32(i6 + 4);
148 i6 += 8;
149
150 const psimd_f32 vk6x0123 = psimd_load_f32(w + 56);
151 const psimd_f32 vk6x4567 = psimd_load_f32(w + 60);
152 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
153 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi6x4567, vk6x4567);
154
155 const psimd_f32 vi7x0123 = psimd_load_f32(i7);
156 const psimd_f32 vi7x4567 = psimd_load_f32(i7 + 4);
157 i7 += 8;
158
159 const psimd_f32 vk7x0123 = psimd_load_f32(w + 64);
160 const psimd_f32 vk7x4567 = psimd_load_f32(w + 68);
161 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi7x0123, vk7x0123);
162 vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi7x4567, vk7x4567);
163
164 const psimd_f32 vi8x0123 = psimd_load_f32(i8);
165 const psimd_f32 vi8x4567 = psimd_load_f32(i8 + 4);
166 i8 += 8;
167
168 const psimd_f32 vk8x0123 = psimd_load_f32(w + 72);
169 const psimd_f32 vk8x4567 = psimd_load_f32(w + 76);
170 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
171 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi8x4567, vk8x4567);
172
173 const psimd_f32 vi9x0123 = psimd_load_f32(i9);
174 const psimd_f32 vi9x4567 = psimd_load_f32(i9 + 4);
175 i9 += 8;
176
177 const psimd_f32 vk9x0123 = psimd_load_f32(w + 80);
178 const psimd_f32 vk9x4567 = psimd_load_f32(w + 84);
179 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi9x0123, vk9x0123);
180 vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi9x4567, vk9x4567);
181
182 const psimd_f32 vi10x0123 = psimd_load_f32(i10);
183 const psimd_f32 vi10x4567 = psimd_load_f32(i10 + 4);
184 i10 += 8;
185
186 const psimd_f32 vk10x0123 = psimd_load_f32(w + 88);
187 const psimd_f32 vk10x4567 = psimd_load_f32(w + 92);
188 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi10x0123, vk10x0123);
189 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi10x4567, vk10x4567);
190
191 const psimd_f32 vi11x0123 = psimd_load_f32(i11);
192 const psimd_f32 vi11x4567 = psimd_load_f32(i11 + 4);
193 i11 += 8;
194
195 const psimd_f32 vk11x0123 = psimd_load_f32(w + 96);
196 const psimd_f32 vk11x4567 = psimd_load_f32(w + 100);
197 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi11x0123, vk11x0123);
198 vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi11x4567, vk11x4567);
199
200 const psimd_f32 vi12x0123 = psimd_load_f32(i12);
201 const psimd_f32 vi12x4567 = psimd_load_f32(i12 + 4);
202 i12 += 8;
203
204 const psimd_f32 vk12x0123 = psimd_load_f32(w + 104);
205 const psimd_f32 vk12x4567 = psimd_load_f32(w + 108);
206 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi12x0123, vk12x0123);
207 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi12x4567, vk12x4567);
208
209 const psimd_f32 vi13x0123 = psimd_load_f32(i13);
210 const psimd_f32 vi13x4567 = psimd_load_f32(i13 + 4);
211 i13 += 8;
212
213 const psimd_f32 vk13x0123 = psimd_load_f32(w + 112);
214 const psimd_f32 vk13x4567 = psimd_load_f32(w + 116);
215 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi13x0123, vk13x0123);
216 vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi13x4567, vk13x4567);
217
218 const psimd_f32 vi14x0123 = psimd_load_f32(i14);
219 const psimd_f32 vi14x4567 = psimd_load_f32(i14 + 4);
220 i14 += 8;
221
222 const psimd_f32 vk14x0123 = psimd_load_f32(w + 120);
223 const psimd_f32 vk14x4567 = psimd_load_f32(w + 124);
224 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi14x0123, vk14x0123);
225 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi14x4567, vk14x4567);
226
227 const psimd_f32 vi15x0123 = psimd_load_f32(i15);
228 const psimd_f32 vi15x4567 = psimd_load_f32(i15 + 4);
229 i15 += 8;
230
231 const psimd_f32 vk15x0123 = psimd_load_f32(w + 128);
232 const psimd_f32 vk15x4567 = psimd_load_f32(w + 132);
233 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi15x0123, vk15x0123);
234 vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi15x4567, vk15x4567);
235
236 const psimd_f32 vi16x0123 = psimd_load_f32(i16);
237 const psimd_f32 vi16x4567 = psimd_load_f32(i16 + 4);
238 i16 += 8;
239
240 const psimd_f32 vk16x0123 = psimd_load_f32(w + 136);
241 const psimd_f32 vk16x4567 = psimd_load_f32(w + 140);
242 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi16x0123, vk16x0123);
243 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi16x4567, vk16x4567);
244
245 const psimd_f32 vi17x0123 = psimd_load_f32(i17);
246 const psimd_f32 vi17x4567 = psimd_load_f32(i17 + 4);
247 i17 += 8;
248
249 const psimd_f32 vk17x0123 = psimd_load_f32(w + 144);
250 const psimd_f32 vk17x4567 = psimd_load_f32(w + 148);
251 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi17x0123, vk17x0123);
252 vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi17x4567, vk17x4567);
253
254 const psimd_f32 vi18x0123 = psimd_load_f32(i18);
255 const psimd_f32 vi18x4567 = psimd_load_f32(i18 + 4);
256 i18 += 8;
257
258 const psimd_f32 vk18x0123 = psimd_load_f32(w + 152);
259 const psimd_f32 vk18x4567 = psimd_load_f32(w + 156);
260 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi18x0123, vk18x0123);
261 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi18x4567, vk18x4567);
262
263 const psimd_f32 vi19x0123 = psimd_load_f32(i19);
264 const psimd_f32 vi19x4567 = psimd_load_f32(i19 + 4);
265 i19 += 8;
266
267 const psimd_f32 vk19x0123 = psimd_load_f32(w + 160);
268 const psimd_f32 vk19x4567 = psimd_load_f32(w + 164);
269 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi19x0123, vk19x0123);
270 vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi19x4567, vk19x4567);
271
272 const psimd_f32 vi20x0123 = psimd_load_f32(i20);
273 const psimd_f32 vi20x4567 = psimd_load_f32(i20 + 4);
274 i20 += 8;
275
276 const psimd_f32 vk20x0123 = psimd_load_f32(w + 168);
277 const psimd_f32 vk20x4567 = psimd_load_f32(w + 172);
278 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi20x0123, vk20x0123);
279 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi20x4567, vk20x4567);
280
281 const psimd_f32 vi21x0123 = psimd_load_f32(i21);
282 const psimd_f32 vi21x4567 = psimd_load_f32(i21 + 4);
283 i21 += 8;
284
285 const psimd_f32 vk21x0123 = psimd_load_f32(w + 176);
286 const psimd_f32 vk21x4567 = psimd_load_f32(w + 180);
287 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi21x0123, vk21x0123);
288 vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi21x4567, vk21x4567);
289
290 const psimd_f32 vi22x0123 = psimd_load_f32(i22);
291 const psimd_f32 vi22x4567 = psimd_load_f32(i22 + 4);
292 i22 += 8;
293
294 const psimd_f32 vk22x0123 = psimd_load_f32(w + 184);
295 const psimd_f32 vk22x4567 = psimd_load_f32(w + 188);
296 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi22x0123, vk22x0123);
297 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi22x4567, vk22x4567);
298
299 const psimd_f32 vi23x0123 = psimd_load_f32(i23);
300 const psimd_f32 vi23x4567 = psimd_load_f32(i23 + 4);
301 i23 += 8;
302
303 const psimd_f32 vk23x0123 = psimd_load_f32(w + 192);
304 const psimd_f32 vk23x4567 = psimd_load_f32(w + 196);
305 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi23x0123, vk23x0123);
306 vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi23x4567, vk23x4567);
307
308 const psimd_f32 vi24x0123 = psimd_load_f32(i24);
309 const psimd_f32 vi24x4567 = psimd_load_f32(i24 + 4);
310 i24 += 8;
311
312 const psimd_f32 vk24x0123 = psimd_load_f32(w + 200);
313 const psimd_f32 vk24x4567 = psimd_load_f32(w + 204);
314 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi24x0123, vk24x0123);
315 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi24x4567, vk24x4567);
316
317 w += 208;
318
319 // Add up all accumulators to vacc01234567p0
320 vacc0123p0 = psimd_add_f32(vacc0123p0, vacc0123p1);
321 vacc4567p0 = psimd_add_f32(vacc4567p0, vacc4567p1);
322
323 psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
324 psimd_f32 vacc4567 = psimd_max_f32(vacc4567p0, vmin);
325 vacc0123 = psimd_min_f32(vacc0123, vmax);
326 vacc4567 = psimd_min_f32(vacc4567, vmax);
327
328 psimd_store_f32(output, vacc0123);
329 psimd_store_f32(output + 4, vacc4567);
330 output += 8;
331 }
332 for (; c >= 4; c -= 4) {
333 psimd_f32 vacc0123p0 = psimd_load_f32(w);
334
335 const psimd_f32 vi0x0123 = psimd_load_f32(i0);
336 i0 += 4;
337
338 const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
339 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
340
341 const psimd_f32 vi1x0123 = psimd_load_f32(i1);
342 i1 += 4;
343
344 const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
345 psimd_f32 vacc0123p1 = psimd_mul_f32(vi1x0123, vk1x0123);
346
347 const psimd_f32 vi2x0123 = psimd_load_f32(i2);
348 i2 += 4;
349
350 const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
351 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
352
353 const psimd_f32 vi3x0123 = psimd_load_f32(i3);
354 i3 += 4;
355
356 const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
357 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi3x0123, vk3x0123);
358
359 const psimd_f32 vi4x0123 = psimd_load_f32(i4);
360 i4 += 4;
361
362 const psimd_f32 vk4x0123 = psimd_load_f32(w + 40);
363 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
364
365 const psimd_f32 vi5x0123 = psimd_load_f32(i5);
366 i5 += 4;
367
368 const psimd_f32 vk5x0123 = psimd_load_f32(w + 48);
369 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi5x0123, vk5x0123);
370
371 const psimd_f32 vi6x0123 = psimd_load_f32(i6);
372 i6 += 4;
373
374 const psimd_f32 vk6x0123 = psimd_load_f32(w + 56);
375 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
376
377 const psimd_f32 vi7x0123 = psimd_load_f32(i7);
378 i7 += 4;
379
380 const psimd_f32 vk7x0123 = psimd_load_f32(w + 64);
381 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi7x0123, vk7x0123);
382
383 const psimd_f32 vi8x0123 = psimd_load_f32(i8);
384 i8 += 4;
385
386 const psimd_f32 vk8x0123 = psimd_load_f32(w + 72);
387 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
388
389 const psimd_f32 vi9x0123 = psimd_load_f32(i9);
390 i9 += 4;
391
392 const psimd_f32 vk9x0123 = psimd_load_f32(w + 80);
393 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi9x0123, vk9x0123);
394
395 const psimd_f32 vi10x0123 = psimd_load_f32(i10);
396 i10 += 4;
397
398 const psimd_f32 vk10x0123 = psimd_load_f32(w + 88);
399 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi10x0123, vk10x0123);
400
401 const psimd_f32 vi11x0123 = psimd_load_f32(i11);
402 i11 += 4;
403
404 const psimd_f32 vk11x0123 = psimd_load_f32(w + 96);
405 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi11x0123, vk11x0123);
406
407 const psimd_f32 vi12x0123 = psimd_load_f32(i12);
408 i12 += 4;
409
410 const psimd_f32 vk12x0123 = psimd_load_f32(w + 104);
411 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi12x0123, vk12x0123);
412
413 const psimd_f32 vi13x0123 = psimd_load_f32(i13);
414 i13 += 4;
415
416 const psimd_f32 vk13x0123 = psimd_load_f32(w + 112);
417 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi13x0123, vk13x0123);
418
419 const psimd_f32 vi14x0123 = psimd_load_f32(i14);
420 i14 += 4;
421
422 const psimd_f32 vk14x0123 = psimd_load_f32(w + 120);
423 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi14x0123, vk14x0123);
424
425 const psimd_f32 vi15x0123 = psimd_load_f32(i15);
426 i15 += 4;
427
428 const psimd_f32 vk15x0123 = psimd_load_f32(w + 128);
429 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi15x0123, vk15x0123);
430
431 const psimd_f32 vi16x0123 = psimd_load_f32(i16);
432 i16 += 4;
433
434 const psimd_f32 vk16x0123 = psimd_load_f32(w + 136);
435 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi16x0123, vk16x0123);
436
437 const psimd_f32 vi17x0123 = psimd_load_f32(i17);
438 i17 += 4;
439
440 const psimd_f32 vk17x0123 = psimd_load_f32(w + 144);
441 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi17x0123, vk17x0123);
442
443 const psimd_f32 vi18x0123 = psimd_load_f32(i18);
444 i18 += 4;
445
446 const psimd_f32 vk18x0123 = psimd_load_f32(w + 152);
447 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi18x0123, vk18x0123);
448
449 const psimd_f32 vi19x0123 = psimd_load_f32(i19);
450 i19 += 4;
451
452 const psimd_f32 vk19x0123 = psimd_load_f32(w + 160);
453 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi19x0123, vk19x0123);
454
455 const psimd_f32 vi20x0123 = psimd_load_f32(i20);
456 i20 += 4;
457
458 const psimd_f32 vk20x0123 = psimd_load_f32(w + 168);
459 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi20x0123, vk20x0123);
460
461 const psimd_f32 vi21x0123 = psimd_load_f32(i21);
462 i21 += 4;
463
464 const psimd_f32 vk21x0123 = psimd_load_f32(w + 176);
465 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi21x0123, vk21x0123);
466
467 const psimd_f32 vi22x0123 = psimd_load_f32(i22);
468 i22 += 4;
469
470 const psimd_f32 vk22x0123 = psimd_load_f32(w + 184);
471 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi22x0123, vk22x0123);
472
473 const psimd_f32 vi23x0123 = psimd_load_f32(i23);
474 i23 += 4;
475
476 const psimd_f32 vk23x0123 = psimd_load_f32(w + 192);
477 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi23x0123, vk23x0123);
478
479 const psimd_f32 vi24x0123 = psimd_load_f32(i24);
480 i24 += 4;
481
482 const psimd_f32 vk24x0123 = psimd_load_f32(w + 200);
483 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi24x0123, vk24x0123);
484
485 w += 4;
486
487 // Add up all accumulators to vacc01234567p0
488 vacc0123p0 = psimd_add_f32(vacc0123p0, vacc0123p1);
489
490 psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
491 vacc0123 = psimd_min_f32(vacc0123, vmax);
492
493 psimd_store_f32(output, vacc0123);
494 output += 4;
495 }
496 if XNN_UNLIKELY(c != 0) {
497 psimd_f32 vacc0123p0 = psimd_load_f32(w);
498
499 const psimd_f32 vi0x0123 = psimd_load_f32(i0);
500 const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
501 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
502
503 const psimd_f32 vi1x0123 = psimd_load_f32(i1);
504 const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
505 psimd_f32 vacc0123p1 = psimd_mul_f32(vi1x0123, vk1x0123);
506
507 const psimd_f32 vi2x0123 = psimd_load_f32(i2);
508 const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
509 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
510
511 const psimd_f32 vi3x0123 = psimd_load_f32(i3);
512 const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
513 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi3x0123, vk3x0123);
514
515 const psimd_f32 vi4x0123 = psimd_load_f32(i4);
516 const psimd_f32 vk4x0123 = psimd_load_f32(w + 40);
517 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
518
519 const psimd_f32 vi5x0123 = psimd_load_f32(i5);
520 const psimd_f32 vk5x0123 = psimd_load_f32(w + 48);
521 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi5x0123, vk5x0123);
522
523 const psimd_f32 vi6x0123 = psimd_load_f32(i6);
524 const psimd_f32 vk6x0123 = psimd_load_f32(w + 56);
525 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
526
527 const psimd_f32 vi7x0123 = psimd_load_f32(i7);
528 const psimd_f32 vk7x0123 = psimd_load_f32(w + 64);
529 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi7x0123, vk7x0123);
530
531 const psimd_f32 vi8x0123 = psimd_load_f32(i8);
532 const psimd_f32 vk8x0123 = psimd_load_f32(w + 72);
533 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
534
535 const psimd_f32 vi9x0123 = psimd_load_f32(i9);
536 const psimd_f32 vk9x0123 = psimd_load_f32(w + 80);
537 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi9x0123, vk9x0123);
538
539 const psimd_f32 vi10x0123 = psimd_load_f32(i10);
540 const psimd_f32 vk10x0123 = psimd_load_f32(w + 88);
541 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi10x0123, vk10x0123);
542
543 const psimd_f32 vi11x0123 = psimd_load_f32(i11);
544 const psimd_f32 vk11x0123 = psimd_load_f32(w + 96);
545 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi11x0123, vk11x0123);
546
547 const psimd_f32 vi12x0123 = psimd_load_f32(i12);
548 const psimd_f32 vk12x0123 = psimd_load_f32(w + 104);
549 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi12x0123, vk12x0123);
550
551 const psimd_f32 vi13x0123 = psimd_load_f32(i13);
552 const psimd_f32 vk13x0123 = psimd_load_f32(w + 112);
553 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi13x0123, vk13x0123);
554
555 const psimd_f32 vi14x0123 = psimd_load_f32(i14);
556 const psimd_f32 vk14x0123 = psimd_load_f32(w + 120);
557 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi14x0123, vk14x0123);
558
559 const psimd_f32 vi15x0123 = psimd_load_f32(i15);
560 const psimd_f32 vk15x0123 = psimd_load_f32(w + 128);
561 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi15x0123, vk15x0123);
562
563 const psimd_f32 vi16x0123 = psimd_load_f32(i16);
564 const psimd_f32 vk16x0123 = psimd_load_f32(w + 136);
565 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi16x0123, vk16x0123);
566
567 const psimd_f32 vi17x0123 = psimd_load_f32(i17);
568 const psimd_f32 vk17x0123 = psimd_load_f32(w + 144);
569 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi17x0123, vk17x0123);
570
571 const psimd_f32 vi18x0123 = psimd_load_f32(i18);
572 const psimd_f32 vk18x0123 = psimd_load_f32(w + 152);
573 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi18x0123, vk18x0123);
574
575 const psimd_f32 vi19x0123 = psimd_load_f32(i19);
576 const psimd_f32 vk19x0123 = psimd_load_f32(w + 160);
577 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi19x0123, vk19x0123);
578
579 const psimd_f32 vi20x0123 = psimd_load_f32(i20);
580 const psimd_f32 vk20x0123 = psimd_load_f32(w + 168);
581 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi20x0123, vk20x0123);
582
583 const psimd_f32 vi21x0123 = psimd_load_f32(i21);
584 const psimd_f32 vk21x0123 = psimd_load_f32(w + 176);
585 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi21x0123, vk21x0123);
586
587 const psimd_f32 vi22x0123 = psimd_load_f32(i22);
588 const psimd_f32 vk22x0123 = psimd_load_f32(w + 184);
589 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi22x0123, vk22x0123);
590
591 const psimd_f32 vi23x0123 = psimd_load_f32(i23);
592 const psimd_f32 vk23x0123 = psimd_load_f32(w + 192);
593 vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi23x0123, vk23x0123);
594
595 const psimd_f32 vi24x0123 = psimd_load_f32(i24);
596 const psimd_f32 vk24x0123 = psimd_load_f32(w + 200);
597 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi24x0123, vk24x0123);
598
599 // Add up all accumulators to vacc01234567p0
600 vacc0123p0 = psimd_add_f32(vacc0123p0, vacc0123p1);
601
602 psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
603 vacc0123 = psimd_min_f32(vacc0123, vmax);
604
605 if (c & 2) {
606 psimd_store2_f32(output, vacc0123);
607 vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
608 output += 2;
609 }
610 if (c & 1) {
611 psimd_store1_f32(output, vacc0123);
612 output += 1;
613 }
614 }
615
616 output = (float*) ((uintptr_t) output + output_increment);
617 } while (--output_width != 0);
618}