// Auto-generated file. Do not edit!
//   Template: src/f32-dwconv/up-psimd.c.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <psimd.h>

#include <xnnpack/dwconv.h>


17void xnn_f32_dwconv_ukernel_up8x25__psimd(
18 size_t channels,
19 size_t output_width,
20 const float** input,
21 const float* weights,
22 float* output,
23 size_t input_stride,
24 size_t output_increment,
Marat Dukhaneb09a6b2020-04-08 17:34:32 -070025 const union xnn_f32_minmax_params params[restrict static 1])
Marat Dukhan5098c3e2019-11-07 12:01:19 -080026{
27 assert(channels != 0);
28 assert(output_width != 0);
29
30 const psimd_f32 vmax = psimd_load_splat_f32(&params->scalar.max);
31 const psimd_f32 vmin = psimd_load_splat_f32(&params->scalar.min);
32 do {
33 const float* i0 = input[0];
Marat Dukhan68660992020-02-03 13:31:12 -080034 assert(i0 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080035 const float* i1 = input[1];
Marat Dukhan68660992020-02-03 13:31:12 -080036 assert(i1 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080037 const float* i2 = input[2];
Marat Dukhan68660992020-02-03 13:31:12 -080038 assert(i2 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080039 const float* i3 = input[3];
Marat Dukhan68660992020-02-03 13:31:12 -080040 assert(i3 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080041 const float* i4 = input[4];
Marat Dukhan68660992020-02-03 13:31:12 -080042 assert(i4 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080043 const float* i5 = input[5];
Marat Dukhan68660992020-02-03 13:31:12 -080044 assert(i5 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080045 const float* i6 = input[6];
Marat Dukhan68660992020-02-03 13:31:12 -080046 assert(i6 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080047 const float* i7 = input[7];
Marat Dukhan68660992020-02-03 13:31:12 -080048 assert(i7 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080049 const float* i8 = input[8];
Marat Dukhan68660992020-02-03 13:31:12 -080050 assert(i8 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080051 const float* i9 = input[9];
Marat Dukhan68660992020-02-03 13:31:12 -080052 assert(i9 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080053 const float* i10 = input[10];
Marat Dukhan68660992020-02-03 13:31:12 -080054 assert(i10 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080055 const float* i11 = input[11];
Marat Dukhan68660992020-02-03 13:31:12 -080056 assert(i11 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080057 const float* i12 = input[12];
Marat Dukhan68660992020-02-03 13:31:12 -080058 assert(i12 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080059 const float* i13 = input[13];
Marat Dukhan68660992020-02-03 13:31:12 -080060 assert(i13 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080061 const float* i14 = input[14];
Marat Dukhan68660992020-02-03 13:31:12 -080062 assert(i14 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080063 const float* i15 = input[15];
Marat Dukhan68660992020-02-03 13:31:12 -080064 assert(i15 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080065 const float* i16 = input[16];
Marat Dukhan68660992020-02-03 13:31:12 -080066 assert(i16 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080067 const float* i17 = input[17];
Marat Dukhan68660992020-02-03 13:31:12 -080068 assert(i17 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080069 const float* i18 = input[18];
Marat Dukhan68660992020-02-03 13:31:12 -080070 assert(i18 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080071 const float* i19 = input[19];
Marat Dukhan68660992020-02-03 13:31:12 -080072 assert(i19 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080073 const float* i20 = input[20];
Marat Dukhan68660992020-02-03 13:31:12 -080074 assert(i20 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080075 const float* i21 = input[21];
Marat Dukhan68660992020-02-03 13:31:12 -080076 assert(i21 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080077 const float* i22 = input[22];
Marat Dukhan68660992020-02-03 13:31:12 -080078 assert(i22 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080079 const float* i23 = input[23];
Marat Dukhan68660992020-02-03 13:31:12 -080080 assert(i23 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080081 const float* i24 = input[24];
Marat Dukhan68660992020-02-03 13:31:12 -080082 assert(i24 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080083 input = (const float**) ((uintptr_t) input + input_stride);
84
85 size_t c = channels;
86 const float* w = weights;
87 for (; c >= 8; c -= 8) {
88 psimd_f32 vacc0123p0 = psimd_load_f32(w);
89 psimd_f32 vacc4567p0 = psimd_load_f32(w + 4);
90
91
92 const psimd_f32 vi0x0123 = psimd_load_f32(i0);
93 const psimd_f32 vi0x4567 = psimd_load_f32(i0 + 4);
94 i0 += 8;
95
96 const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
97 const psimd_f32 vk0x4567 = psimd_load_f32(w + 12);
98 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
99 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi0x4567, vk0x4567);
100
101 const psimd_f32 vi1x0123 = psimd_load_f32(i1);
102 const psimd_f32 vi1x4567 = psimd_load_f32(i1 + 4);
103 i1 += 8;
104
105 const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
106 const psimd_f32 vk1x4567 = psimd_load_f32(w + 20);
107 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123);
108 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi1x4567, vk1x4567);
109
110 const psimd_f32 vi2x0123 = psimd_load_f32(i2);
111 const psimd_f32 vi2x4567 = psimd_load_f32(i2 + 4);
112 i2 += 8;
113
114 const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
115 const psimd_f32 vk2x4567 = psimd_load_f32(w + 28);
116 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
117 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi2x4567, vk2x4567);
118
119 const psimd_f32 vi3x0123 = psimd_load_f32(i3);
120 const psimd_f32 vi3x4567 = psimd_load_f32(i3 + 4);
121 i3 += 8;
122
123 const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
124 const psimd_f32 vk3x4567 = psimd_load_f32(w + 36);
125 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123);
126 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi3x4567, vk3x4567);
127
128 const psimd_f32 vi4x0123 = psimd_load_f32(i4);
129 const psimd_f32 vi4x4567 = psimd_load_f32(i4 + 4);
130 i4 += 8;
131
132 const psimd_f32 vk4x0123 = psimd_load_f32(w + 40);
133 const psimd_f32 vk4x4567 = psimd_load_f32(w + 44);
134 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
135 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi4x4567, vk4x4567);
136
137 const psimd_f32 vi5x0123 = psimd_load_f32(i5);
138 const psimd_f32 vi5x4567 = psimd_load_f32(i5 + 4);
139 i5 += 8;
140
141 const psimd_f32 vk5x0123 = psimd_load_f32(w + 48);
142 const psimd_f32 vk5x4567 = psimd_load_f32(w + 52);
143 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi5x0123, vk5x0123);
144 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi5x4567, vk5x4567);
145
146 const psimd_f32 vi6x0123 = psimd_load_f32(i6);
147 const psimd_f32 vi6x4567 = psimd_load_f32(i6 + 4);
148 i6 += 8;
149
150 const psimd_f32 vk6x0123 = psimd_load_f32(w + 56);
151 const psimd_f32 vk6x4567 = psimd_load_f32(w + 60);
152 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
153 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi6x4567, vk6x4567);
154
155 const psimd_f32 vi7x0123 = psimd_load_f32(i7);
156 const psimd_f32 vi7x4567 = psimd_load_f32(i7 + 4);
157 i7 += 8;
158
159 const psimd_f32 vk7x0123 = psimd_load_f32(w + 64);
160 const psimd_f32 vk7x4567 = psimd_load_f32(w + 68);
161 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi7x0123, vk7x0123);
162 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi7x4567, vk7x4567);
163
164 const psimd_f32 vi8x0123 = psimd_load_f32(i8);
165 const psimd_f32 vi8x4567 = psimd_load_f32(i8 + 4);
166 i8 += 8;
167
168 const psimd_f32 vk8x0123 = psimd_load_f32(w + 72);
169 const psimd_f32 vk8x4567 = psimd_load_f32(w + 76);
170 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
171 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi8x4567, vk8x4567);
172
173 const psimd_f32 vi9x0123 = psimd_load_f32(i9);
174 const psimd_f32 vi9x4567 = psimd_load_f32(i9 + 4);
175 i9 += 8;
176
177 const psimd_f32 vk9x0123 = psimd_load_f32(w + 80);
178 const psimd_f32 vk9x4567 = psimd_load_f32(w + 84);
179 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi9x0123, vk9x0123);
180 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi9x4567, vk9x4567);
181
182 const psimd_f32 vi10x0123 = psimd_load_f32(i10);
183 const psimd_f32 vi10x4567 = psimd_load_f32(i10 + 4);
184 i10 += 8;
185
186 const psimd_f32 vk10x0123 = psimd_load_f32(w + 88);
187 const psimd_f32 vk10x4567 = psimd_load_f32(w + 92);
188 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi10x0123, vk10x0123);
189 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi10x4567, vk10x4567);
190
191 const psimd_f32 vi11x0123 = psimd_load_f32(i11);
192 const psimd_f32 vi11x4567 = psimd_load_f32(i11 + 4);
193 i11 += 8;
194
195 const psimd_f32 vk11x0123 = psimd_load_f32(w + 96);
196 const psimd_f32 vk11x4567 = psimd_load_f32(w + 100);
197 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi11x0123, vk11x0123);
198 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi11x4567, vk11x4567);
199
200 const psimd_f32 vi12x0123 = psimd_load_f32(i12);
201 const psimd_f32 vi12x4567 = psimd_load_f32(i12 + 4);
202 i12 += 8;
203
204 const psimd_f32 vk12x0123 = psimd_load_f32(w + 104);
205 const psimd_f32 vk12x4567 = psimd_load_f32(w + 108);
206 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi12x0123, vk12x0123);
207 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi12x4567, vk12x4567);
208
209 const psimd_f32 vi13x0123 = psimd_load_f32(i13);
210 const psimd_f32 vi13x4567 = psimd_load_f32(i13 + 4);
211 i13 += 8;
212
213 const psimd_f32 vk13x0123 = psimd_load_f32(w + 112);
214 const psimd_f32 vk13x4567 = psimd_load_f32(w + 116);
215 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi13x0123, vk13x0123);
216 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi13x4567, vk13x4567);
217
218 const psimd_f32 vi14x0123 = psimd_load_f32(i14);
219 const psimd_f32 vi14x4567 = psimd_load_f32(i14 + 4);
220 i14 += 8;
221
222 const psimd_f32 vk14x0123 = psimd_load_f32(w + 120);
223 const psimd_f32 vk14x4567 = psimd_load_f32(w + 124);
224 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi14x0123, vk14x0123);
225 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi14x4567, vk14x4567);
226
227 const psimd_f32 vi15x0123 = psimd_load_f32(i15);
228 const psimd_f32 vi15x4567 = psimd_load_f32(i15 + 4);
229 i15 += 8;
230
231 const psimd_f32 vk15x0123 = psimd_load_f32(w + 128);
232 const psimd_f32 vk15x4567 = psimd_load_f32(w + 132);
233 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi15x0123, vk15x0123);
234 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi15x4567, vk15x4567);
235
236 const psimd_f32 vi16x0123 = psimd_load_f32(i16);
237 const psimd_f32 vi16x4567 = psimd_load_f32(i16 + 4);
238 i16 += 8;
239
240 const psimd_f32 vk16x0123 = psimd_load_f32(w + 136);
241 const psimd_f32 vk16x4567 = psimd_load_f32(w + 140);
242 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi16x0123, vk16x0123);
243 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi16x4567, vk16x4567);
244
245 const psimd_f32 vi17x0123 = psimd_load_f32(i17);
246 const psimd_f32 vi17x4567 = psimd_load_f32(i17 + 4);
247 i17 += 8;
248
249 const psimd_f32 vk17x0123 = psimd_load_f32(w + 144);
250 const psimd_f32 vk17x4567 = psimd_load_f32(w + 148);
251 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi17x0123, vk17x0123);
252 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi17x4567, vk17x4567);
253
254 const psimd_f32 vi18x0123 = psimd_load_f32(i18);
255 const psimd_f32 vi18x4567 = psimd_load_f32(i18 + 4);
256 i18 += 8;
257
258 const psimd_f32 vk18x0123 = psimd_load_f32(w + 152);
259 const psimd_f32 vk18x4567 = psimd_load_f32(w + 156);
260 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi18x0123, vk18x0123);
261 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi18x4567, vk18x4567);
262
263 const psimd_f32 vi19x0123 = psimd_load_f32(i19);
264 const psimd_f32 vi19x4567 = psimd_load_f32(i19 + 4);
265 i19 += 8;
266
267 const psimd_f32 vk19x0123 = psimd_load_f32(w + 160);
268 const psimd_f32 vk19x4567 = psimd_load_f32(w + 164);
269 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi19x0123, vk19x0123);
270 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi19x4567, vk19x4567);
271
272 const psimd_f32 vi20x0123 = psimd_load_f32(i20);
273 const psimd_f32 vi20x4567 = psimd_load_f32(i20 + 4);
274 i20 += 8;
275
276 const psimd_f32 vk20x0123 = psimd_load_f32(w + 168);
277 const psimd_f32 vk20x4567 = psimd_load_f32(w + 172);
278 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi20x0123, vk20x0123);
279 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi20x4567, vk20x4567);
280
281 const psimd_f32 vi21x0123 = psimd_load_f32(i21);
282 const psimd_f32 vi21x4567 = psimd_load_f32(i21 + 4);
283 i21 += 8;
284
285 const psimd_f32 vk21x0123 = psimd_load_f32(w + 176);
286 const psimd_f32 vk21x4567 = psimd_load_f32(w + 180);
287 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi21x0123, vk21x0123);
288 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi21x4567, vk21x4567);
289
290 const psimd_f32 vi22x0123 = psimd_load_f32(i22);
291 const psimd_f32 vi22x4567 = psimd_load_f32(i22 + 4);
292 i22 += 8;
293
294 const psimd_f32 vk22x0123 = psimd_load_f32(w + 184);
295 const psimd_f32 vk22x4567 = psimd_load_f32(w + 188);
296 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi22x0123, vk22x0123);
297 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi22x4567, vk22x4567);
298
299 const psimd_f32 vi23x0123 = psimd_load_f32(i23);
300 const psimd_f32 vi23x4567 = psimd_load_f32(i23 + 4);
301 i23 += 8;
302
303 const psimd_f32 vk23x0123 = psimd_load_f32(w + 192);
304 const psimd_f32 vk23x4567 = psimd_load_f32(w + 196);
305 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi23x0123, vk23x0123);
306 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi23x4567, vk23x4567);
307
308 const psimd_f32 vi24x0123 = psimd_load_f32(i24);
309 const psimd_f32 vi24x4567 = psimd_load_f32(i24 + 4);
310 i24 += 8;
311
312 const psimd_f32 vk24x0123 = psimd_load_f32(w + 200);
313 const psimd_f32 vk24x4567 = psimd_load_f32(w + 204);
314 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi24x0123, vk24x0123);
315 vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi24x4567, vk24x4567);
316
317 w += 208;
318
319
320 psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
321 psimd_f32 vacc4567 = psimd_max_f32(vacc4567p0, vmin);
322 vacc0123 = psimd_min_f32(vacc0123, vmax);
323 vacc4567 = psimd_min_f32(vacc4567, vmax);
324
325 psimd_store_f32(output, vacc0123);
326 psimd_store_f32(output + 4, vacc4567);
327 output += 8;
328 }
329 for (; c >= 4; c -= 4) {
330 psimd_f32 vacc0123p0 = psimd_load_f32(w);
331
332 const psimd_f32 vi0x0123 = psimd_load_f32(i0);
333 i0 += 4;
334
335 const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
336 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
337
338 const psimd_f32 vi1x0123 = psimd_load_f32(i1);
339 i1 += 4;
340
341 const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
342 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123);
343
344 const psimd_f32 vi2x0123 = psimd_load_f32(i2);
345 i2 += 4;
346
347 const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
348 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
349
350 const psimd_f32 vi3x0123 = psimd_load_f32(i3);
351 i3 += 4;
352
353 const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
354 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123);
355
356 const psimd_f32 vi4x0123 = psimd_load_f32(i4);
357 i4 += 4;
358
359 const psimd_f32 vk4x0123 = psimd_load_f32(w + 40);
360 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
361
362 const psimd_f32 vi5x0123 = psimd_load_f32(i5);
363 i5 += 4;
364
365 const psimd_f32 vk5x0123 = psimd_load_f32(w + 48);
366 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi5x0123, vk5x0123);
367
368 const psimd_f32 vi6x0123 = psimd_load_f32(i6);
369 i6 += 4;
370
371 const psimd_f32 vk6x0123 = psimd_load_f32(w + 56);
372 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
373
374 const psimd_f32 vi7x0123 = psimd_load_f32(i7);
375 i7 += 4;
376
377 const psimd_f32 vk7x0123 = psimd_load_f32(w + 64);
378 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi7x0123, vk7x0123);
379
380 const psimd_f32 vi8x0123 = psimd_load_f32(i8);
381 i8 += 4;
382
383 const psimd_f32 vk8x0123 = psimd_load_f32(w + 72);
384 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
385
386 const psimd_f32 vi9x0123 = psimd_load_f32(i9);
387 i9 += 4;
388
389 const psimd_f32 vk9x0123 = psimd_load_f32(w + 80);
390 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi9x0123, vk9x0123);
391
392 const psimd_f32 vi10x0123 = psimd_load_f32(i10);
393 i10 += 4;
394
395 const psimd_f32 vk10x0123 = psimd_load_f32(w + 88);
396 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi10x0123, vk10x0123);
397
398 const psimd_f32 vi11x0123 = psimd_load_f32(i11);
399 i11 += 4;
400
401 const psimd_f32 vk11x0123 = psimd_load_f32(w + 96);
402 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi11x0123, vk11x0123);
403
404 const psimd_f32 vi12x0123 = psimd_load_f32(i12);
405 i12 += 4;
406
407 const psimd_f32 vk12x0123 = psimd_load_f32(w + 104);
408 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi12x0123, vk12x0123);
409
410 const psimd_f32 vi13x0123 = psimd_load_f32(i13);
411 i13 += 4;
412
413 const psimd_f32 vk13x0123 = psimd_load_f32(w + 112);
414 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi13x0123, vk13x0123);
415
416 const psimd_f32 vi14x0123 = psimd_load_f32(i14);
417 i14 += 4;
418
419 const psimd_f32 vk14x0123 = psimd_load_f32(w + 120);
420 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi14x0123, vk14x0123);
421
422 const psimd_f32 vi15x0123 = psimd_load_f32(i15);
423 i15 += 4;
424
425 const psimd_f32 vk15x0123 = psimd_load_f32(w + 128);
426 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi15x0123, vk15x0123);
427
428 const psimd_f32 vi16x0123 = psimd_load_f32(i16);
429 i16 += 4;
430
431 const psimd_f32 vk16x0123 = psimd_load_f32(w + 136);
432 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi16x0123, vk16x0123);
433
434 const psimd_f32 vi17x0123 = psimd_load_f32(i17);
435 i17 += 4;
436
437 const psimd_f32 vk17x0123 = psimd_load_f32(w + 144);
438 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi17x0123, vk17x0123);
439
440 const psimd_f32 vi18x0123 = psimd_load_f32(i18);
441 i18 += 4;
442
443 const psimd_f32 vk18x0123 = psimd_load_f32(w + 152);
444 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi18x0123, vk18x0123);
445
446 const psimd_f32 vi19x0123 = psimd_load_f32(i19);
447 i19 += 4;
448
449 const psimd_f32 vk19x0123 = psimd_load_f32(w + 160);
450 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi19x0123, vk19x0123);
451
452 const psimd_f32 vi20x0123 = psimd_load_f32(i20);
453 i20 += 4;
454
455 const psimd_f32 vk20x0123 = psimd_load_f32(w + 168);
456 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi20x0123, vk20x0123);
457
458 const psimd_f32 vi21x0123 = psimd_load_f32(i21);
459 i21 += 4;
460
461 const psimd_f32 vk21x0123 = psimd_load_f32(w + 176);
462 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi21x0123, vk21x0123);
463
464 const psimd_f32 vi22x0123 = psimd_load_f32(i22);
465 i22 += 4;
466
467 const psimd_f32 vk22x0123 = psimd_load_f32(w + 184);
468 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi22x0123, vk22x0123);
469
470 const psimd_f32 vi23x0123 = psimd_load_f32(i23);
471 i23 += 4;
472
473 const psimd_f32 vk23x0123 = psimd_load_f32(w + 192);
474 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi23x0123, vk23x0123);
475
476 const psimd_f32 vi24x0123 = psimd_load_f32(i24);
477 i24 += 4;
478
479 const psimd_f32 vk24x0123 = psimd_load_f32(w + 200);
480 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi24x0123, vk24x0123);
481
482 w += 4;
483
484
485 psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
486 vacc0123 = psimd_min_f32(vacc0123, vmax);
487
488 psimd_store_f32(output, vacc0123);
489 output += 4;
490 }
491 if XNN_UNLIKELY(c != 0) {
492 psimd_f32 vacc0123p0 = psimd_load_f32(w);
493
494 const psimd_f32 vi0x0123 = psimd_load_f32(i0);
495 const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
496 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
497
498 const psimd_f32 vi1x0123 = psimd_load_f32(i1);
499 const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
500 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123);
501
502 const psimd_f32 vi2x0123 = psimd_load_f32(i2);
503 const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
504 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
505
506 const psimd_f32 vi3x0123 = psimd_load_f32(i3);
507 const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
508 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123);
509
510 const psimd_f32 vi4x0123 = psimd_load_f32(i4);
511 const psimd_f32 vk4x0123 = psimd_load_f32(w + 40);
512 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
513
514 const psimd_f32 vi5x0123 = psimd_load_f32(i5);
515 const psimd_f32 vk5x0123 = psimd_load_f32(w + 48);
516 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi5x0123, vk5x0123);
517
518 const psimd_f32 vi6x0123 = psimd_load_f32(i6);
519 const psimd_f32 vk6x0123 = psimd_load_f32(w + 56);
520 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
521
522 const psimd_f32 vi7x0123 = psimd_load_f32(i7);
523 const psimd_f32 vk7x0123 = psimd_load_f32(w + 64);
524 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi7x0123, vk7x0123);
525
526 const psimd_f32 vi8x0123 = psimd_load_f32(i8);
527 const psimd_f32 vk8x0123 = psimd_load_f32(w + 72);
528 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
529
530 const psimd_f32 vi9x0123 = psimd_load_f32(i9);
531 const psimd_f32 vk9x0123 = psimd_load_f32(w + 80);
532 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi9x0123, vk9x0123);
533
534 const psimd_f32 vi10x0123 = psimd_load_f32(i10);
535 const psimd_f32 vk10x0123 = psimd_load_f32(w + 88);
536 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi10x0123, vk10x0123);
537
538 const psimd_f32 vi11x0123 = psimd_load_f32(i11);
539 const psimd_f32 vk11x0123 = psimd_load_f32(w + 96);
540 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi11x0123, vk11x0123);
541
542 const psimd_f32 vi12x0123 = psimd_load_f32(i12);
543 const psimd_f32 vk12x0123 = psimd_load_f32(w + 104);
544 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi12x0123, vk12x0123);
545
546 const psimd_f32 vi13x0123 = psimd_load_f32(i13);
547 const psimd_f32 vk13x0123 = psimd_load_f32(w + 112);
548 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi13x0123, vk13x0123);
549
550 const psimd_f32 vi14x0123 = psimd_load_f32(i14);
551 const psimd_f32 vk14x0123 = psimd_load_f32(w + 120);
552 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi14x0123, vk14x0123);
553
554 const psimd_f32 vi15x0123 = psimd_load_f32(i15);
555 const psimd_f32 vk15x0123 = psimd_load_f32(w + 128);
556 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi15x0123, vk15x0123);
557
558 const psimd_f32 vi16x0123 = psimd_load_f32(i16);
559 const psimd_f32 vk16x0123 = psimd_load_f32(w + 136);
560 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi16x0123, vk16x0123);
561
562 const psimd_f32 vi17x0123 = psimd_load_f32(i17);
563 const psimd_f32 vk17x0123 = psimd_load_f32(w + 144);
564 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi17x0123, vk17x0123);
565
566 const psimd_f32 vi18x0123 = psimd_load_f32(i18);
567 const psimd_f32 vk18x0123 = psimd_load_f32(w + 152);
568 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi18x0123, vk18x0123);
569
570 const psimd_f32 vi19x0123 = psimd_load_f32(i19);
571 const psimd_f32 vk19x0123 = psimd_load_f32(w + 160);
572 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi19x0123, vk19x0123);
573
574 const psimd_f32 vi20x0123 = psimd_load_f32(i20);
575 const psimd_f32 vk20x0123 = psimd_load_f32(w + 168);
576 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi20x0123, vk20x0123);
577
578 const psimd_f32 vi21x0123 = psimd_load_f32(i21);
579 const psimd_f32 vk21x0123 = psimd_load_f32(w + 176);
580 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi21x0123, vk21x0123);
581
582 const psimd_f32 vi22x0123 = psimd_load_f32(i22);
583 const psimd_f32 vk22x0123 = psimd_load_f32(w + 184);
584 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi22x0123, vk22x0123);
585
586 const psimd_f32 vi23x0123 = psimd_load_f32(i23);
587 const psimd_f32 vk23x0123 = psimd_load_f32(w + 192);
588 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi23x0123, vk23x0123);
589
590 const psimd_f32 vi24x0123 = psimd_load_f32(i24);
591 const psimd_f32 vk24x0123 = psimd_load_f32(w + 200);
592 vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi24x0123, vk24x0123);
593
594
595 psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
596 vacc0123 = psimd_min_f32(vacc0123, vmax);
597
598 if (c & 2) {
599 psimd_store2_f32(output, vacc0123);
600 vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
601 output += 2;
602 }
603 if (c & 1) {
604 psimd_store1_f32(output, vacc0123);
605 output += 1;
606 }
607 }
608
609 output = (float*) ((uintptr_t) output + output_increment);
610 } while (--output_width != 0);
611}