blob: 617c95af7653fb92f8edffb5692bc2c37159dd5d [file] [log] [blame]
// Auto-generated file. Do not edit!
//   Template: src/f32-conv-hwc/3x3s2p0p1c3-neon-x1.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
9
10
11#include <assert.h>
12
13#include <arm_neon.h>
14
15#include <xnnpack/conv.h>
16#include <xnnpack/math.h>
17
18
19void xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1(
20 size_t input_height,
21 size_t input_width,
22 size_t output_y_start,
23 size_t output_y_end,
24 const float* input,
25 const float* zero,
26 const float* weights,
27 float* output,
28 size_t input_padding_top,
29 size_t output_channels,
30 size_t output_height_stride,
31 size_t output_width_stride,
Frank Barchard361e44a2020-06-08 18:18:25 -070032 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
Marat Dukhan56b10cd2020-05-18 09:35:49 -070033{
34 assert(input_width != 0);
35 assert(output_y_end > output_y_start);
36 assert(input_padding_top <= 1);
37 assert(output_channels != 0);
38
39 const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
40 const size_t input_width_decrement = (4 + ((input_width - 1) & 1) * 2 + (round_down_po2(input_width - 1, 2) * 3 /* channels */)) * sizeof(float);
41 const size_t output_width = input_width / 2;
42 const size_t output_channel_decrement = output_width * output_width_stride - 8 * sizeof(float);
43 const size_t output_height_increment = output_height_stride * 2 - round_up_po2(output_channels, 8) * sizeof(float);
44
45 // Adjustment for padding processed below
46 const float* i0 = (const float*) ((uintptr_t) input +
47 input_height_stride * (output_y_start * 2 /* vertical stride */ - input_padding_top));
48 const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
49 const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
50 const float* i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
51 const float* i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
52 float* o0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
53 float* o1 = (float*) ((uintptr_t) o0 + output_height_stride);
54
55 if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
56 i0 = zero;
57 }
58
59 const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
60 const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
61
62 for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 2) {
63 const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
64 const size_t input_y4 = input_y2 + 2;
65 if XNN_UNPREDICTABLE(input_y2 > input_height) {
66 i1 = zero;
67 }
68 if XNN_UNPREDICTABLE(input_y2 >= input_height) {
69 i2 = zero;
70 }
71 if XNN_UNPREDICTABLE(input_y4 > input_height) {
72 i3 = zero;
73 }
74 if XNN_UNPREDICTABLE(input_y4 >= input_height) {
75 i4 = zero;
76 }
77 if XNN_UNPREDICTABLE(output_y + 2 > output_y_end) {
78 o1 = o0;
79 }
80
81 const float* w = weights;
82 size_t c = output_channels;
83 do {
84 // viMx0 = ( iM1c0, iM0c2, iM0c1, iM0c0 )
85 float32x4_t vi0x0 = vld1q_f32(i0); i0 += 4;
86 float32x4_t vi1x0 = vld1q_f32(i1); i1 += 4;
87 float32x4_t vi2x0 = vld1q_f32(i2); i2 += 4;
88 float32x4_t vi3x0 = vld1q_f32(i3); i3 += 4;
89 float32x4_t vi4x0 = vld1q_f32(i4); i4 += 4;
90
91 size_t iw = input_width - 1;
92 for (; iw >= 2; iw -= 2) {
93 float32x4_t vo0c0123 = vld1q_f32(w);
94 float32x4_t vo0c4567 = vld1q_f32(w + 4);
95 float32x4_t vo1c0123 = vo0c0123;
96 float32x4_t vo1c4567 = vo0c4567;
97
98 const float32x4_t vk00c0x0123 = vld1q_f32(w + 8);
99 const float32x4_t vk00c0x4567 = vld1q_f32(w + 12);
100
101 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c0x0123, vget_low_f32(vi0x0), 0);
102 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 0);
103 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk00c0x4567, vget_low_f32(vi0x0), 0);
104 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk00c0x4567, vget_low_f32(vi2x0), 0);
105
106 const float32x4_t vk10c0x0123 = vld1q_f32(w + 16);
107 const float32x4_t vk10c0x4567 = vld1q_f32(w + 20);
108
109 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c0x0123, vget_low_f32(vi1x0), 0);
110 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c0x0123, vget_low_f32(vi3x0), 0);
111 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk10c0x4567, vget_low_f32(vi1x0), 0);
112 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk10c0x4567, vget_low_f32(vi3x0), 0);
113
114 const float32x4_t vk20c0x0123 = vld1q_f32(w + 24);
115 const float32x4_t vk20c0x4567 = vld1q_f32(w + 28);
116
117 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 0);
118 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c0x0123, vget_low_f32(vi4x0), 0);
119 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk20c0x4567, vget_low_f32(vi2x0), 0);
120 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk20c0x4567, vget_low_f32(vi4x0), 0);
121
122 const float32x4_t vk00c1x0123 = vld1q_f32(w + 32);
123 const float32x4_t vk00c1x4567 = vld1q_f32(w + 36);
124
125 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c1x0123, vget_low_f32(vi0x0), 1);
126 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c1x0123, vget_low_f32(vi2x0), 1);
127 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk00c1x4567, vget_low_f32(vi0x0), 1);
128 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk00c1x4567, vget_low_f32(vi2x0), 1);
129
130 const float32x4_t vk10c1x0123 = vld1q_f32(w + 40);
131 const float32x4_t vk10c1x4567 = vld1q_f32(w + 44);
132
133 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c1x0123, vget_low_f32(vi1x0), 1);
134 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c1x0123, vget_low_f32(vi3x0), 1);
135 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk10c1x4567, vget_low_f32(vi1x0), 1);
136 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk10c1x4567, vget_low_f32(vi3x0), 1);
137
138 const float32x4_t vk20c1x0123 = vld1q_f32(w + 48);
139 const float32x4_t vk20c1x4567 = vld1q_f32(w + 52);
140
141 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c1x0123, vget_low_f32(vi2x0), 1);
142 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c1x0123, vget_low_f32(vi4x0), 1);
143 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk20c1x4567, vget_low_f32(vi2x0), 1);
144 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk20c1x4567, vget_low_f32(vi4x0), 1);
145
146 const float32x4_t vk00c2x0123 = vld1q_f32(w + 56);
147 const float32x4_t vk00c2x4567 = vld1q_f32(w + 60);
148
149 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c2x0123, vget_high_f32(vi0x0), 0);
150 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 0);
151 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk00c2x4567, vget_high_f32(vi0x0), 0);
152 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk00c2x4567, vget_high_f32(vi2x0), 0);
153
154 const float32x4_t vk10c2x0123 = vld1q_f32(w + 64);
155 const float32x4_t vk10c2x4567 = vld1q_f32(w + 68);
156
157 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c2x0123, vget_high_f32(vi1x0), 0);
158 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c2x0123, vget_high_f32(vi3x0), 0);
159 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk10c2x4567, vget_high_f32(vi1x0), 0);
160 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk10c2x4567, vget_high_f32(vi3x0), 0);
161
162 const float32x4_t vk20c2x0123 = vld1q_f32(w + 72);
163 const float32x4_t vk20c2x4567 = vld1q_f32(w + 76);
164
165 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 0);
166 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c2x0123, vget_high_f32(vi4x0), 0);
167 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk20c2x4567, vget_high_f32(vi2x0), 0);
168 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk20c2x4567, vget_high_f32(vi4x0), 0);
169
170 const float32x4_t vk01c0x0123 = vld1q_f32(w + 80);
171 const float32x4_t vk01c0x4567 = vld1q_f32(w + 84);
172
173 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c0x0123, vget_high_f32(vi0x0), 1);
174 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c0x0123, vget_high_f32(vi2x0), 1);
175 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk01c0x4567, vget_high_f32(vi0x0), 1);
176 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk01c0x4567, vget_high_f32(vi2x0), 1);
177
178 const float32x4_t vk11c0x0123 = vld1q_f32(w + 88);
179 const float32x4_t vk11c0x4567 = vld1q_f32(w + 92);
180
181 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c0x0123, vget_high_f32(vi1x0), 1);
182 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c0x0123, vget_high_f32(vi3x0), 1);
183 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk11c0x4567, vget_high_f32(vi1x0), 1);
184 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk11c0x4567, vget_high_f32(vi3x0), 1);
185
186 const float32x4_t vk21c0x0123 = vld1q_f32(w + 96);
187 const float32x4_t vk21c0x4567 = vld1q_f32(w + 100);
188
189 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c0x0123, vget_high_f32(vi2x0), 1);
190 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c0x0123, vget_high_f32(vi4x0), 1);
191 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk21c0x4567, vget_high_f32(vi2x0), 1);
192 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk21c0x4567, vget_high_f32(vi4x0), 1);
193
194 const float32x4_t vk01c1x0123 = vld1q_f32(w + 104);
195 const float32x4_t vk01c1x4567 = vld1q_f32(w + 108);
196
197 // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
198 const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 4;
199 const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 4;
200 const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4;
201 const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 4;
202 const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 4;
203
204 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c1x0123, vget_low_f32(vi0x1), 0);
205 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c1x0123, vget_low_f32(vi2x1), 0);
206 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk01c1x4567, vget_low_f32(vi0x1), 0);
207 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk01c1x4567, vget_low_f32(vi2x1), 0);
208
209 const float32x4_t vk11c1x0123 = vld1q_f32(w + 112);
210 const float32x4_t vk11c1x4567 = vld1q_f32(w + 116);
211
212 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c1x0123, vget_low_f32(vi1x1), 0);
213 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c1x0123, vget_low_f32(vi3x1), 0);
214 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk11c1x4567, vget_low_f32(vi1x1), 0);
215 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk11c1x4567, vget_low_f32(vi3x1), 0);
216
217 const float32x4_t vk21c1x0123 = vld1q_f32(w + 120);
218 const float32x4_t vk21c1x4567 = vld1q_f32(w + 124);
219
220 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c1x0123, vget_low_f32(vi2x1), 0);
221 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c1x0123, vget_low_f32(vi4x1), 0);
222 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk21c1x4567, vget_low_f32(vi2x1), 0);
223 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk21c1x4567, vget_low_f32(vi4x1), 0);
224
225 const float32x4_t vk01c2x0123 = vld1q_f32(w + 128);
226 const float32x4_t vk01c2x4567 = vld1q_f32(w + 132);
227
228 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c2x0123, vget_low_f32(vi0x1), 1);
229 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c2x0123, vget_low_f32(vi2x1), 1);
230 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk01c2x4567, vget_low_f32(vi0x1), 1);
231 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk01c2x4567, vget_low_f32(vi2x1), 1);
232
233 const float32x4_t vk11c2x0123 = vld1q_f32(w + 136);
234 const float32x4_t vk11c2x4567 = vld1q_f32(w + 140);
235
236 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c2x0123, vget_low_f32(vi1x1), 1);
237 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c2x0123, vget_low_f32(vi3x1), 1);
238 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk11c2x4567, vget_low_f32(vi1x1), 1);
239 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk11c2x4567, vget_low_f32(vi3x1), 1);
240
241 const float32x4_t vk21c2x0123 = vld1q_f32(w + 144);
242 const float32x4_t vk21c2x4567 = vld1q_f32(w + 148);
243
244 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c2x0123, vget_low_f32(vi2x1), 1);
245 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c2x0123, vget_low_f32(vi4x1), 1);
246 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk21c2x4567, vget_low_f32(vi2x1), 1);
247 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk21c2x4567, vget_low_f32(vi4x1), 1);
248
249 const float32x4_t vk02c0x0123 = vld1q_f32(w + 152);
250 const float32x4_t vk02c0x4567 = vld1q_f32(w + 156);
251
252 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk02c0x0123, vget_high_f32(vi0x1), 0);
253 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk02c0x0123, vget_high_f32(vi2x1), 0);
254 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk02c0x4567, vget_high_f32(vi0x1), 0);
255 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk02c0x4567, vget_high_f32(vi2x1), 0);
256
257 const float32x4_t vk12c0x0123 = vld1q_f32(w + 160);
258 const float32x4_t vk12c0x4567 = vld1q_f32(w + 164);
259
260 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk12c0x0123, vget_high_f32(vi1x1), 0);
261 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk12c0x0123, vget_high_f32(vi3x1), 0);
262 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk12c0x4567, vget_high_f32(vi1x1), 0);
263 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk12c0x4567, vget_high_f32(vi3x1), 0);
264
265 const float32x4_t vk22c0x0123 = vld1q_f32(w + 168);
266 const float32x4_t vk22c0x4567 = vld1q_f32(w + 172);
267
268 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk22c0x0123, vget_high_f32(vi2x1), 0);
269 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk22c0x0123, vget_high_f32(vi4x1), 0);
270 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk22c0x4567, vget_high_f32(vi2x1), 0);
271 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk22c0x4567, vget_high_f32(vi4x1), 0);
272
273 const float32x4_t vk02c1x0123 = vld1q_f32(w + 176);
274 const float32x4_t vk02c1x4567 = vld1q_f32(w + 180);
275
276 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk02c1x0123, vget_high_f32(vi0x1), 1);
277 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk02c1x0123, vget_high_f32(vi2x1), 1);
278 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk02c1x4567, vget_high_f32(vi0x1), 1);
279 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk02c1x4567, vget_high_f32(vi2x1), 1);
280
281 const float32x4_t vk12c1x0123 = vld1q_f32(w + 184);
282 const float32x4_t vk12c1x4567 = vld1q_f32(w + 188);
283
284 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk12c1x0123, vget_high_f32(vi1x1), 1);
285 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk12c1x0123, vget_high_f32(vi3x1), 1);
286 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk12c1x4567, vget_high_f32(vi1x1), 1);
287 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk12c1x4567, vget_high_f32(vi3x1), 1);
288
289 const float32x4_t vk22c1x0123 = vld1q_f32(w + 192);
290 const float32x4_t vk22c1x4567 = vld1q_f32(w + 196);
291
292 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk22c1x0123, vget_high_f32(vi2x1), 1);
293 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk22c1x0123, vget_high_f32(vi4x1), 1);
294 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk22c1x4567, vget_high_f32(vi2x1), 1);
295 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk22c1x4567, vget_high_f32(vi4x1), 1);
296
297 const float32x4_t vk02c2x0123 = vld1q_f32(w + 200);
298 const float32x4_t vk02c2x4567 = vld1q_f32(w + 204);
299
300 // viMx2 = ( iM2c2, iM2c1 )
301 const float32x2_t vi0x2 = vld1_f32(i0); i0 += 2;
302 const float32x2_t vi1x2 = vld1_f32(i1); i1 += 2;
303 const float32x2_t vi2x2 = vld1_f32(i2); i2 += 2;
304 const float32x2_t vi3x2 = vld1_f32(i3); i3 += 2;
305 const float32x2_t vi4x2 = vld1_f32(i4); i4 += 2;
306
307 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk02c2x0123, vi0x2, 0);
308 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk02c2x0123, vi2x2, 0);
309 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk02c2x4567, vi0x2, 0);
310 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk02c2x4567, vi2x2, 0);
311
312 const float32x4_t vk12c2x0123 = vld1q_f32(w + 208);
313 const float32x4_t vk12c2x4567 = vld1q_f32(w + 212);
314
315 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk12c2x0123, vi1x2, 0);
316 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk12c2x0123, vi3x2, 0);
317 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk12c2x4567, vi1x2, 0);
318 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk12c2x4567, vi3x2, 0);
319
320 const float32x4_t vk22c2x0123 = vld1q_f32(w + 216);
321 const float32x4_t vk22c2x4567 = vld1q_f32(w + 220);
322
323 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk22c2x0123, vi2x2, 0);
324 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk22c2x0123, vi4x2, 0);
325 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk22c2x4567, vi2x2, 0);
326 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk22c2x4567, vi4x2, 0);
327
328 vi0x0 = vcombine_f32(vget_high_f32(vi0x1), vi0x2);
329 vi1x0 = vcombine_f32(vget_high_f32(vi1x1), vi1x2);
330 vi2x0 = vcombine_f32(vget_high_f32(vi2x1), vi2x2);
331 vi3x0 = vcombine_f32(vget_high_f32(vi3x1), vi3x2);
332 vi4x0 = vcombine_f32(vget_high_f32(vi4x1), vi4x2);
333
334
335 vo0c0123 = vmaxq_f32(vo0c0123, vmin);
336 vo1c0123 = vmaxq_f32(vo1c0123, vmin);
337 vo0c4567 = vmaxq_f32(vo0c4567, vmin);
338 vo1c4567 = vmaxq_f32(vo1c4567, vmin);
339
340 vo0c0123 = vminq_f32(vo0c0123, vmax);
341 vo1c0123 = vminq_f32(vo1c0123, vmax);
342 vo0c4567 = vminq_f32(vo0c4567, vmax);
343 vo1c4567 = vminq_f32(vo1c4567, vmax);
344
345 if XNN_LIKELY(c >= 8) {
346 vst1q_f32(o1, vo1c0123);
347 vst1q_f32(o1 + 4, vo1c4567);
348 o1 = (float*) ((uintptr_t) o1 + output_width_stride);
349 vst1q_f32(o0, vo0c0123);
350 vst1q_f32(o0 + 4, vo0c4567);
351 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
352 } else {
353 float* o0_tmp = o0;
354 float* o1_tmp = o1;
355 if (c & 4) {
356 vst1q_f32(o1_tmp, vo1c0123); o1_tmp += 4;
357 vo1c0123 = vo1c4567;
358 vst1q_f32(o0_tmp, vo0c0123); o0_tmp += 4;
359 vo0c0123 = vo0c4567;
360 }
361 float32x2_t vo0c01 = vget_low_f32(vo0c0123);
362 float32x2_t vo1c01 = vget_low_f32(vo1c0123);
363 if (c & 2) {
364 vst1_f32(o1_tmp, vo1c01); o1_tmp += 2;
365 vo1c01 = vget_high_f32(vo1c0123);
366 vst1_f32(o0_tmp, vo0c01); o0_tmp += 2;
367 vo0c01 = vget_high_f32(vo0c0123);
368 }
369 if (c & 1) {
370 vst1_lane_f32(o1_tmp, vo1c01, 0);
371 vst1_lane_f32(o0_tmp, vo0c01, 0);
372 }
373
374 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
375 o1 = (float*) ((uintptr_t) o1 + output_width_stride);
376 }
377 }
378 assert(iw < 2);
379 if XNN_LIKELY(iw & 1) {
380 float32x4_t vo0c0123 = vld1q_f32(w);
381 float32x4_t vo0c4567 = vld1q_f32(w + 4);
382 float32x4_t vo1c0123 = vo0c0123;
383 float32x4_t vo1c4567 = vo0c4567;
384
385 const float32x4_t vk00c0x0123 = vld1q_f32(w + 8);
386 const float32x4_t vk00c0x4567 = vld1q_f32(w + 12);
387
388 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c0x0123, vget_low_f32(vi0x0), 0);
389 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 0);
390 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk00c0x4567, vget_low_f32(vi0x0), 0);
391 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk00c0x4567, vget_low_f32(vi2x0), 0);
392
393 const float32x4_t vk10c0x0123 = vld1q_f32(w + 16);
394 const float32x4_t vk10c0x4567 = vld1q_f32(w + 20);
395
396 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c0x0123, vget_low_f32(vi1x0), 0);
397 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c0x0123, vget_low_f32(vi3x0), 0);
398 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk10c0x4567, vget_low_f32(vi1x0), 0);
399 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk10c0x4567, vget_low_f32(vi3x0), 0);
400
401 const float32x4_t vk20c0x0123 = vld1q_f32(w + 24);
402 const float32x4_t vk20c0x4567 = vld1q_f32(w + 28);
403
404 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 0);
405 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c0x0123, vget_low_f32(vi4x0), 0);
406 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk20c0x4567, vget_low_f32(vi2x0), 0);
407 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk20c0x4567, vget_low_f32(vi4x0), 0);
408
409 const float32x4_t vk00c1x0123 = vld1q_f32(w + 32);
410 const float32x4_t vk00c1x4567 = vld1q_f32(w + 36);
411
412 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c1x0123, vget_low_f32(vi0x0), 1);
413 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c1x0123, vget_low_f32(vi2x0), 1);
414 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk00c1x4567, vget_low_f32(vi0x0), 1);
415 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk00c1x4567, vget_low_f32(vi2x0), 1);
416
417 const float32x4_t vk10c1x0123 = vld1q_f32(w + 40);
418 const float32x4_t vk10c1x4567 = vld1q_f32(w + 44);
419
420 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c1x0123, vget_low_f32(vi1x0), 1);
421 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c1x0123, vget_low_f32(vi3x0), 1);
422 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk10c1x4567, vget_low_f32(vi1x0), 1);
423 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk10c1x4567, vget_low_f32(vi3x0), 1);
424
425 const float32x4_t vk20c1x0123 = vld1q_f32(w + 48);
426 const float32x4_t vk20c1x4567 = vld1q_f32(w + 52);
427
428 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c1x0123, vget_low_f32(vi2x0), 1);
429 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c1x0123, vget_low_f32(vi4x0), 1);
430 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk20c1x4567, vget_low_f32(vi2x0), 1);
431 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk20c1x4567, vget_low_f32(vi4x0), 1);
432
433 const float32x4_t vk00c2x0123 = vld1q_f32(w + 56);
434 const float32x4_t vk00c2x4567 = vld1q_f32(w + 60);
435
436 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c2x0123, vget_high_f32(vi0x0), 0);
437 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 0);
438 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk00c2x4567, vget_high_f32(vi0x0), 0);
439 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk00c2x4567, vget_high_f32(vi2x0), 0);
440
441 const float32x4_t vk10c2x0123 = vld1q_f32(w + 64);
442 const float32x4_t vk10c2x4567 = vld1q_f32(w + 68);
443
444 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c2x0123, vget_high_f32(vi1x0), 0);
445 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c2x0123, vget_high_f32(vi3x0), 0);
446 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk10c2x4567, vget_high_f32(vi1x0), 0);
447 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk10c2x4567, vget_high_f32(vi3x0), 0);
448
449 const float32x4_t vk20c2x0123 = vld1q_f32(w + 72);
450 const float32x4_t vk20c2x4567 = vld1q_f32(w + 76);
451
452 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 0);
453 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c2x0123, vget_high_f32(vi4x0), 0);
454 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk20c2x4567, vget_high_f32(vi2x0), 0);
455 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk20c2x4567, vget_high_f32(vi4x0), 0);
456
457 const float32x4_t vk01c0x0123 = vld1q_f32(w + 80);
458 const float32x4_t vk01c0x4567 = vld1q_f32(w + 84);
459
460 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c0x0123, vget_high_f32(vi0x0), 1);
461 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c0x0123, vget_high_f32(vi2x0), 1);
462 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk01c0x4567, vget_high_f32(vi0x0), 1);
463 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk01c0x4567, vget_high_f32(vi2x0), 1);
464
465 const float32x4_t vk11c0x0123 = vld1q_f32(w + 88);
466 const float32x4_t vk11c0x4567 = vld1q_f32(w + 92);
467
468 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c0x0123, vget_high_f32(vi1x0), 1);
469 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c0x0123, vget_high_f32(vi3x0), 1);
470 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk11c0x4567, vget_high_f32(vi1x0), 1);
471 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk11c0x4567, vget_high_f32(vi3x0), 1);
472
473 const float32x4_t vk21c0x0123 = vld1q_f32(w + 96);
474 const float32x4_t vk21c0x4567 = vld1q_f32(w + 100);
475
476 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c0x0123, vget_high_f32(vi2x0), 1);
477 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c0x0123, vget_high_f32(vi4x0), 1);
478 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk21c0x4567, vget_high_f32(vi2x0), 1);
479 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk21c0x4567, vget_high_f32(vi4x0), 1);
480
481 const float32x4_t vk01c1x0123 = vld1q_f32(w + 104);
482 const float32x4_t vk01c1x4567 = vld1q_f32(w + 108);
483
484 // viMx1 = ( iM1c2, iM1c1 )
485 const float32x2_t vi0x1 = vld1_f32(i0); i0 += 2;
486 const float32x2_t vi1x1 = vld1_f32(i1); i1 += 2;
487 const float32x2_t vi2x1 = vld1_f32(i2); i2 += 2;
488 const float32x2_t vi3x1 = vld1_f32(i3); i3 += 2;
489 const float32x2_t vi4x1 = vld1_f32(i4); i4 += 2;
490
491 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c1x0123, vi0x1, 0);
492 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c1x0123, vi2x1, 0);
493 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk01c1x4567, vi0x1, 0);
494 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk01c1x4567, vi2x1, 0);
495
496 const float32x4_t vk11c1x0123 = vld1q_f32(w + 112);
497 const float32x4_t vk11c1x4567 = vld1q_f32(w + 116);
498
499 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c1x0123, vi1x1, 0);
500 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c1x0123, vi3x1, 0);
501 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk11c1x4567, vi1x1, 0);
502 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk11c1x4567, vi3x1, 0);
503
504 const float32x4_t vk21c1x0123 = vld1q_f32(w + 120);
505 const float32x4_t vk21c1x4567 = vld1q_f32(w + 124);
506
507 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c1x0123, vi2x1, 0);
508 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c1x0123, vi4x1, 0);
509 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk21c1x4567, vi2x1, 0);
510 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk21c1x4567, vi4x1, 0);
511
512 const float32x4_t vk01c2x0123 = vld1q_f32(w + 128);
513 const float32x4_t vk01c2x4567 = vld1q_f32(w + 132);
514
515 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c2x0123, vi0x1, 1);
516 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c2x0123, vi2x1, 1);
517 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk01c2x4567, vi0x1, 1);
518 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk01c2x4567, vi2x1, 1);
519
520 const float32x4_t vk11c2x0123 = vld1q_f32(w + 136);
521 const float32x4_t vk11c2x4567 = vld1q_f32(w + 140);
522
523 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c2x0123, vi1x1, 1);
524 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c2x0123, vi3x1, 1);
525 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk11c2x4567, vi1x1, 1);
526 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk11c2x4567, vi3x1, 1);
527
528 const float32x4_t vk21c2x0123 = vld1q_f32(w + 144);
529 const float32x4_t vk21c2x4567 = vld1q_f32(w + 148);
530
531 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c2x0123, vi2x1, 1);
532 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c2x0123, vi4x1, 1);
533 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk21c2x4567, vi2x1, 1);
534 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk21c2x4567, vi4x1, 1);
535
536
537 vo0c0123 = vmaxq_f32(vo0c0123, vmin);
538 vo1c0123 = vmaxq_f32(vo1c0123, vmin);
539 vo0c4567 = vmaxq_f32(vo0c4567, vmin);
540 vo1c4567 = vmaxq_f32(vo1c4567, vmin);
541
542 vo0c0123 = vminq_f32(vo0c0123, vmax);
543 vo1c0123 = vminq_f32(vo1c0123, vmax);
544 vo0c4567 = vminq_f32(vo0c4567, vmax);
545 vo1c4567 = vminq_f32(vo1c4567, vmax);
546
547 if XNN_LIKELY(c >= 8) {
548 vst1q_f32(o1, vo1c0123);
549 vst1q_f32(o1 + 4, vo1c4567);
550 o1 = (float*) ((uintptr_t) o1 + output_width_stride);
551 vst1q_f32(o0, vo0c0123);
552 vst1q_f32(o0 + 4, vo0c4567);
553 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
554 } else {
555 float* o0_tmp = o0;
556 float* o1_tmp = o1;
557 if (c & 4) {
558 vst1q_f32(o1_tmp, vo1c0123); o1_tmp += 4;
559 vo1c0123 = vo1c4567;
560 vst1q_f32(o0_tmp, vo0c0123); o0_tmp += 4;
561 vo0c0123 = vo0c4567;
562 }
563 float32x2_t vo0c01 = vget_low_f32(vo0c0123);
564 float32x2_t vo1c01 = vget_low_f32(vo1c0123);
565 if (c & 2) {
566 vst1_f32(o1_tmp, vo1c01); o1_tmp += 2;
567 vo1c01 = vget_high_f32(vo1c0123);
568 vst1_f32(o0_tmp, vo0c01); o0_tmp += 2;
569 vo0c01 = vget_high_f32(vo0c0123);
570 }
571 if (c & 1) {
572 vst1_lane_f32(o1_tmp, vo1c01, 0);
573 vst1_lane_f32(o0_tmp, vo0c01, 0);
574 }
575 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
576 o1 = (float*) ((uintptr_t) o1 + output_width_stride);
577 }
578 }
579 // Move output pointers back to the position of the first pixel in a row,
580 // and forward to the next block of output channels
581 o0 = (float*) ((uintptr_t) o0 - output_channel_decrement);
582 o1 = (float*) ((uintptr_t) o1 - output_channel_decrement);
583 // Revert input pointers to the position of the first pixel in a row
584 i0 = (const float*) ((uintptr_t) i0 - input_width_decrement);
585 i1 = (const float*) ((uintptr_t) i1 - input_width_decrement);
586 i2 = (const float*) ((uintptr_t) i2 - input_width_decrement);
587 i3 = (const float*) ((uintptr_t) i3 - input_width_decrement);
588 i4 = (const float*) ((uintptr_t) i4 - input_width_decrement);
589 // Move to the block of weights for the next 8 output channels
590 w += 224;
591 c = doz(c, 8);
592 } while (c != 0);
593 // Move output pointers back to the position of the first channel, and forward to the next block of rows
594 o0 = (float*) ((uintptr_t) o0 + output_height_increment);
595 o1 = (float*) ((uintptr_t) o1 + output_height_increment);
596 // Move input pointers forward to the next four rows
597 i0 = i4;
598 i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
599 i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
600 i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
601 i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
602 }
603}