// Auto-generated file. Do not edit!
//   Template: src/f32-conv-hwc/3x3s2p0p1c3-neon-x2.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/conv.h>
#include <xnnpack/math.h>

19void xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2(
20 size_t input_height,
21 size_t input_width,
22 size_t output_y_start,
23 size_t output_y_end,
24 const float* input,
25 const float* zero,
26 const float* weights,
27 float* output,
28 size_t input_padding_top,
29 size_t output_channels,
30 size_t output_height_stride,
31 size_t output_width_stride,
32 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
33{
34 assert(input_width != 0);
35 assert(output_y_end > output_y_start);
36 assert(input_padding_top <= 1);
37 assert(output_channels != 0);
38
39 const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
40 const size_t input_width_decrement = (4 + ((input_width - 1) & 1) * 2 + (round_down_po2(input_width - 1, 2) * 3 /* channels */)) * sizeof(float);
41 const size_t output_width = input_width / 2;
42 const size_t output_channel_decrement = output_width * output_width_stride - 4 * sizeof(float);
43 const size_t output_height_increment = output_height_stride * 2 - round_up_po2(output_channels, 4) * sizeof(float);
44
45 // Adjustment for padding processed below
46 const float* i0 = (const float*) ((uintptr_t) input +
47 input_height_stride * (output_y_start * 2 /* vertical stride */ - input_padding_top));
48 const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
49 const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
50 const float* i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
51 const float* i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
52 float* o0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
53 float* o1 = (float*) ((uintptr_t) o0 + output_height_stride);
54
55 if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
56 i0 = zero;
57 }
58
Marat Dukhance7a3f82020-05-17 21:46:44 -070059
60 for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 2) {
61 const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
62 const size_t input_y4 = input_y2 + 2;
63 if XNN_UNPREDICTABLE(input_y2 > input_height) {
64 i1 = zero;
65 }
66 if XNN_UNPREDICTABLE(input_y2 >= input_height) {
67 i2 = zero;
68 }
69 if XNN_UNPREDICTABLE(input_y4 > input_height) {
70 i3 = zero;
71 }
72 if XNN_UNPREDICTABLE(input_y4 >= input_height) {
73 i4 = zero;
74 }
75 if XNN_UNPREDICTABLE(output_y + 2 > output_y_end) {
76 o1 = o0;
77 }
78
79 const float* w = weights;
80 size_t c = output_channels;
81 do {
82 // viMx0 = ( iM1c0, iM0c2, iM0c1, iM0c0 )
83 float32x4_t vi0x0 = vld1q_f32(i0); i0 += 4;
84 float32x4_t vi1x0 = vld1q_f32(i1); i1 += 4;
85 float32x4_t vi2x0 = vld1q_f32(i2); i2 += 4;
86 float32x4_t vi3x0 = vld1q_f32(i3); i3 += 4;
87 float32x4_t vi4x0 = vld1q_f32(i4); i4 += 4;
88
89 size_t iw = input_width - 1;
90 for (; iw >= 4; iw -= 4) {
91 float32x4_t vo0x0c0123 = vld1q_f32(w);
92 float32x4_t vo1x0c0123 = vo0x0c0123;
93 float32x4_t vo0x1c0123 = vo0x0c0123;
94 float32x4_t vo1x1c0123 = vo0x0c0123;
95
96 const float32x4_t vk00c0x0123 = vld1q_f32(w + 4);
97
98 // viMx1 = ( iM2c1, iM2c0, iM1c2, iM1c1 )
99 const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 4;
100 const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 4;
101 const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4;
102 const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 4;
103 const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 4;
104
105 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk00c0x0123, vget_low_f32(vi0x0), 0);
106 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk00c0x0123, vget_low_f32(vi2x0), 0);
107
108 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk00c0x0123, vget_high_f32(vi0x1), 0);
109 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk00c0x0123, vget_high_f32(vi2x1), 0);
110
111 const float32x4_t vk10c0x0123 = vld1q_f32(w + 8);
112
113 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk10c0x0123, vget_low_f32(vi1x0), 0);
114 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk10c0x0123, vget_low_f32(vi3x0), 0);
115
116 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk10c0x0123, vget_high_f32(vi1x1), 0);
117 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk10c0x0123, vget_high_f32(vi3x1), 0);
118
119 const float32x4_t vk20c0x0123 = vld1q_f32(w + 12);
120
121 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk20c0x0123, vget_low_f32(vi2x0), 0);
122 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk20c0x0123, vget_low_f32(vi4x0), 0);
123
124 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk20c0x0123, vget_high_f32(vi2x1), 0);
125 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk20c0x0123, vget_high_f32(vi4x1), 0);
126
127 const float32x4_t vk00c1x0123 = vld1q_f32(w + 16);
128
129 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk00c1x0123, vget_low_f32(vi0x0), 1);
130 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk00c1x0123, vget_low_f32(vi2x0), 1);
131
132 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk00c1x0123, vget_high_f32(vi0x1), 1);
133 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk00c1x0123, vget_high_f32(vi2x1), 1);
134
135 const float32x4_t vk10c1x0123 = vld1q_f32(w + 20);
136
137 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk10c1x0123, vget_low_f32(vi1x0), 1);
138 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk10c1x0123, vget_low_f32(vi3x0), 1);
139
140 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk10c1x0123, vget_high_f32(vi1x1), 1);
141 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk10c1x0123, vget_high_f32(vi3x1), 1);
142
143 const float32x4_t vk20c1x0123 = vld1q_f32(w + 24);
144
145 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk20c1x0123, vget_low_f32(vi2x0), 1);
146 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk20c1x0123, vget_low_f32(vi4x0), 1);
147
148 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk20c1x0123, vget_high_f32(vi2x1), 1);
149 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk20c1x0123, vget_high_f32(vi4x1), 1);
150
151 const float32x4_t vk00c2x0123 = vld1q_f32(w + 28);
152
153 // viMx2 = ( iM3c2, iM3c1, iM3c0, iM2c2 )
154 const float32x4_t vi0x2 = vld1q_f32(i0); i0 += 4;
155 const float32x4_t vi1x2 = vld1q_f32(i1); i1 += 4;
156 const float32x4_t vi2x2 = vld1q_f32(i2); i2 += 4;
157 const float32x4_t vi3x2 = vld1q_f32(i3); i3 += 4;
158 const float32x4_t vi4x2 = vld1q_f32(i4); i4 += 4;
159
160 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk00c2x0123, vget_high_f32(vi0x0), 0);
161 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk00c2x0123, vget_high_f32(vi2x0), 0);
162
163 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk00c2x0123, vget_low_f32(vi0x2), 0);
164 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk00c2x0123, vget_low_f32(vi2x2), 0);
165
166 const float32x4_t vk10c2x0123 = vld1q_f32(w + 32);
167
168 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk10c2x0123, vget_high_f32(vi1x0), 0);
169 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk10c2x0123, vget_high_f32(vi3x0), 0);
170
171 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk10c2x0123, vget_low_f32(vi1x2), 0);
172 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk10c2x0123, vget_low_f32(vi3x2), 0);
173
174 const float32x4_t vk20c2x0123 = vld1q_f32(w + 36);
175
176 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk20c2x0123, vget_high_f32(vi2x0), 0);
177 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk20c2x0123, vget_high_f32(vi4x0), 0);
178
179 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk20c2x0123, vget_low_f32(vi2x2), 0);
180 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk20c2x0123, vget_low_f32(vi4x2), 0);
181
182 const float32x4_t vk01c0x0123 = vld1q_f32(w + 40);
183
184 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk01c0x0123, vget_high_f32(vi0x0), 1);
185 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk01c0x0123, vget_high_f32(vi2x0), 1);
186
187 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk01c0x0123, vget_low_f32(vi0x2), 1);
188 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk01c0x0123, vget_low_f32(vi2x2), 1);
189
190 const float32x4_t vk11c0x0123 = vld1q_f32(w + 44);
191
192 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk11c0x0123, vget_high_f32(vi1x0), 1);
193 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk11c0x0123, vget_high_f32(vi3x0), 1);
194
195 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk11c0x0123, vget_low_f32(vi1x2), 1);
196 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk11c0x0123, vget_low_f32(vi3x2), 1);
197
198 const float32x4_t vk21c0x0123 = vld1q_f32(w + 48);
199
200 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk21c0x0123, vget_high_f32(vi2x0), 1);
201 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk21c0x0123, vget_high_f32(vi4x0), 1);
202
203 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk21c0x0123, vget_low_f32(vi2x2), 1);
204 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk21c0x0123, vget_low_f32(vi4x2), 1);
205
206 const float32x4_t vk01c1x0123 = vld1q_f32(w + 52);
207
208 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk01c1x0123, vget_low_f32(vi0x1), 0);
209 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk01c1x0123, vget_low_f32(vi2x1), 0);
210
211 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk01c1x0123, vget_high_f32(vi0x2), 0);
212 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk01c1x0123, vget_high_f32(vi2x2), 0);
213
214 const float32x4_t vk11c1x0123 = vld1q_f32(w + 56);
215
216 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk11c1x0123, vget_low_f32(vi1x1), 0);
217 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk11c1x0123, vget_low_f32(vi3x1), 0);
218
219 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk11c1x0123, vget_high_f32(vi1x2), 0);
220 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk11c1x0123, vget_high_f32(vi3x2), 0);
221
222 const float32x4_t vk21c1x0123 = vld1q_f32(w + 60);
223
224 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk21c1x0123, vget_low_f32(vi2x1), 0);
225 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk21c1x0123, vget_low_f32(vi4x1), 0);
226
227 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk21c1x0123, vget_high_f32(vi2x2), 0);
228 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk21c1x0123, vget_high_f32(vi4x2), 0);
229
230 const float32x4_t vk01c2x0123 = vld1q_f32(w + 64);
231
232 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk01c2x0123, vget_low_f32(vi0x1), 1);
233 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk01c2x0123, vget_low_f32(vi2x1), 1);
234
235 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk01c2x0123, vget_high_f32(vi0x2), 1);
236 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk01c2x0123, vget_high_f32(vi2x2), 1);
237
238 const float32x4_t vk11c2x0123 = vld1q_f32(w + 68);
239
240 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk11c2x0123, vget_low_f32(vi1x1), 1);
241 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk11c2x0123, vget_low_f32(vi3x1), 1);
242
243 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk11c2x0123, vget_high_f32(vi1x2), 1);
244 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk11c2x0123, vget_high_f32(vi3x2), 1);
245
246 const float32x4_t vk21c2x0123 = vld1q_f32(w + 72);
247
248 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk21c2x0123, vget_low_f32(vi2x1), 1);
249 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk21c2x0123, vget_low_f32(vi4x1), 1);
250
251 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk21c2x0123, vget_high_f32(vi2x2), 1);
252 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk21c2x0123, vget_high_f32(vi4x2), 1);
253
254 const float32x4_t vk02c0x0123 = vld1q_f32(w + 76);
255
256 // viMx3 = ( iM5c0, iM4c2, iM4c1, iM4c0 )
257 const float32x4_t vi0x3 = vld1q_f32(i0); i0 += 4;
258 const float32x4_t vi1x3 = vld1q_f32(i1); i1 += 4;
259 const float32x4_t vi2x3 = vld1q_f32(i2); i2 += 4;
260 const float32x4_t vi3x3 = vld1q_f32(i3); i3 += 4;
261 const float32x4_t vi4x3 = vld1q_f32(i4); i4 += 4;
262
263 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk02c0x0123, vget_high_f32(vi0x1), 0);
264 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk02c0x0123, vget_high_f32(vi2x1), 0);
265
266 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk02c0x0123, vget_low_f32(vi0x3), 0);
267 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk02c0x0123, vget_low_f32(vi2x3), 0);
268
269 const float32x4_t vk12c0x0123 = vld1q_f32(w + 80);
270
271 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk12c0x0123, vget_high_f32(vi1x1), 0);
272 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk12c0x0123, vget_high_f32(vi3x1), 0);
273
274 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk12c0x0123, vget_low_f32(vi1x3), 0);
275 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk12c0x0123, vget_low_f32(vi3x3), 0);
276
277 const float32x4_t vk22c0x0123 = vld1q_f32(w + 84);
278
279 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk22c0x0123, vget_high_f32(vi2x1), 0);
280 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk22c0x0123, vget_high_f32(vi4x1), 0);
281
282 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk22c0x0123, vget_low_f32(vi2x3), 0);
283 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk22c0x0123, vget_low_f32(vi4x3), 0);
284
285 const float32x4_t vk02c1x0123 = vld1q_f32(w + 88);
286
287 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk02c1x0123, vget_high_f32(vi0x1), 1);
288 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk02c1x0123, vget_high_f32(vi2x1), 1);
289
290 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk02c1x0123, vget_low_f32(vi0x3), 1);
291 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk02c1x0123, vget_low_f32(vi2x3), 1);
292
293 const float32x4_t vk12c1x0123 = vld1q_f32(w + 92);
294
295 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk12c1x0123, vget_high_f32(vi1x1), 1);
296 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk12c1x0123, vget_high_f32(vi3x1), 1);
297
298 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk12c1x0123, vget_low_f32(vi1x3), 1);
299 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk12c1x0123, vget_low_f32(vi3x3), 1);
300
301 const float32x4_t vk22c1x0123 = vld1q_f32(w + 96);
302
303 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk22c1x0123, vget_high_f32(vi2x1), 1);
304 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk22c1x0123, vget_high_f32(vi4x1), 1);
305
306 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk22c1x0123, vget_low_f32(vi2x3), 1);
307 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk22c1x0123, vget_low_f32(vi4x3), 1);
308
309 const float32x4_t vk02c2x0123 = vld1q_f32(w + 100);
310
311 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk02c2x0123, vget_low_f32(vi0x2), 0);
312 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk02c2x0123, vget_low_f32(vi2x2), 0);
313
314 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk02c2x0123, vget_high_f32(vi0x3), 0);
315 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk02c2x0123, vget_high_f32(vi2x3), 0);
316
317 const float32x4_t vk12c2x0123 = vld1q_f32(w + 104);
318
319 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk12c2x0123, vget_low_f32(vi1x2), 0);
320 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk12c2x0123, vget_low_f32(vi3x2), 0);
321
322 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk12c2x0123, vget_high_f32(vi1x3), 0);
323 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk12c2x0123, vget_high_f32(vi3x3), 0);
324
325 const float32x4_t vk22c2x0123 = vld1q_f32(w + 108);
326
327 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk22c2x0123, vget_low_f32(vi2x2), 0);
328 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk22c2x0123, vget_low_f32(vi4x2), 0);
329
330 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk22c2x0123, vget_high_f32(vi2x3), 0);
331 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk22c2x0123, vget_high_f32(vi4x3), 0);
332
333 vi0x0 = vi0x3;
334 vi1x0 = vi1x3;
335 vi2x0 = vi2x3;
336 vi3x0 = vi3x3;
337 vi4x0 = vi4x3;
338
Marat Dukhan56b10cd2020-05-18 09:35:49 -0700339 const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
340 const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
341
Marat Dukhance7a3f82020-05-17 21:46:44 -0700342 vo0x0c0123 = vmaxq_f32(vo0x0c0123, vmin);
343 vo1x0c0123 = vmaxq_f32(vo1x0c0123, vmin);
344
345 vo0x1c0123 = vmaxq_f32(vo0x1c0123, vmin);
346 vo1x1c0123 = vmaxq_f32(vo1x1c0123, vmin);
347
348 vo0x0c0123 = vminq_f32(vo0x0c0123, vmax);
349 vo1x0c0123 = vminq_f32(vo1x0c0123, vmax);
350
351 vo0x1c0123 = vminq_f32(vo0x1c0123, vmax);
352 vo1x1c0123 = vminq_f32(vo1x1c0123, vmax);
353
354 if XNN_LIKELY(c >= 4) {
355 vst1q_f32(o1, vo1x0c0123);
356 o1 = (float*) ((uintptr_t) o1 + output_width_stride);
357 vst1q_f32(o0, vo0x0c0123);
358 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
359
360 vst1q_f32(o1, vo1x1c0123);
361 o1 = (float*) ((uintptr_t) o1 + output_width_stride);
362 vst1q_f32(o0, vo0x1c0123);
363 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
364 } else {
365 float* o0_tmp = o0;
366 float* o1_tmp = o1;
367 float32x2_t vo0x0c01 = vget_low_f32(vo0x0c0123);
368 float32x2_t vo1x0c01 = vget_low_f32(vo1x0c0123);
369 float32x2_t vo0x1c01 = vget_low_f32(vo0x1c0123);
370 float32x2_t vo1x1c01 = vget_low_f32(vo1x1c0123);
371 if (c & 2) {
372 vst1_f32((float*) ((uintptr_t) o1_tmp + output_width_stride), vo1x1c01);
373 vo1x1c01 = vget_high_f32(vo1x1c0123);
374 vst1_f32((float*) ((uintptr_t) o0_tmp + output_width_stride), vo0x1c01);
375 vo0x1c01 = vget_high_f32(vo0x1c0123);
376
377 vst1_f32(o1_tmp, vo1x0c01); o1_tmp += 2;
378 vo1x0c01 = vget_high_f32(vo1x0c0123);
379 vst1_f32(o0_tmp, vo0x0c01); o0_tmp += 2;
380 vo0x0c01 = vget_high_f32(vo0x0c0123);
381 }
382 if (c & 1) {
383 vst1_lane_f32(o1_tmp, vo1x0c01, 0);
384 vst1_lane_f32(o0_tmp, vo0x0c01, 0);
385
386 vst1_lane_f32((float*) ((uintptr_t) o1_tmp + output_width_stride), vo1x1c01, 0);
387 vst1_lane_f32((float*) ((uintptr_t) o0_tmp + output_width_stride), vo0x1c01, 0);
388 }
389
390 o0 = (float*) ((uintptr_t) o0 + output_width_stride * 2);
391 o1 = (float*) ((uintptr_t) o1 + output_width_stride * 2);
392 }
393 }
394 assert(iw < 4);
Marat Dukhan56b10cd2020-05-18 09:35:49 -0700395 if XNN_LIKELY(iw & 2) {
Marat Dukhance7a3f82020-05-17 21:46:44 -0700396 float32x4_t vo0c0123 = vld1q_f32(w);
397 float32x4_t vo1c0123 = vo0c0123;
398
399 const float32x4_t vk00c0x0123 = vld1q_f32(w + 4);
400
401 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c0x0123, vget_low_f32(vi0x0), 0);
402 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 0);
403
404 const float32x4_t vk10c0x0123 = vld1q_f32(w + 8);
405
406 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c0x0123, vget_low_f32(vi1x0), 0);
407 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c0x0123, vget_low_f32(vi3x0), 0);
408
409 const float32x4_t vk20c0x0123 = vld1q_f32(w + 12);
410
411 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 0);
412 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c0x0123, vget_low_f32(vi4x0), 0);
413
414 const float32x4_t vk00c1x0123 = vld1q_f32(w + 16);
415
416 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c1x0123, vget_low_f32(vi0x0), 1);
417 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c1x0123, vget_low_f32(vi2x0), 1);
418
419 const float32x4_t vk10c1x0123 = vld1q_f32(w + 20);
420
421 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c1x0123, vget_low_f32(vi1x0), 1);
422 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c1x0123, vget_low_f32(vi3x0), 1);
423
424 const float32x4_t vk20c1x0123 = vld1q_f32(w + 24);
425
426 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c1x0123, vget_low_f32(vi2x0), 1);
427 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c1x0123, vget_low_f32(vi4x0), 1);
428
429 const float32x4_t vk00c2x0123 = vld1q_f32(w + 28);
430
431 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c2x0123, vget_high_f32(vi0x0), 0);
432 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 0);
433
434 const float32x4_t vk10c2x0123 = vld1q_f32(w + 32);
435
436 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c2x0123, vget_high_f32(vi1x0), 0);
437 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c2x0123, vget_high_f32(vi3x0), 0);
438
439 const float32x4_t vk20c2x0123 = vld1q_f32(w + 36);
440
441 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 0);
442 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c2x0123, vget_high_f32(vi4x0), 0);
443
444 const float32x4_t vk01c0x0123 = vld1q_f32(w + 40);
445
446 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c0x0123, vget_high_f32(vi0x0), 1);
447 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c0x0123, vget_high_f32(vi2x0), 1);
448
449 const float32x4_t vk11c0x0123 = vld1q_f32(w + 44);
450
451 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c0x0123, vget_high_f32(vi1x0), 1);
452 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c0x0123, vget_high_f32(vi3x0), 1);
453
454 const float32x4_t vk21c0x0123 = vld1q_f32(w + 48);
455
456 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c0x0123, vget_high_f32(vi2x0), 1);
457 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c0x0123, vget_high_f32(vi4x0), 1);
458
459 const float32x4_t vk01c1x0123 = vld1q_f32(w + 52);
460
461 // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
462 const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 4;
463 const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 4;
464 const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4;
465 const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 4;
466 const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 4;
467
468 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c1x0123, vget_low_f32(vi0x1), 0);
469 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c1x0123, vget_low_f32(vi2x1), 0);
470
471 const float32x4_t vk11c1x0123 = vld1q_f32(w + 56);
472
473 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c1x0123, vget_low_f32(vi1x1), 0);
474 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c1x0123, vget_low_f32(vi3x1), 0);
475
476 const float32x4_t vk21c1x0123 = vld1q_f32(w + 60);
477
478 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c1x0123, vget_low_f32(vi2x1), 0);
479 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c1x0123, vget_low_f32(vi4x1), 0);
480
481 const float32x4_t vk01c2x0123 = vld1q_f32(w + 64);
482
483 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c2x0123, vget_low_f32(vi0x1), 1);
484 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c2x0123, vget_low_f32(vi2x1), 1);
485
486 const float32x4_t vk11c2x0123 = vld1q_f32(w + 68);
487
488 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c2x0123, vget_low_f32(vi1x1), 1);
489 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c2x0123, vget_low_f32(vi3x1), 1);
490
491 const float32x4_t vk21c2x0123 = vld1q_f32(w + 72);
492
493 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c2x0123, vget_low_f32(vi2x1), 1);
494 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c2x0123, vget_low_f32(vi4x1), 1);
495
496 const float32x4_t vk02c0x0123 = vld1q_f32(w + 76);
497
498 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk02c0x0123, vget_high_f32(vi0x1), 0);
499 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk02c0x0123, vget_high_f32(vi2x1), 0);
500
501 const float32x4_t vk12c0x0123 = vld1q_f32(w + 80);
502
503 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk12c0x0123, vget_high_f32(vi1x1), 0);
504 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk12c0x0123, vget_high_f32(vi3x1), 0);
505
506 const float32x4_t vk22c0x0123 = vld1q_f32(w + 84);
507
508 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk22c0x0123, vget_high_f32(vi2x1), 0);
509 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk22c0x0123, vget_high_f32(vi4x1), 0);
510
511 const float32x4_t vk02c1x0123 = vld1q_f32(w + 88);
512
513 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk02c1x0123, vget_high_f32(vi0x1), 1);
514 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk02c1x0123, vget_high_f32(vi2x1), 1);
515
516 const float32x4_t vk12c1x0123 = vld1q_f32(w + 92);
517
518 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk12c1x0123, vget_high_f32(vi1x1), 1);
519 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk12c1x0123, vget_high_f32(vi3x1), 1);
520
521 const float32x4_t vk22c1x0123 = vld1q_f32(w + 96);
522
523 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk22c1x0123, vget_high_f32(vi2x1), 1);
524 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk22c1x0123, vget_high_f32(vi4x1), 1);
525
526 const float32x4_t vk02c2x0123 = vld1q_f32(w + 100);
527
528 // viMx2 = ( iM2c2, iM2c1 )
529 const float32x2_t vi0x2 = vld1_f32(i0); i0 += 2;
530 const float32x2_t vi1x2 = vld1_f32(i1); i1 += 2;
531 const float32x2_t vi2x2 = vld1_f32(i2); i2 += 2;
532 const float32x2_t vi3x2 = vld1_f32(i3); i3 += 2;
533 const float32x2_t vi4x2 = vld1_f32(i4); i4 += 2;
534
535 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk02c2x0123, vi0x2, 0);
536 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk02c2x0123, vi2x2, 0);
537
538 const float32x4_t vk12c2x0123 = vld1q_f32(w + 104);
539
540 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk12c2x0123, vi1x2, 0);
541 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk12c2x0123, vi3x2, 0);
542
543 const float32x4_t vk22c2x0123 = vld1q_f32(w + 108);
544
545 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk22c2x0123, vi2x2, 0);
546 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk22c2x0123, vi4x2, 0);
547
548 vi0x0 = vcombine_f32(vget_high_f32(vi0x1), vi0x2);
549 vi1x0 = vcombine_f32(vget_high_f32(vi1x1), vi1x2);
550 vi2x0 = vcombine_f32(vget_high_f32(vi2x1), vi2x2);
551 vi3x0 = vcombine_f32(vget_high_f32(vi3x1), vi3x2);
552 vi4x0 = vcombine_f32(vget_high_f32(vi4x1), vi4x2);
553
Marat Dukhan56b10cd2020-05-18 09:35:49 -0700554 const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
555 const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
556
Marat Dukhance7a3f82020-05-17 21:46:44 -0700557 vo0c0123 = vmaxq_f32(vo0c0123, vmin);
558 vo1c0123 = vmaxq_f32(vo1c0123, vmin);
559
560 vo0c0123 = vminq_f32(vo0c0123, vmax);
561 vo1c0123 = vminq_f32(vo1c0123, vmax);
562
563 if XNN_LIKELY(c >= 4) {
564 vst1q_f32(o1, vo1c0123);
565 o1 = (float*) ((uintptr_t) o1 + output_width_stride);
566 vst1q_f32(o0, vo0c0123);
567 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
568 } else {
569 float* o0_tmp = o0;
570 float* o1_tmp = o1;
571 float32x2_t vo0c01 = vget_low_f32(vo0c0123);
572 float32x2_t vo1c01 = vget_low_f32(vo1c0123);
573 if (c & 2) {
574 vst1_f32(o1_tmp, vo1c01); o1_tmp += 2;
575 vo1c01 = vget_high_f32(vo1c0123);
576 vst1_f32(o0_tmp, vo0c01); o0_tmp += 2;
577 vo0c01 = vget_high_f32(vo0c0123);
578 }
579 if (c & 1) {
580 vst1_lane_f32(o1_tmp, vo1c01, 0);
581 vst1_lane_f32(o0_tmp, vo0c01, 0);
582 }
583
584 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
585 o1 = (float*) ((uintptr_t) o1 + output_width_stride);
586 }
587 }
Marat Dukhan56b10cd2020-05-18 09:35:49 -0700588 if XNN_LIKELY(iw & 1) {
Marat Dukhance7a3f82020-05-17 21:46:44 -0700589 float32x4_t vo0c0123 = vld1q_f32(w);
590 float32x4_t vo1c0123 = vo0c0123;
591
592 const float32x4_t vk00c0x0123 = vld1q_f32(w + 4);
593
594 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c0x0123, vget_low_f32(vi0x0), 0);
595 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 0);
596
597 const float32x4_t vk10c0x0123 = vld1q_f32(w + 8);
598
599 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c0x0123, vget_low_f32(vi1x0), 0);
600 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c0x0123, vget_low_f32(vi3x0), 0);
601
602 const float32x4_t vk20c0x0123 = vld1q_f32(w + 12);
603
604 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 0);
605 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c0x0123, vget_low_f32(vi4x0), 0);
606
607 const float32x4_t vk00c1x0123 = vld1q_f32(w + 16);
608
609 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c1x0123, vget_low_f32(vi0x0), 1);
610 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c1x0123, vget_low_f32(vi2x0), 1);
611
612 const float32x4_t vk10c1x0123 = vld1q_f32(w + 20);
613
614 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c1x0123, vget_low_f32(vi1x0), 1);
615 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c1x0123, vget_low_f32(vi3x0), 1);
616
617 const float32x4_t vk20c1x0123 = vld1q_f32(w + 24);
618
619 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c1x0123, vget_low_f32(vi2x0), 1);
620 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c1x0123, vget_low_f32(vi4x0), 1);
621
622 const float32x4_t vk00c2x0123 = vld1q_f32(w + 28);
623
624 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c2x0123, vget_high_f32(vi0x0), 0);
625 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 0);
626
627 const float32x4_t vk10c2x0123 = vld1q_f32(w + 32);
628
629 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c2x0123, vget_high_f32(vi1x0), 0);
630 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c2x0123, vget_high_f32(vi3x0), 0);
631
632 const float32x4_t vk20c2x0123 = vld1q_f32(w + 36);
633
634 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 0);
635 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c2x0123, vget_high_f32(vi4x0), 0);
636
637 const float32x4_t vk01c0x0123 = vld1q_f32(w + 40);
638
639 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c0x0123, vget_high_f32(vi0x0), 1);
640 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c0x0123, vget_high_f32(vi2x0), 1);
641
642 const float32x4_t vk11c0x0123 = vld1q_f32(w + 44);
643
644 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c0x0123, vget_high_f32(vi1x0), 1);
645 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c0x0123, vget_high_f32(vi3x0), 1);
646
647 const float32x4_t vk21c0x0123 = vld1q_f32(w + 48);
648
649 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c0x0123, vget_high_f32(vi2x0), 1);
650 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c0x0123, vget_high_f32(vi4x0), 1);
651
652 const float32x4_t vk01c1x0123 = vld1q_f32(w + 52);
653
654 // viMx1 = ( iM1c2, iM1c1 )
655 const float32x2_t vi0x1 = vld1_f32(i0); i0 += 2;
656 const float32x2_t vi1x1 = vld1_f32(i1); i1 += 2;
657 const float32x2_t vi2x1 = vld1_f32(i2); i2 += 2;
658 const float32x2_t vi3x1 = vld1_f32(i3); i3 += 2;
659 const float32x2_t vi4x1 = vld1_f32(i4); i4 += 2;
660
661 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c1x0123, vi0x1, 0);
662 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c1x0123, vi2x1, 0);
663
664 const float32x4_t vk11c1x0123 = vld1q_f32(w + 56);
665
666 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c1x0123, vi1x1, 0);
667 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c1x0123, vi3x1, 0);
668
669 const float32x4_t vk21c1x0123 = vld1q_f32(w + 60);
670
671 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c1x0123, vi2x1, 0);
672 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c1x0123, vi4x1, 0);
673
674 const float32x4_t vk01c2x0123 = vld1q_f32(w + 64);
675
676 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c2x0123, vi0x1, 1);
677 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c2x0123, vi2x1, 1);
678
679 const float32x4_t vk11c2x0123 = vld1q_f32(w + 68);
680
681 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c2x0123, vi1x1, 1);
682 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c2x0123, vi3x1, 1);
683
684 const float32x4_t vk21c2x0123 = vld1q_f32(w + 72);
685
686 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c2x0123, vi2x1, 1);
687 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c2x0123, vi4x1, 1);
688
Marat Dukhan56b10cd2020-05-18 09:35:49 -0700689 const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
690 const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
691
Marat Dukhance7a3f82020-05-17 21:46:44 -0700692 vo0c0123 = vmaxq_f32(vo0c0123, vmin);
693 vo1c0123 = vmaxq_f32(vo1c0123, vmin);
694
695 vo0c0123 = vminq_f32(vo0c0123, vmax);
696 vo1c0123 = vminq_f32(vo1c0123, vmax);
697
698 if XNN_LIKELY(c >= 4) {
699 vst1q_f32(o1, vo1c0123);
700 o1 = (float*) ((uintptr_t) o1 + output_width_stride);
701 vst1q_f32(o0, vo0c0123);
702 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
703 } else {
704 float* o0_tmp = o0;
705 float* o1_tmp = o1;
706 float32x2_t vo0c01 = vget_low_f32(vo0c0123);
707 float32x2_t vo1c01 = vget_low_f32(vo1c0123);
708 if (c & 2) {
709 vst1_f32(o1_tmp, vo1c01); o1_tmp += 2;
710 vo1c01 = vget_high_f32(vo1c0123);
711 vst1_f32(o0_tmp, vo0c01); o0_tmp += 2;
712 vo0c01 = vget_high_f32(vo0c0123);
713 }
714 if (c & 1) {
715 vst1_lane_f32(o1_tmp, vo1c01, 0);
716 vst1_lane_f32(o0_tmp, vo0c01, 0);
717 }
718 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
719 o1 = (float*) ((uintptr_t) o1 + output_width_stride);
720 }
721 }
722 // Move output pointers back to the position of the first pixel in a row,
723 // and forward to the next block of output channels
724 o0 = (float*) ((uintptr_t) o0 - output_channel_decrement);
725 o1 = (float*) ((uintptr_t) o1 - output_channel_decrement);
726 // Revert input pointers to the position of the first pixel in a row
727 i0 = (const float*) ((uintptr_t) i0 - input_width_decrement);
728 i1 = (const float*) ((uintptr_t) i1 - input_width_decrement);
729 i2 = (const float*) ((uintptr_t) i2 - input_width_decrement);
730 i3 = (const float*) ((uintptr_t) i3 - input_width_decrement);
731 i4 = (const float*) ((uintptr_t) i4 - input_width_decrement);
732 // Move to the block of weights for the next 4 output channels
733 w += 112;
734 c = doz(c, 4);
735 } while (c != 0);
736 // Move output pointers back to the position of the first channel, and forward to the next block of rows
737 o0 = (float*) ((uintptr_t) o0 + output_height_increment);
738 o1 = (float*) ((uintptr_t) o1 + output_height_increment);
739 // Move input pointers forward to the next four rows
740 i0 = i4;
741 i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
742 i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
743 i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
744 i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
745 }
746}