blob: a322bf943c417296e7f1a585981218cfaa576dfc [file] [log] [blame]
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
6$assert CHANNEL_TILE % 4 == 0
7$assert HEIGHT_TILE == 2
8$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
9$VMULADDQ_LANE_F32 = "vfmaq_lane_f32" if FMA else "vmlaq_lane_f32"
10
11#include <assert.h>
12
13#include <arm_neon.h>
14
15#include <xnnpack/conv.h>
16#include <xnnpack/math.h>
17
18
19void xnn_f32_conv_hwc_ukernel_3x3s2p1c3x${CHANNEL_TILE}__${"neonfma" if FMA else "neon"}_${HEIGHT_TILE}x2(
20 size_t input_height,
21 size_t input_width,
22 size_t output_y_start,
23 size_t output_y_end,
24 const float* input,
25 const float* zero,
26 const float* weights,
27 float* output,
28 size_t input_padding_top,
29 size_t output_channels,
30 size_t output_height_stride,
31 size_t output_width_stride,
32 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
33{
34 assert(input_width != 0);
35 assert(output_y_end > output_y_start);
36 assert(input_padding_top <= 1);
37 assert(output_channels != 0);
38
39 const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
40 const size_t input_width_decrement = input_width * 3 /* channels */ * sizeof(float);
41 const size_t output_width = (input_width + 1) / 2;
42 const size_t output_channel_decrement = output_width * output_width_stride - ${CHANNEL_TILE} * sizeof(float);
43 const size_t output_height_increment = output_height_stride * 2 - round_up_po2(output_channels, ${CHANNEL_TILE}) * sizeof(float);
44
45 // Adjustment for padding processed below
46 const float* i0 = (const float*) ((uintptr_t) input +
47 input_height_stride * (output_y_start * 2 /* vertical stride */ - input_padding_top));
48 $for Y in range(HEIGHT_TILE + 3 - 1):
49 const float* i${Y+1} = (const float*) ((uintptr_t) i${Y} + input_height_stride);
50 float* o0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
51 $for Y in range(HEIGHT_TILE - 1):
52 float* o${Y+1} = (float*) ((uintptr_t) o${Y} + output_height_stride);
53
54 if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
55 i0 = zero;
56 }
57
Marat Dukhan56b10cd2020-05-18 09:35:49 -070058 $if FMA:
59 const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
60 const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
Marat Dukhance7a3f82020-05-17 21:46:44 -070061
62 for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 2) {
63 const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
64 const size_t input_y4 = input_y2 + 2;
65 if XNN_UNPREDICTABLE(input_y2 > input_height) {
66 i1 = zero;
67 }
68 if XNN_UNPREDICTABLE(input_y2 >= input_height) {
69 i2 = zero;
70 }
71 if XNN_UNPREDICTABLE(input_y4 > input_height) {
72 i3 = zero;
73 }
74 if XNN_UNPREDICTABLE(input_y4 >= input_height) {
75 i4 = zero;
76 }
77 if XNN_UNPREDICTABLE(output_y + 2 > output_y_end) {
78 o1 = o0;
79 }
80
81 const float* w = weights;
82 size_t c = output_channels;
83 do {
84 // viMx0 = ( iM0c2, iM0c1, iM0c0, --- )
85 $for Y in range(HEIGHT_TILE + 3):
86 float32x4_t vi${Y}x0 = vmovq_n_f32(0.0f);
87
88 size_t iw = input_width;
89 for (; iw >= 4; iw -= 4) {
90 float32x4_t vo0x0c${ABC[0:4]} = vld1q_f32(w);
91 $for C in range(4, CHANNEL_TILE, 4):
92 float32x4_t vo0x0c${ABC[C:C+4]} = vld1q_f32(w + ${C});
93 $for Y in range(1, HEIGHT_TILE):
94 $for C in range(0, CHANNEL_TILE, 4):
95 float32x4_t vo${Y}x0c${ABC[C:C+4]} = vo0x0c${ABC[C:C+4]};
96 $for Y in range(HEIGHT_TILE):
97 $for C in range(0, CHANNEL_TILE, 4):
98 float32x4_t vo${Y}x1c${ABC[C:C+4]} = vo0x0c${ABC[C:C+4]};
99
100 $for C in range(0, CHANNEL_TILE, 4):
101 const float32x4_t vk00c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE});
102
103 // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
104 $for Y in range(HEIGHT_TILE + 3):
105 const float32x4_t vi${Y}x1 = vld1q_f32(i${Y}); i${Y} += 4;
106
107 $for C in range(0, CHANNEL_TILE, 4):
108 $for Y in range(HEIGHT_TILE):
109 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_low_f32(vi${Y*2}x0), 1);
110
111 $for C in range(0, CHANNEL_TILE, 4):
112 $for Y in range(HEIGHT_TILE):
113 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_high_f32(vi${Y*2}x1), 1);
114
115 $for C in range(0, CHANNEL_TILE, 4):
116 const float32x4_t vk10c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 2});
117
118 $for C in range(0, CHANNEL_TILE, 4):
119 $for Y in range(HEIGHT_TILE):
120 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk10c0x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+1}x0), 1);
121
122 $for C in range(0, CHANNEL_TILE, 4):
123 $for Y in range(HEIGHT_TILE):
124 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk10c0x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+1}x1), 1);
125
126 $for C in range(0, CHANNEL_TILE, 4):
127 const float32x4_t vk20c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 3});
128
129 $for C in range(0, CHANNEL_TILE, 4):
130 $for Y in range(HEIGHT_TILE):
131 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk20c0x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+2}x0), 1);
132
133 $for C in range(0, CHANNEL_TILE, 4):
134 $for Y in range(HEIGHT_TILE):
135 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk20c0x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+2}x1), 1);
136
137 $for C in range(0, CHANNEL_TILE, 4):
138 const float32x4_t vk00c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 4});
139
140 // viMx2 = ( iM3c1, iM3c0, iM2c2, iM2c1 )
141 $for Y in range(HEIGHT_TILE + 3):
142 const float32x4_t vi${Y}x2 = vld1q_f32(i${Y}); i${Y} += 4;
143
144 $for C in range(0, CHANNEL_TILE, 4):
145 $for Y in range(HEIGHT_TILE):
146 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk00c1x${ABC[C:C+4]}, vget_high_f32(vi${Y*2}x0), 0);
147
148 $for C in range(0, CHANNEL_TILE, 4):
149 $for Y in range(HEIGHT_TILE):
150 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk00c1x${ABC[C:C+4]}, vget_low_f32(vi${Y*2}x2), 0);
151
152 $for C in range(0, CHANNEL_TILE, 4):
153 const float32x4_t vk10c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 5});
154
155 $for C in range(0, CHANNEL_TILE, 4):
156 $for Y in range(HEIGHT_TILE):
157 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk10c1x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+1}x0), 0);
158
159 $for C in range(0, CHANNEL_TILE, 4):
160 $for Y in range(HEIGHT_TILE):
161 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk10c1x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+1}x2), 0);
162
163 $for C in range(0, CHANNEL_TILE, 4):
164 const float32x4_t vk20c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 6});
165
166 $for C in range(0, CHANNEL_TILE, 4):
167 $for Y in range(HEIGHT_TILE):
168 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk20c1x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+2}x0), 0);
169
170 $for C in range(0, CHANNEL_TILE, 4):
171 $for Y in range(HEIGHT_TILE):
172 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk20c1x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+2}x2), 0);
173
174 $for C in range(0, CHANNEL_TILE, 4):
175 const float32x4_t vk00c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 7});
176
177 $for C in range(0, CHANNEL_TILE, 4):
178 $for Y in range(HEIGHT_TILE):
179 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk00c2x${ABC[C:C+4]}, vget_high_f32(vi${Y*2}x0), 1);
180
181 $for C in range(0, CHANNEL_TILE, 4):
182 $for Y in range(HEIGHT_TILE):
183 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk00c2x${ABC[C:C+4]}, vget_low_f32(vi${Y*2}x2), 1);
184
185 $for C in range(0, CHANNEL_TILE, 4):
186 const float32x4_t vk10c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 8});
187
188 $for C in range(0, CHANNEL_TILE, 4):
189 $for Y in range(HEIGHT_TILE):
190 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk10c2x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+1}x0), 1);
191
192 $for C in range(0, CHANNEL_TILE, 4):
193 $for Y in range(HEIGHT_TILE):
194 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk10c2x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+1}x2), 1);
195
196 $for C in range(0, CHANNEL_TILE, 4):
197 const float32x4_t vk20c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 9});
198
199 $for C in range(0, CHANNEL_TILE, 4):
200 $for Y in range(HEIGHT_TILE):
201 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk20c2x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+2}x0), 1);
202
203 $for C in range(0, CHANNEL_TILE, 4):
204 $for Y in range(HEIGHT_TILE):
205 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk20c2x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+2}x2), 1);
206
207 $for C in range(0, CHANNEL_TILE, 4):
208 const float32x4_t vk01c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 10});
209
210 $for C in range(0, CHANNEL_TILE, 4):
211 $for Y in range(HEIGHT_TILE):
212 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk01c0x${ABC[C:C+4]}, vget_low_f32(vi${Y*2}x1), 0);
213
214 $for C in range(0, CHANNEL_TILE, 4):
215 $for Y in range(HEIGHT_TILE):
216 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk01c0x${ABC[C:C+4]}, vget_high_f32(vi${Y*2}x2), 0);
217
218 $for C in range(0, CHANNEL_TILE, 4):
219 const float32x4_t vk11c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 11});
220
221 $for C in range(0, CHANNEL_TILE, 4):
222 $for Y in range(HEIGHT_TILE):
223 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk11c0x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+1}x1), 0);
224
225 $for C in range(0, CHANNEL_TILE, 4):
226 $for Y in range(HEIGHT_TILE):
227 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk11c0x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+1}x2), 0);
228
229 $for C in range(0, CHANNEL_TILE, 4):
230 const float32x4_t vk21c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 12});
231
232 $for C in range(0, CHANNEL_TILE, 4):
233 $for Y in range(HEIGHT_TILE):
234 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk21c0x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+2}x1), 0);
235
236 $for C in range(0, CHANNEL_TILE, 4):
237 $for Y in range(HEIGHT_TILE):
238 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk21c0x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+2}x2), 0);
239
240 $for C in range(0, CHANNEL_TILE, 4):
241 const float32x4_t vk01c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 13});
242
243 $for C in range(0, CHANNEL_TILE, 4):
244 $for Y in range(HEIGHT_TILE):
245 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk01c1x${ABC[C:C+4]}, vget_low_f32(vi${Y*2}x1), 1);
246
247 $for C in range(0, CHANNEL_TILE, 4):
248 $for Y in range(HEIGHT_TILE):
249 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk01c1x${ABC[C:C+4]}, vget_high_f32(vi${Y*2}x2), 1);
250
251 $for C in range(0, CHANNEL_TILE, 4):
252 const float32x4_t vk11c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 14});
253
254 $for C in range(0, CHANNEL_TILE, 4):
255 $for Y in range(HEIGHT_TILE):
256 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk11c1x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+1}x1), 1);
257
258 $for C in range(0, CHANNEL_TILE, 4):
259 $for Y in range(HEIGHT_TILE):
260 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk11c1x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+1}x2), 1);
261
262 $for C in range(0, CHANNEL_TILE, 4):
263 const float32x4_t vk21c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 15});
264
265 $for C in range(0, CHANNEL_TILE, 4):
266 $for Y in range(HEIGHT_TILE):
267 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk21c1x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+2}x1), 1);
268
269 $for C in range(0, CHANNEL_TILE, 4):
270 $for Y in range(HEIGHT_TILE):
271 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk21c1x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+2}x2), 1);
272
273 $for C in range(0, CHANNEL_TILE, 4):
274 const float32x4_t vk01c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 16});
275
276 // viMx3 = ( iM4c2, iM4c1, iM4c0, iM3c2 )
277 $for Y in range(HEIGHT_TILE + 3):
278 const float32x4_t vi${Y}x3 = vld1q_f32(i${Y}); i${Y} += 4;
279
280 $for C in range(0, CHANNEL_TILE, 4):
281 $for Y in range(HEIGHT_TILE):
282 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk01c2x${ABC[C:C+4]}, vget_high_f32(vi${Y*2}x1), 0);
283
284 $for C in range(0, CHANNEL_TILE, 4):
285 $for Y in range(HEIGHT_TILE):
286 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk01c2x${ABC[C:C+4]}, vget_low_f32(vi${Y*2}x3), 0);
287
288 $for C in range(0, CHANNEL_TILE, 4):
289 const float32x4_t vk11c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 17});
290
291 $for C in range(0, CHANNEL_TILE, 4):
292 $for Y in range(HEIGHT_TILE):
293 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk11c2x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+1}x1), 0);
294
295 $for C in range(0, CHANNEL_TILE, 4):
296 $for Y in range(HEIGHT_TILE):
297 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk11c2x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+1}x3), 0);
298
299 $for C in range(0, CHANNEL_TILE, 4):
300 const float32x4_t vk21c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 18});
301
302 $for C in range(0, CHANNEL_TILE, 4):
303 $for Y in range(HEIGHT_TILE):
304 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk21c2x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+2}x1), 0);
305
306 $for C in range(0, CHANNEL_TILE, 4):
307 $for Y in range(HEIGHT_TILE):
308 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk21c2x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+2}x3), 0);
309
310 $for C in range(0, CHANNEL_TILE, 4):
311 const float32x4_t vk02c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 19});
312
313 $for C in range(0, CHANNEL_TILE, 4):
314 $for Y in range(HEIGHT_TILE):
315 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk02c0x${ABC[C:C+4]}, vget_high_f32(vi${Y*2}x1), 1);
316
317 $for C in range(0, CHANNEL_TILE, 4):
318 $for Y in range(HEIGHT_TILE):
319 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk02c0x${ABC[C:C+4]}, vget_low_f32(vi${Y*2}x3), 1);
320
321 $for C in range(0, CHANNEL_TILE, 4):
322 const float32x4_t vk12c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 20});
323
324 $for C in range(0, CHANNEL_TILE, 4):
325 $for Y in range(HEIGHT_TILE):
326 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk12c0x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+1}x1), 1);
327
328 $for C in range(0, CHANNEL_TILE, 4):
329 $for Y in range(HEIGHT_TILE):
330 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk12c0x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+1}x3), 1);
331
332 $for C in range(0, CHANNEL_TILE, 4):
333 const float32x4_t vk22c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 21});
334
335 $for C in range(0, CHANNEL_TILE, 4):
336 $for Y in range(HEIGHT_TILE):
337 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk22c0x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+2}x1), 1);
338
339 $for C in range(0, CHANNEL_TILE, 4):
340 $for Y in range(HEIGHT_TILE):
341 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk22c0x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+2}x3), 1);
342
343 $for C in range(0, CHANNEL_TILE, 4):
344 const float32x4_t vk02c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 22});
345
346 $for C in range(0, CHANNEL_TILE, 4):
347 $for Y in range(HEIGHT_TILE):
348 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk02c1x${ABC[C:C+4]}, vget_low_f32(vi${Y*2}x2), 0);
349
350 $for C in range(0, CHANNEL_TILE, 4):
351 $for Y in range(HEIGHT_TILE):
352 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk02c1x${ABC[C:C+4]}, vget_high_f32(vi${Y*2}x3), 0);
353
354 $for C in range(0, CHANNEL_TILE, 4):
355 const float32x4_t vk12c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 23});
356
357 $for C in range(0, CHANNEL_TILE, 4):
358 $for Y in range(HEIGHT_TILE):
359 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk12c1x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+1}x2), 0);
360
361 $for C in range(0, CHANNEL_TILE, 4):
362 $for Y in range(HEIGHT_TILE):
363 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk12c1x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+1}x3), 0);
364
365 $for C in range(0, CHANNEL_TILE, 4):
366 const float32x4_t vk22c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 24});
367
368 $for C in range(0, CHANNEL_TILE, 4):
369 $for Y in range(HEIGHT_TILE):
370 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk22c1x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+2}x2), 0);
371
372 $for C in range(0, CHANNEL_TILE, 4):
373 $for Y in range(HEIGHT_TILE):
374 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk22c1x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+2}x3), 0);
375
376 $for C in range(0, CHANNEL_TILE, 4):
377 const float32x4_t vk02c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 25});
378
379 $for C in range(0, CHANNEL_TILE, 4):
380 $for Y in range(HEIGHT_TILE):
381 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk02c2x${ABC[C:C+4]}, vget_low_f32(vi${Y*2}x2), 1);
382
383 $for C in range(0, CHANNEL_TILE, 4):
384 $for Y in range(HEIGHT_TILE):
385 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk02c2x${ABC[C:C+4]}, vget_high_f32(vi${Y*2}x3), 1);
386
387 $for C in range(0, CHANNEL_TILE, 4):
388 const float32x4_t vk12c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 26});
389
390 $for C in range(0, CHANNEL_TILE, 4):
391 $for Y in range(HEIGHT_TILE):
392 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk12c2x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+1}x2), 1);
393
394 $for C in range(0, CHANNEL_TILE, 4):
395 $for Y in range(HEIGHT_TILE):
396 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk12c2x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+1}x3), 1);
397
398 $for C in range(0, CHANNEL_TILE, 4):
399 const float32x4_t vk22c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 27});
400
401 $for C in range(0, CHANNEL_TILE, 4):
402 $for Y in range(HEIGHT_TILE):
403 vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk22c2x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+2}x2), 1);
404
405 $for C in range(0, CHANNEL_TILE, 4):
406 $for Y in range(HEIGHT_TILE):
407 vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk22c2x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+2}x3), 1);
408
409 $for Y in range(HEIGHT_TILE + 3):
410 vi${Y}x0 = vi${Y}x3;
411
Marat Dukhan56b10cd2020-05-18 09:35:49 -0700412 $if not FMA:
413 const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
414 const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
415
Marat Dukhance7a3f82020-05-17 21:46:44 -0700416 $for C in range(0, CHANNEL_TILE, 4):
417 $for Y in range(HEIGHT_TILE):
418 vo${Y}x0c${ABC[C:C+4]} = vmaxq_f32(vo${Y}x0c${ABC[C:C+4]}, vmin);
419
420 $for C in range(0, CHANNEL_TILE, 4):
421 $for Y in range(HEIGHT_TILE):
422 vo${Y}x1c${ABC[C:C+4]} = vmaxq_f32(vo${Y}x1c${ABC[C:C+4]}, vmin);
423
424 $for C in range(0, CHANNEL_TILE, 4):
425 $for Y in range(HEIGHT_TILE):
426 vo${Y}x0c${ABC[C:C+4]} = vminq_f32(vo${Y}x0c${ABC[C:C+4]}, vmax);
427
428 $for C in range(0, CHANNEL_TILE, 4):
429 $for Y in range(HEIGHT_TILE):
430 vo${Y}x1c${ABC[C:C+4]} = vminq_f32(vo${Y}x1c${ABC[C:C+4]}, vmax);
431
432 if XNN_LIKELY(c >= ${CHANNEL_TILE}) {
433 $for Y in reversed(range(HEIGHT_TILE)):
434 vst1q_f32(o${Y}, vo${Y}x0c${ABC[0:4]});
435 $for C in range(4, CHANNEL_TILE, 4):
436 vst1q_f32(o${Y} + 4, vo${Y}x0c${ABC[C:C+4]});
437 o${Y} = (float*) ((uintptr_t) o${Y} + output_width_stride);
438
439 $for Y in reversed(range(HEIGHT_TILE)):
440 vst1q_f32(o${Y}, vo${Y}x1c${ABC[0:4]});
441 $for C in range(4, CHANNEL_TILE, 4):
442 vst1q_f32(o${Y} + 4, vo${Y}x1c${ABC[C:C+4]});
443 o${Y} = (float*) ((uintptr_t) o${Y} + output_width_stride);
444 } else {
445 $for Y in range(HEIGHT_TILE):
446 float* o${Y}_tmp = o${Y};
447 $for LOG2_CHANNEL_TILE in reversed(range(CHANNEL_TILE.bit_length())):
448 $if CHANNEL_TILE != 1 << LOG2_CHANNEL_TILE:
449 $if LOG2_CHANNEL_TILE == 1:
450 $for Y in range(HEIGHT_TILE):
451 float32x2_t vo${Y}x0c${ABC[0:2]} = vget_low_f32(vo${Y}x0c${ABC[0:4]});
452 $for Y in range(HEIGHT_TILE):
453 float32x2_t vo${Y}x1c${ABC[0:2]} = vget_low_f32(vo${Y}x1c${ABC[0:4]});
454 if (c & ${1 << LOG2_CHANNEL_TILE}) {
455 $if LOG2_CHANNEL_TILE >= 2:
456 $for C in range(0, 1 << (LOG2_CHANNEL_TILE - 1), 4):
457 $for Y in reversed(range(HEIGHT_TILE)):
458 vst1q_f32((float*) ((uintptr_t) o${Y}_tmp + output_width_stride), vo${Y}x1c${ABC[C:C+4]});
459 vo${Y}x1c${ABC[C:C+4]} = vo${Y}x1c${ABC[C+(1<<LOG2_CHANNEL_TILE):C+(1<<LOG2_CHANNEL_TILE)+4]};
460
461 $for Y in reversed(range(HEIGHT_TILE)):
462 vst1q_f32(o${Y}_tmp, vo${Y}x0c${ABC[C:C+4]}); o${Y}_tmp += 4;
463 vo${Y}x0c${ABC[C:C+4]} = vo${Y}x0c${ABC[C+(1<<LOG2_CHANNEL_TILE):C+(1<<LOG2_CHANNEL_TILE)+4]};
464 $elif LOG2_CHANNEL_TILE == 1:
465 $for Y in reversed(range(HEIGHT_TILE)):
466 vst1_f32((float*) ((uintptr_t) o${Y}_tmp + output_width_stride), vo${Y}x1c${ABC[0:2]});
467 vo${Y}x1c${ABC[0:2]} = vget_high_f32(vo${Y}x1c${ABC[0:4]});
468
469 $for Y in reversed(range(HEIGHT_TILE)):
470 vst1_f32(o${Y}_tmp, vo${Y}x0c${ABC[0:2]}); o${Y}_tmp += 2;
471 vo${Y}x0c${ABC[0:2]} = vget_high_f32(vo${Y}x0c${ABC[0:4]});
472 $elif LOG2_CHANNEL_TILE == 0:
473 $for Y in reversed(range(HEIGHT_TILE)):
474 vst1_lane_f32(o${Y}_tmp, vo${Y}x0c${ABC[0:2]}, 0);
475
476 $for Y in reversed(range(HEIGHT_TILE)):
477 vst1_lane_f32((float*) ((uintptr_t) o${Y}_tmp + output_width_stride), vo${Y}x1c${ABC[0:2]}, 0);
478 }
479
480 $for Y in range(HEIGHT_TILE):
481 o${Y} = (float*) ((uintptr_t) o${Y} + output_width_stride * 2);
482 }
483 }
484 assert(iw < 4);
Marat Dukhan56b10cd2020-05-18 09:35:49 -0700485 if XNN_UNLIKELY(iw & 2) {
Marat Dukhance7a3f82020-05-17 21:46:44 -0700486 float32x4_t vo0c${ABC[0:4]} = vld1q_f32(w);
487 $for C in range(4, CHANNEL_TILE, 4):
488 float32x4_t vo0c${ABC[C:C+4]} = vld1q_f32(w + ${C});
489 $for Y in range(1, HEIGHT_TILE):
490 $for C in range(0, CHANNEL_TILE, 4):
491 float32x4_t vo${Y}c${ABC[C:C+4]} = vo0c${ABC[C:C+4]};
492
493 $for C in range(0, CHANNEL_TILE, 4):
494 const float32x4_t vk00c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE});
495
496 $for C in range(0, CHANNEL_TILE, 4):
497 $for Y in range(HEIGHT_TILE):
498 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_low_f32(vi${Y*2}x0), 1);
499
500 $for C in range(0, CHANNEL_TILE, 4):
501 const float32x4_t vk10c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 2});
502
503 $for C in range(0, CHANNEL_TILE, 4):
504 $for Y in range(HEIGHT_TILE):
505 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c0x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+1}x0), 1);
506
507 $for C in range(0, CHANNEL_TILE, 4):
508 const float32x4_t vk20c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 3});
509
510 $for C in range(0, CHANNEL_TILE, 4):
511 $for Y in range(HEIGHT_TILE):
512 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c0x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+2}x0), 1);
513
514 $for C in range(0, CHANNEL_TILE, 4):
515 const float32x4_t vk00c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 4});
516
517 $for C in range(0, CHANNEL_TILE, 4):
518 $for Y in range(HEIGHT_TILE):
519 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c1x${ABC[C:C+4]}, vget_high_f32(vi${Y*2}x0), 0);
520
521 $for C in range(0, CHANNEL_TILE, 4):
522 const float32x4_t vk10c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 5});
523
524 $for C in range(0, CHANNEL_TILE, 4):
525 $for Y in range(HEIGHT_TILE):
526 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c1x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+1}x0), 0);
527
528 $for C in range(0, CHANNEL_TILE, 4):
529 const float32x4_t vk20c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 6});
530
531 $for C in range(0, CHANNEL_TILE, 4):
532 $for Y in range(HEIGHT_TILE):
533 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c1x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+2}x0), 0);
534
535 $for C in range(0, CHANNEL_TILE, 4):
536 const float32x4_t vk00c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 7});
537
538 $for C in range(0, CHANNEL_TILE, 4):
539 $for Y in range(HEIGHT_TILE):
540 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c2x${ABC[C:C+4]}, vget_high_f32(vi${Y*2}x0), 1);
541
542 $for C in range(0, CHANNEL_TILE, 4):
543 const float32x4_t vk10c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 8});
544
545 $for C in range(0, CHANNEL_TILE, 4):
546 $for Y in range(HEIGHT_TILE):
547 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c2x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+1}x0), 1);
548
549 $for C in range(0, CHANNEL_TILE, 4):
550 const float32x4_t vk20c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 9});
551
552 $for C in range(0, CHANNEL_TILE, 4):
553 $for Y in range(HEIGHT_TILE):
554 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c2x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+2}x0), 1);
555
556 // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
557 $for Y in range(HEIGHT_TILE + 3):
558 const float32x4_t vi${Y}x1 = vld1q_f32(i${Y}); i${Y} += 4;
559
560 $for C in range(0, CHANNEL_TILE, 4):
561 const float32x4_t vk01c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 10});
562
563 $for C in range(0, CHANNEL_TILE, 4):
564 $for Y in range(HEIGHT_TILE):
565 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c0x${ABC[C:C+4]}, vget_low_f32(vi${Y*2}x1), 0);
566
567 $for C in range(0, CHANNEL_TILE, 4):
568 const float32x4_t vk11c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 11});
569
570 $for C in range(0, CHANNEL_TILE, 4):
571 $for Y in range(HEIGHT_TILE):
572 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c0x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+1}x1), 0);
573
574 $for C in range(0, CHANNEL_TILE, 4):
575 const float32x4_t vk21c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 12});
576
577 $for C in range(0, CHANNEL_TILE, 4):
578 $for Y in range(HEIGHT_TILE):
579 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c0x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+2}x1), 0);
580
581 $for C in range(0, CHANNEL_TILE, 4):
582 const float32x4_t vk01c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 13});
583
584 $for C in range(0, CHANNEL_TILE, 4):
585 $for Y in range(HEIGHT_TILE):
586 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c1x${ABC[C:C+4]}, vget_low_f32(vi${Y*2}x1), 1);
587
588 $for C in range(0, CHANNEL_TILE, 4):
589 const float32x4_t vk11c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 14});
590
591 $for C in range(0, CHANNEL_TILE, 4):
592 $for Y in range(HEIGHT_TILE):
593 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c1x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+1}x1), 1);
594
595 $for C in range(0, CHANNEL_TILE, 4):
596 const float32x4_t vk21c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 15});
597
598 $for C in range(0, CHANNEL_TILE, 4):
599 $for Y in range(HEIGHT_TILE):
600 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c1x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+2}x1), 1);
601
602 $for C in range(0, CHANNEL_TILE, 4):
603 const float32x4_t vk01c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 16});
604
605 $for C in range(0, CHANNEL_TILE, 4):
606 $for Y in range(HEIGHT_TILE):
607 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c2x${ABC[C:C+4]}, vget_high_f32(vi${Y*2}x1), 0);
608
609 $for C in range(0, CHANNEL_TILE, 4):
610 const float32x4_t vk11c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 17});
611
612 $for C in range(0, CHANNEL_TILE, 4):
613 $for Y in range(HEIGHT_TILE):
614 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c2x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+1}x1), 0);
615
616 $for C in range(0, CHANNEL_TILE, 4):
617 const float32x4_t vk21c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 18});
618
619 $for C in range(0, CHANNEL_TILE, 4):
620 $for Y in range(HEIGHT_TILE):
621 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c2x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+2}x1), 0);
622
623 $for C in range(0, CHANNEL_TILE, 4):
624 const float32x4_t vk02c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 19});
625
626 $for C in range(0, CHANNEL_TILE, 4):
627 $for Y in range(HEIGHT_TILE):
628 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk02c0x${ABC[C:C+4]}, vget_high_f32(vi${Y*2}x1), 1);
629
630 $for C in range(0, CHANNEL_TILE, 4):
631 const float32x4_t vk12c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 20});
632
633 $for C in range(0, CHANNEL_TILE, 4):
634 $for Y in range(HEIGHT_TILE):
635 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk12c0x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+1}x1), 1);
636
637 $for C in range(0, CHANNEL_TILE, 4):
638 const float32x4_t vk22c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 21});
639
640 $for C in range(0, CHANNEL_TILE, 4):
641 $for Y in range(HEIGHT_TILE):
642 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk22c0x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+2}x1), 1);
643
644 // viMx2 = ( iM2c2, iM2c1 )
645 $for Y in range(HEIGHT_TILE + 3):
646 const float32x2_t vi${Y}x2 = vld1_f32(i${Y}); i${Y} += 2;
647
648 $for C in range(0, CHANNEL_TILE, 4):
649 const float32x4_t vk02c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 22});
650
651 $for C in range(0, CHANNEL_TILE, 4):
652 $for Y in range(HEIGHT_TILE):
653 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk02c1x${ABC[C:C+4]}, vi${Y*2}x2, 0);
654
655 $for C in range(0, CHANNEL_TILE, 4):
656 const float32x4_t vk12c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 23});
657
658 $for C in range(0, CHANNEL_TILE, 4):
659 $for Y in range(HEIGHT_TILE):
660 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk12c1x${ABC[C:C+4]}, vi${Y*2+1}x2, 0);
661
662 $for C in range(0, CHANNEL_TILE, 4):
663 const float32x4_t vk22c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 24});
664
665 $for C in range(0, CHANNEL_TILE, 4):
666 $for Y in range(HEIGHT_TILE):
667 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk22c1x${ABC[C:C+4]}, vi${Y*2+2}x2, 0);
668
669 $for C in range(0, CHANNEL_TILE, 4):
670 const float32x4_t vk02c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 25});
671
672 $for C in range(0, CHANNEL_TILE, 4):
673 $for Y in range(HEIGHT_TILE):
674 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk02c2x${ABC[C:C+4]}, vi${Y*2}x2, 1);
675
676 $for C in range(0, CHANNEL_TILE, 4):
677 const float32x4_t vk12c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 26});
678
679 $for C in range(0, CHANNEL_TILE, 4):
680 $for Y in range(HEIGHT_TILE):
681 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk12c2x${ABC[C:C+4]}, vi${Y*2+1}x2, 1);
682
683 $for C in range(0, CHANNEL_TILE, 4):
684 const float32x4_t vk22c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 27});
685
686 $for C in range(0, CHANNEL_TILE, 4):
687 $for Y in range(HEIGHT_TILE):
688 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk22c2x${ABC[C:C+4]}, vi${Y*2+2}x2, 1);
689
690 $for Y in range(HEIGHT_TILE + 3):
691 vi${Y}x0 = vcombine_f32(vget_high_f32(vi${Y}x1), vi${Y}x2);
692
Marat Dukhan56b10cd2020-05-18 09:35:49 -0700693 $if not FMA:
694 const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
695 const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
696
Marat Dukhance7a3f82020-05-17 21:46:44 -0700697 $for C in range(0, CHANNEL_TILE, 4):
698 $for Y in range(HEIGHT_TILE):
699 vo${Y}c${ABC[C:C+4]} = vmaxq_f32(vo${Y}c${ABC[C:C+4]}, vmin);
700
701 $for C in range(0, CHANNEL_TILE, 4):
702 $for Y in range(HEIGHT_TILE):
703 vo${Y}c${ABC[C:C+4]} = vminq_f32(vo${Y}c${ABC[C:C+4]}, vmax);
704
705 if XNN_LIKELY(c >= ${CHANNEL_TILE}) {
706 $for Y in reversed(range(HEIGHT_TILE)):
707 vst1q_f32(o${Y}, vo${Y}c${ABC[0:4]});
708 $for C in range(4, CHANNEL_TILE, 4):
709 vst1q_f32(o${Y} + 4, vo${Y}c${ABC[C:C+4]});
710 o${Y} = (float*) ((uintptr_t) o${Y} + output_width_stride);
711 } else {
712 $for Y in range(HEIGHT_TILE):
713 float* o${Y}_tmp = o${Y};
714 $for LOG2_CHANNEL_TILE in reversed(range(CHANNEL_TILE.bit_length())):
715 $if CHANNEL_TILE != 1 << LOG2_CHANNEL_TILE:
716 $if LOG2_CHANNEL_TILE == 1:
717 $for Y in range(HEIGHT_TILE):
718 float32x2_t vo${Y}c${ABC[0:2]} = vget_low_f32(vo${Y}c${ABC[0:4]});
719 if (c & ${1 << LOG2_CHANNEL_TILE}) {
720 $if LOG2_CHANNEL_TILE >= 2:
721 $for C in range(0, 1 << (LOG2_CHANNEL_TILE - 1), 4):
722 $for Y in reversed(range(HEIGHT_TILE)):
723 vst1q_f32(o${Y}_tmp, vo${Y}c${ABC[C:C+4]}); o${Y}_tmp += 4;
724 vo${Y}c${ABC[C:C+4]} = vo${Y}c${ABC[C+(1<<LOG2_CHANNEL_TILE):C+(1<<LOG2_CHANNEL_TILE)+4]};
725 $elif LOG2_CHANNEL_TILE == 1:
726 $for Y in reversed(range(HEIGHT_TILE)):
727 vst1_f32(o${Y}_tmp, vo${Y}c${ABC[0:2]}); o${Y}_tmp += 2;
728 vo${Y}c${ABC[0:2]} = vget_high_f32(vo${Y}c${ABC[0:4]});
729 $elif LOG2_CHANNEL_TILE == 0:
730 $for Y in reversed(range(HEIGHT_TILE)):
731 vst1_lane_f32(o${Y}_tmp, vo${Y}c${ABC[0:2]}, 0);
732 }
733
734 $for Y in range(HEIGHT_TILE):
735 o${Y} = (float*) ((uintptr_t) o${Y} + output_width_stride);
736 }
737 }
738 if XNN_UNLIKELY(iw & 1) {
739 float32x4_t vo0c${ABC[0:4]} = vld1q_f32(w);
740 $for C in range(4, CHANNEL_TILE, 4):
741 float32x4_t vo0c${ABC[C:C+4]} = vld1q_f32(w + ${C});
742 $for Y in range(1, HEIGHT_TILE):
743 $for C in range(0, CHANNEL_TILE, 4):
744 float32x4_t vo${Y}c${ABC[C:C+4]} = vo0c${ABC[C:C+4]};
745
746 $for C in range(0, CHANNEL_TILE, 4):
747 const float32x4_t vk00c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE});
748
749 $for C in range(0, CHANNEL_TILE, 4):
750 $for Y in range(HEIGHT_TILE):
751 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_low_f32(vi${Y*2}x0), 1);
752
753 $for C in range(0, CHANNEL_TILE, 4):
754 const float32x4_t vk10c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 2});
755
756 $for C in range(0, CHANNEL_TILE, 4):
757 $for Y in range(HEIGHT_TILE):
758 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c0x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+1}x0), 1);
759
760 $for C in range(0, CHANNEL_TILE, 4):
761 const float32x4_t vk20c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 3});
762
763 $for C in range(0, CHANNEL_TILE, 4):
764 $for Y in range(HEIGHT_TILE):
765 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c0x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+2}x0), 1);
766
767 $for C in range(0, CHANNEL_TILE, 4):
768 const float32x4_t vk00c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 4});
769
770 $for C in range(0, CHANNEL_TILE, 4):
771 $for Y in range(HEIGHT_TILE):
772 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c1x${ABC[C:C+4]}, vget_high_f32(vi${Y*2}x0), 0);
773
774 $for C in range(0, CHANNEL_TILE, 4):
775 const float32x4_t vk10c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 5});
776
777 $for C in range(0, CHANNEL_TILE, 4):
778 $for Y in range(HEIGHT_TILE):
779 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c1x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+1}x0), 0);
780
781 $for C in range(0, CHANNEL_TILE, 4):
782 const float32x4_t vk20c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 6});
783
784 $for C in range(0, CHANNEL_TILE, 4):
785 $for Y in range(HEIGHT_TILE):
786 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c1x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+2}x0), 0);
787
788 $for C in range(0, CHANNEL_TILE, 4):
789 const float32x4_t vk00c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 7});
790
791 $for C in range(0, CHANNEL_TILE, 4):
792 $for Y in range(HEIGHT_TILE):
793 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c2x${ABC[C:C+4]}, vget_high_f32(vi${Y*2}x0), 1);
794
795 $for C in range(0, CHANNEL_TILE, 4):
796 const float32x4_t vk10c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 8});
797
798 $for C in range(0, CHANNEL_TILE, 4):
799 $for Y in range(HEIGHT_TILE):
800 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c2x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+1}x0), 1);
801
802 $for C in range(0, CHANNEL_TILE, 4):
803 const float32x4_t vk20c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 9});
804
805 $for C in range(0, CHANNEL_TILE, 4):
806 $for Y in range(HEIGHT_TILE):
807 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c2x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+2}x0), 1);
808
809 // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
810 $for Y in range(HEIGHT_TILE + 3):
811 const float32x4_t vi${Y}x1 = vld1q_f32(i${Y}); i${Y} += 3;
812
813 $for C in range(0, CHANNEL_TILE, 4):
814 const float32x4_t vk01c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 10});
815
816 $for C in range(0, CHANNEL_TILE, 4):
817 $for Y in range(HEIGHT_TILE):
818 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c0x${ABC[C:C+4]}, vget_low_f32(vi${Y*2}x1), 0);
819
820 $for C in range(0, CHANNEL_TILE, 4):
821 const float32x4_t vk11c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 11});
822
823 $for C in range(0, CHANNEL_TILE, 4):
824 $for Y in range(HEIGHT_TILE):
825 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c0x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+1}x1), 0);
826
827 $for C in range(0, CHANNEL_TILE, 4):
828 const float32x4_t vk21c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 12});
829
830 $for C in range(0, CHANNEL_TILE, 4):
831 $for Y in range(HEIGHT_TILE):
832 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c0x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+2}x1), 0);
833
834 $for C in range(0, CHANNEL_TILE, 4):
835 const float32x4_t vk01c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 13});
836
837 $for C in range(0, CHANNEL_TILE, 4):
838 $for Y in range(HEIGHT_TILE):
839 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c1x${ABC[C:C+4]}, vget_low_f32(vi${Y*2}x1), 1);
840
841 $for C in range(0, CHANNEL_TILE, 4):
842 const float32x4_t vk11c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 14});
843
844 $for C in range(0, CHANNEL_TILE, 4):
845 $for Y in range(HEIGHT_TILE):
846 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c1x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+1}x1), 1);
847
848 $for C in range(0, CHANNEL_TILE, 4):
849 const float32x4_t vk21c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 15});
850
851 $for C in range(0, CHANNEL_TILE, 4):
852 $for Y in range(HEIGHT_TILE):
853 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c1x${ABC[C:C+4]}, vget_low_f32(vi${Y*2+2}x1), 1);
854
855 $for C in range(0, CHANNEL_TILE, 4):
856 const float32x4_t vk01c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 16});
857
858 $for C in range(0, CHANNEL_TILE, 4):
859 $for Y in range(HEIGHT_TILE):
860 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c2x${ABC[C:C+4]}, vget_high_f32(vi${Y*2}x1), 0);
861
862 $for C in range(0, CHANNEL_TILE, 4):
863 const float32x4_t vk11c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 17});
864
865 $for C in range(0, CHANNEL_TILE, 4):
866 $for Y in range(HEIGHT_TILE):
867 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c2x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+1}x1), 0);
868
869 $for C in range(0, CHANNEL_TILE, 4):
870 const float32x4_t vk21c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 18});
871
872 $for C in range(0, CHANNEL_TILE, 4):
873 $for Y in range(HEIGHT_TILE):
874 vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c2x${ABC[C:C+4]}, vget_high_f32(vi${Y*2+2}x1), 0);
875
Marat Dukhan56b10cd2020-05-18 09:35:49 -0700876 $if not FMA:
877 const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
878 const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
879
Marat Dukhance7a3f82020-05-17 21:46:44 -0700880 $for C in range(0, CHANNEL_TILE, 4):
881 $for Y in range(HEIGHT_TILE):
882 vo${Y}c${ABC[C:C+4]} = vmaxq_f32(vo${Y}c${ABC[C:C+4]}, vmin);
883
884 $for C in range(0, CHANNEL_TILE, 4):
885 $for Y in range(HEIGHT_TILE):
886 vo${Y}c${ABC[C:C+4]} = vminq_f32(vo${Y}c${ABC[C:C+4]}, vmax);
887
888 if XNN_LIKELY(c >= ${CHANNEL_TILE}) {
889 $for Y in reversed(range(HEIGHT_TILE)):
890 vst1q_f32(o${Y}, vo${Y}c${ABC[0:4]});
891 $for C in range(4, CHANNEL_TILE, 4):
892 vst1q_f32(o${Y} + 4, vo${Y}c${ABC[C:C+4]});
893 o${Y} = (float*) ((uintptr_t) o${Y} + output_width_stride);
894 } else {
895 $for Y in range(HEIGHT_TILE):
896 float* o${Y}_tmp = o${Y};
897 $for LOG2_CHANNEL_TILE in reversed(range(CHANNEL_TILE.bit_length())):
898 $if CHANNEL_TILE != 1 << LOG2_CHANNEL_TILE:
899 $if LOG2_CHANNEL_TILE == 1:
900 $for Y in range(HEIGHT_TILE):
901 float32x2_t vo${Y}c${ABC[0:2]} = vget_low_f32(vo${Y}c${ABC[0:4]});
902 if (c & ${1 << LOG2_CHANNEL_TILE}) {
903 $if LOG2_CHANNEL_TILE >= 2:
904 $for C in range(0, 1 << (LOG2_CHANNEL_TILE - 1), 4):
905 $for Y in reversed(range(HEIGHT_TILE)):
906 vst1q_f32(o${Y}_tmp, vo${Y}c${ABC[C:C+4]}); o${Y}_tmp += 4;
907 vo${Y}c${ABC[C:C+4]} = vo${Y}c${ABC[C+(1<<LOG2_CHANNEL_TILE):C+(1<<LOG2_CHANNEL_TILE)+4]};
908 $elif LOG2_CHANNEL_TILE == 1:
909 $for Y in reversed(range(HEIGHT_TILE)):
910 vst1_f32(o${Y}_tmp, vo${Y}c${ABC[0:2]}); o${Y}_tmp += 2;
911 vo${Y}c${ABC[0:2]} = vget_high_f32(vo${Y}c${ABC[0:4]});
912 $elif LOG2_CHANNEL_TILE == 0:
913 $for Y in reversed(range(HEIGHT_TILE)):
914 vst1_lane_f32(o${Y}_tmp, vo${Y}c${ABC[0:2]}, 0);
915 }
916 $for Y in range(HEIGHT_TILE):
917 o${Y} = (float*) ((uintptr_t) o${Y} + output_width_stride);
918 }
919 }
920 // Move output pointers back to the position of the first pixel in a row,
921 // and forward to the next block of output channels
922 o0 = (float*) ((uintptr_t) o0 - output_channel_decrement);
923 o1 = (float*) ((uintptr_t) o1 - output_channel_decrement);
924 // Revert input pointers to the position of the first pixel in a row
925 i0 = (const float*) ((uintptr_t) i0 - input_width_decrement);
926 i1 = (const float*) ((uintptr_t) i1 - input_width_decrement);
927 i2 = (const float*) ((uintptr_t) i2 - input_width_decrement);
928 i3 = (const float*) ((uintptr_t) i3 - input_width_decrement);
929 i4 = (const float*) ((uintptr_t) i4 - input_width_decrement);
930 // Move to the block of weights for the next ${CHANNEL_TILE} output channels
931 w += ${CHANNEL_TILE * 28};
932 c = doz(c, ${CHANNEL_TILE});
933 } while (c != 0);
934 // Move output pointers back to the position of the first channel, and forward to the next block of rows
935 o0 = (float*) ((uintptr_t) o0 + output_height_increment);
936 o1 = (float*) ((uintptr_t) o1 + output_height_increment);
937 // Move input pointers forward to the next four rows
938 i0 = i4;
939 i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
940 i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
941 i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
942 i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
943 }
944}