// Auto-generated file. Do not edit!
//   Template: src/f32-dwconv2d-chw/5x5s2p2-wasmsimd-loadsplat.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <string.h>

#include <wasm_simd128.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>


19void xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2(
20
21 size_t input_height,
22 size_t input_width,
23 const float* input,
24 const float* weights,
25 const float* zero,
26 float* output,
27 uint32_t padding_top,
28 const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
29{
30 assert(input_height != 0);
31 assert(input_width != 0);
32 assert(input_width % sizeof(float) == 0);
33 assert(padding_top >= 1);
34 assert(padding_top <= 2);
35
36 const v128_t vmask_even = wasm_v128_load(params->scalar.mask_even);
37 const v128_t vmask_odd = wasm_v128_load(params->scalar.mask_odd);
38 const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
39 const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min);
40
41 const v128_t vbias = wasm_v32x4_load_splat(weights);
42 const v128_t vk00 = wasm_v32x4_load_splat(weights + 1);
43 const v128_t vk01 = wasm_v32x4_load_splat(weights + 2);
44 const v128_t vk02 = wasm_v32x4_load_splat(weights + 3);
45 const v128_t vk03 = wasm_v32x4_load_splat(weights + 4);
46 const v128_t vk04 = wasm_v32x4_load_splat(weights + 5);
47 const v128_t vk10 = wasm_v32x4_load_splat(weights + 6);
48 const v128_t vk11 = wasm_v32x4_load_splat(weights + 7);
49 const v128_t vk12 = wasm_v32x4_load_splat(weights + 8);
50 const v128_t vk13 = wasm_v32x4_load_splat(weights + 9);
51 const v128_t vk14 = wasm_v32x4_load_splat(weights + 10);
52 const v128_t vk20 = wasm_v32x4_load_splat(weights + 11);
53 const v128_t vk21 = wasm_v32x4_load_splat(weights + 12);
54 const v128_t vk22 = wasm_v32x4_load_splat(weights + 13);
55 const v128_t vk23 = wasm_v32x4_load_splat(weights + 14);
56 const v128_t vk24 = wasm_v32x4_load_splat(weights + 15);
57 const v128_t vk30 = wasm_v32x4_load_splat(weights + 16);
58 const v128_t vk31 = wasm_v32x4_load_splat(weights + 17);
59 const v128_t vk32 = wasm_v32x4_load_splat(weights + 18);
60 const v128_t vk33 = wasm_v32x4_load_splat(weights + 19);
61 const v128_t vk34 = wasm_v32x4_load_splat(weights + 20);
62 const v128_t vk40 = wasm_v32x4_load_splat(weights + 21);
63 const v128_t vk41 = wasm_v32x4_load_splat(weights + 22);
64 const v128_t vk42 = wasm_v32x4_load_splat(weights + 23);
65 const v128_t vk43 = wasm_v32x4_load_splat(weights + 24);
66 const v128_t vk44 = wasm_v32x4_load_splat(weights + 25);
67
68 const v128_t vzero = wasm_f32x4_splat(0.0f);
69
70 const uint32_t padding_top_less_1 = padding_top - 1;
71 const size_t input_decrement = round_up_po2(input_width, 8 * sizeof(float));
72
73 const float* i0 = zero;
74 const float* i1 = (const float*) ((uintptr_t) input - ((-padding_top_less_1) & input_width));
75 const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
76 if XNN_UNPREDICTABLE(padding_top_less_1 != 0) {
77 i1 = zero;
78 }
79 const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
80 const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
81 const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
82 const float* i6 = (const float*) ((uintptr_t) i5 + input_width);
83 const float* i7 = (const float*) ((uintptr_t) i6 + input_width);
84 const float* i8 = (const float*) ((uintptr_t) i7 + input_width);
85
86 const size_t output_width = round_down_po2((input_width + (2 /* padding */ - 3 /* kernel size */ + 2 /* subsampling */) * sizeof(float)) / 2, sizeof(float));
87
88 float* o0 = output;
89 float* o1 = (float*) ((uintptr_t) o0 + output_width);
90 float* o2 = (float*) ((uintptr_t) o1 + output_width);
91
92 size_t padded_input_height = input_height + (padding_top_less_1 + 1) + 2 /* padding bottom */;
93 size_t output_height = (padded_input_height - 5 /* kernel size */ + 2 /* subsampling */) / 2;
94 do {
95 if XNN_UNPREDICTABLE(padded_input_height < 6) {
96 i3 = zero;
97 }
98 if XNN_UNPREDICTABLE(padded_input_height < 7) {
99 i4 = zero;
100 o1 = o0;
101 }
102 if XNN_UNPREDICTABLE(padded_input_height < 8) {
103 i5 = zero;
104 }
105 if XNN_UNPREDICTABLE(padded_input_height < 9) {
106 i6 = zero;
107 o2 = o1;
108 }
109 if XNN_UNPREDICTABLE(padded_input_height < 10) {
110 i7 = zero;
111 }
112 if XNN_UNPREDICTABLE(padded_input_height < 11) {
113 i8 = zero;
114 }
115
116 v128_t vi0x0246 = vzero;
117 v128_t vi1x0246 = vzero;
118 v128_t vi2x0246 = vzero;
119 v128_t vi3x0246 = vzero;
120 v128_t vi4x0246 = vzero;
121 v128_t vi5x0246 = vzero;
122 v128_t vi6x0246 = vzero;
123 v128_t vi7x0246 = vzero;
124 v128_t vi8x0246 = vzero;
125
126 v128_t vi0x1357 = vzero;
127 v128_t vi1x1357 = vzero;
128 v128_t vi2x1357 = vzero;
129 v128_t vi3x1357 = vzero;
130 v128_t vi4x1357 = vzero;
131 v128_t vi5x1357 = vzero;
132 v128_t vi6x1357 = vzero;
133 v128_t vi7x1357 = vzero;
134 v128_t vi8x1357 = vzero;
135
136 const v128_t vi0x89AB = wasm_v128_load(i0);
137 const v128_t vi0xCDEF = wasm_v128_load(i0 + 4);
138 i0 += 8;
139 const v128_t vi1x89AB = wasm_v128_load(i1);
140 const v128_t vi1xCDEF = wasm_v128_load(i1 + 4);
141 i1 += 8;
142 const v128_t vi2x89AB = wasm_v128_load(i2);
143 const v128_t vi2xCDEF = wasm_v128_load(i2 + 4);
144 i2 += 8;
145 const v128_t vi3x89AB = wasm_v128_load(i3);
146 const v128_t vi3xCDEF = wasm_v128_load(i3 + 4);
147 i3 += 8;
148 const v128_t vi4x89AB = wasm_v128_load(i4);
149 const v128_t vi4xCDEF = wasm_v128_load(i4 + 4);
150 i4 += 8;
151 const v128_t vi5x89AB = wasm_v128_load(i5);
152 const v128_t vi5xCDEF = wasm_v128_load(i5 + 4);
153 i5 += 8;
154 const v128_t vi6x89AB = wasm_v128_load(i6);
155 const v128_t vi6xCDEF = wasm_v128_load(i6 + 4);
156 i6 += 8;
157 const v128_t vi7x89AB = wasm_v128_load(i7);
158 const v128_t vi7xCDEF = wasm_v128_load(i7 + 4);
159 i7 += 8;
160 const v128_t vi8x89AB = wasm_v128_load(i8);
161 const v128_t vi8xCDEF = wasm_v128_load(i8 + 4);
162 i8 += 8;
163
164 v128_t vi0x8ACE = wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 0, 2, 4, 6);
165 v128_t vi0x9BDF = wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 1, 3, 5, 7);
166 v128_t vi1x8ACE = wasm_v32x4_shuffle(vi1x89AB, vi1xCDEF, 0, 2, 4, 6);
167 v128_t vi1x9BDF = wasm_v32x4_shuffle(vi1x89AB, vi1xCDEF, 1, 3, 5, 7);
168 v128_t vi2x8ACE = wasm_v32x4_shuffle(vi2x89AB, vi2xCDEF, 0, 2, 4, 6);
169 v128_t vi2x9BDF = wasm_v32x4_shuffle(vi2x89AB, vi2xCDEF, 1, 3, 5, 7);
170 v128_t vi3x8ACE = wasm_v32x4_shuffle(vi3x89AB, vi3xCDEF, 0, 2, 4, 6);
171 v128_t vi3x9BDF = wasm_v32x4_shuffle(vi3x89AB, vi3xCDEF, 1, 3, 5, 7);
172 v128_t vi4x8ACE = wasm_v32x4_shuffle(vi4x89AB, vi4xCDEF, 0, 2, 4, 6);
173 v128_t vi4x9BDF = wasm_v32x4_shuffle(vi4x89AB, vi4xCDEF, 1, 3, 5, 7);
174 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6);
175 v128_t vi5x9BDF = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 1, 3, 5, 7);
176 v128_t vi6x8ACE = wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 0, 2, 4, 6);
177 v128_t vi6x9BDF = wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 1, 3, 5, 7);
178 v128_t vi7x8ACE = wasm_v32x4_shuffle(vi7x89AB, vi7xCDEF, 0, 2, 4, 6);
179 v128_t vi7x9BDF = wasm_v32x4_shuffle(vi7x89AB, vi7xCDEF, 1, 3, 5, 7);
180 v128_t vi8x8ACE = wasm_v32x4_shuffle(vi8x89AB, vi8xCDEF, 0, 2, 4, 6);
181 v128_t vi8x9BDF = wasm_v32x4_shuffle(vi8x89AB, vi8xCDEF, 1, 3, 5, 7);
182
183 size_t w = input_width;
184 for (; w > 8 * sizeof(float); w -= 8 * sizeof(float)) {
185 v128_t vo0p0 = vbias;
186 v128_t vo1p0 = vbias;
187 v128_t vo2p0 = vbias;
188
189 v128_t vo0p1 = wasm_f32x4_mul(vi0x8ACE, vk02);
190 v128_t vo1p1 = wasm_f32x4_mul(vi2x8ACE, vk02);
191 v128_t vo2p1 = wasm_f32x4_mul(vi4x8ACE, vk02);
192
193 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x8ACE, vk12));
194 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x8ACE, vk12));
195 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, vk12));
196
197 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, vk22));
198 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, vk22));
199 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x8ACE, vk22));
200
201 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x8ACE, vk32));
202 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, vk32));
203 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi7x8ACE, vk32));
204
205 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x8ACE, vk42));
206 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi6x8ACE, vk42));
207 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8x8ACE, vk42));
208
209 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x9BDF, vk03));
210 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x9BDF, vk03));
211 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x9BDF, vk03));
212
213 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x9BDF, vk13));
214 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x9BDF, vk13));
215 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x9BDF, vk13));
216
217 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x9BDF, vk23));
218 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x9BDF, vk23));
219 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x9BDF, vk23));
220
221 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x9BDF, vk33));
222 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x9BDF, vk33));
223 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi7x9BDF, vk33));
224
225 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x9BDF, vk43));
226 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi6x9BDF, vk43));
227 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi8x9BDF, vk43));
228
229 const v128_t vi0x68AC = wasm_v32x4_shuffle(vi0x0246, vi0x8ACE, 3, 4, 5, 6);
230 vi0x0246 = vi0x8ACE;
231 const v128_t vi1x68AC = wasm_v32x4_shuffle(vi1x0246, vi1x8ACE, 3, 4, 5, 6);
232 vi1x0246 = vi1x8ACE;
233 const v128_t vi2x68AC = wasm_v32x4_shuffle(vi2x0246, vi2x8ACE, 3, 4, 5, 6);
234 vi2x0246 = vi2x8ACE;
235 const v128_t vi3x68AC = wasm_v32x4_shuffle(vi3x0246, vi3x8ACE, 3, 4, 5, 6);
236 vi3x0246 = vi3x8ACE;
237 const v128_t vi4x68AC = wasm_v32x4_shuffle(vi4x0246, vi4x8ACE, 3, 4, 5, 6);
238 vi4x0246 = vi4x8ACE;
239 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6);
240 vi5x0246 = vi5x8ACE;
241 const v128_t vi6x68AC = wasm_v32x4_shuffle(vi6x0246, vi6x8ACE, 3, 4, 5, 6);
242 vi6x0246 = vi6x8ACE;
243 const v128_t vi7x68AC = wasm_v32x4_shuffle(vi7x0246, vi7x8ACE, 3, 4, 5, 6);
244 vi7x0246 = vi7x8ACE;
245 const v128_t vi8x68AC = wasm_v32x4_shuffle(vi8x0246, vi8x8ACE, 3, 4, 5, 6);
246 vi8x0246 = vi8x8ACE;
247
248 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x68AC, vk00));
249 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x68AC, vk00));
250 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x68AC, vk00));
251
252 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x68AC, vk10));
253 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi3x68AC, vk10));
254 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x68AC, vk10));
255
256 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x68AC, vk20));
257 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x68AC, vk20));
258 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x68AC, vk20));
259
260 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x68AC, vk30));
261 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x68AC, vk30));
262 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi7x68AC, vk30));
263
264 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x68AC, vk40));
265 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi6x68AC, vk40));
266 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8x68AC, vk40));
267
268 const v128_t vi0x79BD = wasm_v32x4_shuffle(vi0x1357, vi0x9BDF, 3, 4, 5, 6);
269 vi0x1357 = vi0x9BDF;
270 const v128_t vi1x79BD = wasm_v32x4_shuffle(vi1x1357, vi1x9BDF, 3, 4, 5, 6);
271 vi1x1357 = vi1x9BDF;
272 const v128_t vi2x79BD = wasm_v32x4_shuffle(vi2x1357, vi2x9BDF, 3, 4, 5, 6);
273 vi2x1357 = vi2x9BDF;
274 const v128_t vi3x79BD = wasm_v32x4_shuffle(vi3x1357, vi3x9BDF, 3, 4, 5, 6);
275 vi3x1357 = vi3x9BDF;
276 const v128_t vi4x79BD = wasm_v32x4_shuffle(vi4x1357, vi4x9BDF, 3, 4, 5, 6);
277 vi4x1357 = vi4x9BDF;
278 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6);
279 vi5x1357 = vi5x9BDF;
280 const v128_t vi6x79BD = wasm_v32x4_shuffle(vi6x1357, vi6x9BDF, 3, 4, 5, 6);
281 vi6x1357 = vi6x9BDF;
282 const v128_t vi7x79BD = wasm_v32x4_shuffle(vi7x1357, vi7x9BDF, 3, 4, 5, 6);
283 vi7x1357 = vi7x9BDF;
284 const v128_t vi8x79BD = wasm_v32x4_shuffle(vi8x1357, vi8x9BDF, 3, 4, 5, 6);
285 vi8x1357 = vi8x9BDF;
286
287 const v128_t vi0xGHIJ = wasm_v128_load(i0);
288 const v128_t vi0xKLMN = wasm_v128_load(i0 + 4);
289 i0 += 8;
290 const v128_t vi1xGHIJ = wasm_v128_load(i1);
291 const v128_t vi1xKLMN = wasm_v128_load(i1 + 4);
292 i1 += 8;
293 const v128_t vi2xGHIJ = wasm_v128_load(i2);
294 const v128_t vi2xKLMN = wasm_v128_load(i2 + 4);
295 i2 += 8;
296 const v128_t vi3xGHIJ = wasm_v128_load(i3);
297 const v128_t vi3xKLMN = wasm_v128_load(i3 + 4);
298 i3 += 8;
299 const v128_t vi4xGHIJ = wasm_v128_load(i4);
300 const v128_t vi4xKLMN = wasm_v128_load(i4 + 4);
301 i4 += 8;
302 const v128_t vi5xGHIJ = wasm_v128_load(i5);
303 const v128_t vi5xKLMN = wasm_v128_load(i5 + 4);
304 i5 += 8;
305 const v128_t vi6xGHIJ = wasm_v128_load(i6);
306 const v128_t vi6xKLMN = wasm_v128_load(i6 + 4);
307 i6 += 8;
308 const v128_t vi7xGHIJ = wasm_v128_load(i7);
309 const v128_t vi7xKLMN = wasm_v128_load(i7 + 4);
310 i7 += 8;
311 const v128_t vi8xGHIJ = wasm_v128_load(i8);
312 const v128_t vi8xKLMN = wasm_v128_load(i8 + 4);
313 i8 += 8;
314
315 const v128_t vi0xGIKM = wasm_v32x4_shuffle(vi0xGHIJ, vi0xKLMN, 0, 2, 4, 6);
316 const v128_t vi0xHJLN = wasm_v32x4_shuffle(vi0xGHIJ, vi0xKLMN, 1, 3, 5, 7);
317 const v128_t vi1xGIKM = wasm_v32x4_shuffle(vi1xGHIJ, vi1xKLMN, 0, 2, 4, 6);
318 const v128_t vi1xHJLN = wasm_v32x4_shuffle(vi1xGHIJ, vi1xKLMN, 1, 3, 5, 7);
319 const v128_t vi2xGIKM = wasm_v32x4_shuffle(vi2xGHIJ, vi2xKLMN, 0, 2, 4, 6);
320 const v128_t vi2xHJLN = wasm_v32x4_shuffle(vi2xGHIJ, vi2xKLMN, 1, 3, 5, 7);
321 const v128_t vi3xGIKM = wasm_v32x4_shuffle(vi3xGHIJ, vi3xKLMN, 0, 2, 4, 6);
322 const v128_t vi3xHJLN = wasm_v32x4_shuffle(vi3xGHIJ, vi3xKLMN, 1, 3, 5, 7);
323 const v128_t vi4xGIKM = wasm_v32x4_shuffle(vi4xGHIJ, vi4xKLMN, 0, 2, 4, 6);
324 const v128_t vi4xHJLN = wasm_v32x4_shuffle(vi4xGHIJ, vi4xKLMN, 1, 3, 5, 7);
325 const v128_t vi5xGIKM = wasm_v32x4_shuffle(vi5xGHIJ, vi5xKLMN, 0, 2, 4, 6);
326 const v128_t vi5xHJLN = wasm_v32x4_shuffle(vi5xGHIJ, vi5xKLMN, 1, 3, 5, 7);
327 const v128_t vi6xGIKM = wasm_v32x4_shuffle(vi6xGHIJ, vi6xKLMN, 0, 2, 4, 6);
328 const v128_t vi6xHJLN = wasm_v32x4_shuffle(vi6xGHIJ, vi6xKLMN, 1, 3, 5, 7);
329 const v128_t vi7xGIKM = wasm_v32x4_shuffle(vi7xGHIJ, vi7xKLMN, 0, 2, 4, 6);
330 const v128_t vi7xHJLN = wasm_v32x4_shuffle(vi7xGHIJ, vi7xKLMN, 1, 3, 5, 7);
331 const v128_t vi8xGIKM = wasm_v32x4_shuffle(vi8xGHIJ, vi8xKLMN, 0, 2, 4, 6);
332 const v128_t vi8xHJLN = wasm_v32x4_shuffle(vi8xGHIJ, vi8xKLMN, 1, 3, 5, 7);
333
334 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x79BD, vk01));
335 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x79BD, vk01));
336 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x79BD, vk01));
337
338 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x79BD, vk11));
339 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x79BD, vk11));
340 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x79BD, vk11));
341
342 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x79BD, vk21));
343 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x79BD, vk21));
344 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x79BD, vk21));
345
346 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x79BD, vk31));
347 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, vk31));
348 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi7x79BD, vk31));
349
350 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x79BD, vk41));
351 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi6x79BD, vk41));
352 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi8x79BD, vk41));
353
354 const v128_t vi0xACEG = wasm_v32x4_shuffle(vi0x8ACE, vi0xGIKM, 1, 2, 3, 4);
355 vi0x8ACE = vi0xGIKM;
356 vi0x9BDF = vi0xHJLN;
357 const v128_t vi1xACEG = wasm_v32x4_shuffle(vi1x8ACE, vi1xGIKM, 1, 2, 3, 4);
358 vi1x8ACE = vi1xGIKM;
359 vi1x9BDF = vi1xHJLN;
360 const v128_t vi2xACEG = wasm_v32x4_shuffle(vi2x8ACE, vi2xGIKM, 1, 2, 3, 4);
361 vi2x8ACE = vi2xGIKM;
362 vi2x9BDF = vi2xHJLN;
363 const v128_t vi3xACEG = wasm_v32x4_shuffle(vi3x8ACE, vi3xGIKM, 1, 2, 3, 4);
364 vi3x8ACE = vi3xGIKM;
365 vi3x9BDF = vi3xHJLN;
366 const v128_t vi4xACEG = wasm_v32x4_shuffle(vi4x8ACE, vi4xGIKM, 1, 2, 3, 4);
367 vi4x8ACE = vi4xGIKM;
368 vi4x9BDF = vi4xHJLN;
369 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4);
370 vi5x8ACE = vi5xGIKM;
371 vi5x9BDF = vi5xHJLN;
372 const v128_t vi6xACEG = wasm_v32x4_shuffle(vi6x8ACE, vi6xGIKM, 1, 2, 3, 4);
373 vi6x8ACE = vi6xGIKM;
374 vi6x9BDF = vi6xHJLN;
375 const v128_t vi7xACEG = wasm_v32x4_shuffle(vi7x8ACE, vi7xGIKM, 1, 2, 3, 4);
376 vi7x8ACE = vi7xGIKM;
377 vi7x9BDF = vi7xHJLN;
378 const v128_t vi8xACEG = wasm_v32x4_shuffle(vi8x8ACE, vi8xGIKM, 1, 2, 3, 4);
379 vi8x8ACE = vi8xGIKM;
380 vi8x9BDF = vi8xHJLN;
381
382 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0xACEG, vk04));
383 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2xACEG, vk04));
384 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4xACEG, vk04));
385
386 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1xACEG, vk14));
387 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi3xACEG, vk14));
388 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5xACEG, vk14));
389
390 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2xACEG, vk24));
391 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4xACEG, vk24));
392 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6xACEG, vk24));
393
394 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3xACEG, vk34));
395 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5xACEG, vk34));
396 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi7xACEG, vk34));
397
398 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4xACEG, vk44));
399 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi6xACEG, vk44));
400 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8xACEG, vk44));
401
402 vo0p0 = wasm_f32x4_add(vo0p0, vo0p1);
403 vo1p0 = wasm_f32x4_add(vo1p0, vo1p1);
404 vo2p0 = wasm_f32x4_add(vo2p0, vo2p1);
405
406 v128_t vo0 = wasm_v128_bitselect(vmin, vo0p0, wasm_f32x4_lt(vo0p0, vmin));
407 v128_t vo1 = wasm_v128_bitselect(vmin, vo1p0, wasm_f32x4_lt(vo1p0, vmin));
408 v128_t vo2 = wasm_v128_bitselect(vmin, vo2p0, wasm_f32x4_lt(vo2p0, vmin));
409 vo0 = wasm_v128_bitselect(vo0, vmax, wasm_f32x4_le(vo0, vmax));
410 vo1 = wasm_v128_bitselect(vo1, vmax, wasm_f32x4_le(vo1, vmax));
411 vo2 = wasm_v128_bitselect(vo2, vmax, wasm_f32x4_le(vo2, vmax));
412
413 wasm_v128_store(o2, vo2); o2 += 4;
414 wasm_v128_store(o1, vo1); o1 += 4;
415 wasm_v128_store(o0, vo0); o0 += 4;
416 }
417 // Last block has 1-8 pixels to process.
418 assert(w <= 8 * sizeof(float));
419 assert(w >= 1 * sizeof(float));
420 {
421 v128_t vo0p0 = vbias;
422 v128_t vo1p0 = vbias;
423 v128_t vo2p0 = vbias;
424
425 vi0x8ACE = wasm_v128_and(vmask_even, vi0x8ACE);
426 vi1x8ACE = wasm_v128_and(vmask_even, vi1x8ACE);
427 vi2x8ACE = wasm_v128_and(vmask_even, vi2x8ACE);
428 vi3x8ACE = wasm_v128_and(vmask_even, vi3x8ACE);
429 vi4x8ACE = wasm_v128_and(vmask_even, vi4x8ACE);
430 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE);
431 vi6x8ACE = wasm_v128_and(vmask_even, vi6x8ACE);
432 vi7x8ACE = wasm_v128_and(vmask_even, vi7x8ACE);
433 vi8x8ACE = wasm_v128_and(vmask_even, vi8x8ACE);
434
435 vi0x9BDF = wasm_v128_and(vmask_odd, vi0x9BDF);
436 vi1x9BDF = wasm_v128_and(vmask_odd, vi1x9BDF);
437 vi2x9BDF = wasm_v128_and(vmask_odd, vi2x9BDF);
438 vi3x9BDF = wasm_v128_and(vmask_odd, vi3x9BDF);
439 vi4x9BDF = wasm_v128_and(vmask_odd, vi4x9BDF);
440 vi5x9BDF = wasm_v128_and(vmask_odd, vi5x9BDF);
441 vi6x9BDF = wasm_v128_and(vmask_odd, vi6x9BDF);
442 vi7x9BDF = wasm_v128_and(vmask_odd, vi7x9BDF);
443 vi8x9BDF = wasm_v128_and(vmask_odd, vi8x9BDF);
444
445 v128_t vo0p1 = wasm_f32x4_mul(vi0x8ACE, vk02);
446 v128_t vo1p1 = wasm_f32x4_mul(vi2x8ACE, vk02);
447 v128_t vo2p1 = wasm_f32x4_mul(vi4x8ACE, vk02);
448
449 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x8ACE, vk12));
450 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x8ACE, vk12));
451 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, vk12));
452
453 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, vk22));
454 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, vk22));
455 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x8ACE, vk22));
456
457 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x8ACE, vk32));
458 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, vk32));
459 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi7x8ACE, vk32));
460
461 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x8ACE, vk42));
462 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi6x8ACE, vk42));
463 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8x8ACE, vk42));
464
465 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x9BDF, vk03));
466 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x9BDF, vk03));
467 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x9BDF, vk03));
468
469 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x9BDF, vk13));
470 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x9BDF, vk13));
471 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x9BDF, vk13));
472
473 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x9BDF, vk23));
474 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x9BDF, vk23));
475 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x9BDF, vk23));
476
477 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x9BDF, vk33));
478 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x9BDF, vk33));
479 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi7x9BDF, vk33));
480
481 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x9BDF, vk43));
482 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi6x9BDF, vk43));
483 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi8x9BDF, vk43));
484
485 const v128_t vi0x68AC = wasm_v32x4_shuffle(vi0x0246, vi0x8ACE, 3, 4, 5, 6);
486 const v128_t vi1x68AC = wasm_v32x4_shuffle(vi1x0246, vi1x8ACE, 3, 4, 5, 6);
487 const v128_t vi2x68AC = wasm_v32x4_shuffle(vi2x0246, vi2x8ACE, 3, 4, 5, 6);
488 const v128_t vi3x68AC = wasm_v32x4_shuffle(vi3x0246, vi3x8ACE, 3, 4, 5, 6);
489 const v128_t vi4x68AC = wasm_v32x4_shuffle(vi4x0246, vi4x8ACE, 3, 4, 5, 6);
490 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6);
491 const v128_t vi6x68AC = wasm_v32x4_shuffle(vi6x0246, vi6x8ACE, 3, 4, 5, 6);
492 const v128_t vi7x68AC = wasm_v32x4_shuffle(vi7x0246, vi7x8ACE, 3, 4, 5, 6);
493 const v128_t vi8x68AC = wasm_v32x4_shuffle(vi8x0246, vi8x8ACE, 3, 4, 5, 6);
494
495 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x68AC, vk00));
496 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x68AC, vk00));
497 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x68AC, vk00));
498
499 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x68AC, vk10));
500 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi3x68AC, vk10));
501 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x68AC, vk10));
502
503 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x68AC, vk20));
504 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x68AC, vk20));
505 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x68AC, vk20));
506
507 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x68AC, vk30));
508 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x68AC, vk30));
509 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi7x68AC, vk30));
510
511 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x68AC, vk40));
512 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi6x68AC, vk40));
513 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8x68AC, vk40));
514
515 const v128_t vi0x79BD = wasm_v32x4_shuffle(vi0x1357, vi0x9BDF, 3, 4, 5, 6);
516 const v128_t vi1x79BD = wasm_v32x4_shuffle(vi1x1357, vi1x9BDF, 3, 4, 5, 6);
517 const v128_t vi2x79BD = wasm_v32x4_shuffle(vi2x1357, vi2x9BDF, 3, 4, 5, 6);
518 const v128_t vi3x79BD = wasm_v32x4_shuffle(vi3x1357, vi3x9BDF, 3, 4, 5, 6);
519 const v128_t vi4x79BD = wasm_v32x4_shuffle(vi4x1357, vi4x9BDF, 3, 4, 5, 6);
520 const v128_t vi5x79BD = wasm_v32x4_shuffle(vi5x1357, vi5x9BDF, 3, 4, 5, 6);
521 const v128_t vi6x79BD = wasm_v32x4_shuffle(vi6x1357, vi6x9BDF, 3, 4, 5, 6);
522 const v128_t vi7x79BD = wasm_v32x4_shuffle(vi7x1357, vi7x9BDF, 3, 4, 5, 6);
523 const v128_t vi8x79BD = wasm_v32x4_shuffle(vi8x1357, vi8x9BDF, 3, 4, 5, 6);
524
525 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x79BD, vk01));
526 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x79BD, vk01));
527 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x79BD, vk01));
528
529 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x79BD, vk11));
530 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x79BD, vk11));
531 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x79BD, vk11));
532
533 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x79BD, vk21));
534 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x79BD, vk21));
535 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x79BD, vk21));
536
537 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x79BD, vk31));
538 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x79BD, vk31));
539 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi7x79BD, vk31));
540
541 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x79BD, vk41));
542 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi6x79BD, vk41));
543 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi8x79BD, vk41));
544
545 const v128_t vi0xACEG = wasm_v32x4_shuffle(vi0x8ACE, vzero, 1, 2, 3, 4);
546 const v128_t vi1xACEG = wasm_v32x4_shuffle(vi1x8ACE, vzero, 1, 2, 3, 4);
547 const v128_t vi2xACEG = wasm_v32x4_shuffle(vi2x8ACE, vzero, 1, 2, 3, 4);
548 const v128_t vi3xACEG = wasm_v32x4_shuffle(vi3x8ACE, vzero, 1, 2, 3, 4);
549 const v128_t vi4xACEG = wasm_v32x4_shuffle(vi4x8ACE, vzero, 1, 2, 3, 4);
550 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4);
551 const v128_t vi6xACEG = wasm_v32x4_shuffle(vi6x8ACE, vzero, 1, 2, 3, 4);
552 const v128_t vi7xACEG = wasm_v32x4_shuffle(vi7x8ACE, vzero, 1, 2, 3, 4);
553 const v128_t vi8xACEG = wasm_v32x4_shuffle(vi8x8ACE, vzero, 1, 2, 3, 4);
554
555 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0xACEG, vk04));
556 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2xACEG, vk04));
557 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4xACEG, vk04));
558
559 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1xACEG, vk14));
560 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi3xACEG, vk14));
561 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5xACEG, vk14));
562
563 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2xACEG, vk24));
564 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4xACEG, vk24));
565 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6xACEG, vk24));
566
567 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3xACEG, vk34));
568 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5xACEG, vk34));
569 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi7xACEG, vk34));
570
571 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4xACEG, vk44));
572 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi6xACEG, vk44));
573 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8xACEG, vk44));
574
575 vo0p0 = wasm_f32x4_add(vo0p0, vo0p1);
576 vo1p0 = wasm_f32x4_add(vo1p0, vo1p1);
577 vo2p0 = wasm_f32x4_add(vo2p0, vo2p1);
578
579 v128_t vo0 = wasm_v128_bitselect(vmin, vo0p0, wasm_f32x4_lt(vo0p0, vmin));
580 v128_t vo1 = wasm_v128_bitselect(vmin, vo1p0, wasm_f32x4_lt(vo1p0, vmin));
581 v128_t vo2 = wasm_v128_bitselect(vmin, vo2p0, wasm_f32x4_lt(vo2p0, vmin));
582 vo0 = wasm_v128_bitselect(vo0, vmax, wasm_f32x4_le(vo0, vmax));
583 vo1 = wasm_v128_bitselect(vo1, vmax, wasm_f32x4_le(vo1, vmax));
584 vo2 = wasm_v128_bitselect(vo2, vmax, wasm_f32x4_le(vo2, vmax));
585
586 size_t w_tmp = (w + 1 * sizeof(float)) / (2 * sizeof(float));
587 if XNN_LIKELY(w_tmp >= 4) {
588 wasm_v128_store(o2, vo2); o2 += 4;
589 wasm_v128_store(o1, vo1); o1 += 4;
590 wasm_v128_store(o0, vo0); o0 += 4;
591 } else {
592 if (w_tmp & 2) {
593 *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); o2 += 2;
594 *((double*) o1) = wasm_f64x2_extract_lane(vo1, 0); o1 += 2;
595 *((double*) o0) = wasm_f64x2_extract_lane(vo0, 0); o0 += 2;
596
597 vo0 = wasm_v32x4_shuffle(vo0, vo0, 2, 3, 0, 1);
598 vo1 = wasm_v32x4_shuffle(vo1, vo1, 2, 3, 0, 1);
599 vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1);
600 }
601 if (w_tmp & 1) {
602 *o2 = wasm_f32x4_extract_lane(vo2, 0); o2 += 1;
603 *o1 = wasm_f32x4_extract_lane(vo1, 0); o1 += 1;
604 *o0 = wasm_f32x4_extract_lane(vo0, 0); o0 += 1;
605 }
606 }
607 }
608
609 i0 = (const float*) ((uintptr_t) i6 - input_decrement);
610 i1 = (const float*) ((uintptr_t) i7 - input_decrement);
611 i2 = (const float*) ((uintptr_t) i8 - input_decrement);
612 i3 = (const float*) ((uintptr_t) i2 + input_width);
613 i4 = (const float*) ((uintptr_t) i3 + input_width);
614 i5 = (const float*) ((uintptr_t) i4 + input_width);
615 i6 = (const float*) ((uintptr_t) i5 + input_width);
616 i7 = (const float*) ((uintptr_t) i6 + input_width);
617 i8 = (const float*) ((uintptr_t) i7 + input_width);
618
619 o0 = o2;
620 o1 = (float*) ((uintptr_t) o0 + output_width);
621 o2 = (float*) ((uintptr_t) o1 + output_width);
622
623 output_height = doz(output_height, 3);
624 padded_input_height = doz(padded_input_height, 6);
625 } while (output_height != 0);
626}