// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>


void xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma(
    size_t input_height,
    size_t input_width,
    const float* input,
    const float* weights,
    const float* zero,
    float* output,
    uint32_t padding_top,
    size_t input_tuple_stride,
    size_t output_tuple_stride,
    size_t input_width_stride,
    size_t output_width_stride,
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(input_width != 0);
  assert(input_height != 0);
  assert(padding_top >= 1 && padding_top <= 2);

  const size_t padded_input_height = input_height + padding_top + 2 /* padding_bottom */;
  const size_t output_height = (padded_input_height - 5) / 2 + 1;

  const uint32x4_t vmask_even = vld1q_u32(params->neon.mask_even);
  const uint32x4_t vmask_odd = vld1q_u32(params->neon.mask_odd);
  const float32x4_t vmax = vld1q_dup_f32(&params->neon.max);
  const float32x4_t vmin = vld1q_dup_f32(&params->neon.min);

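  // Per-row pointer adjustments: input_width_decrement_single undoes the per-tuple advances made
  // while traversing one input row; the increment constants then move the row pointers down one
  // or two rows, and output_width_increment_single moves the output pointer to the next output row.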
  const size_t input_width_decrement_single = input_tuple_stride * ((input_width - 1) / 4 + 1);
  const size_t input_width_increment_single = input_width_stride - input_width_decrement_single;
  const size_t input_width_increment_double = input_width_stride * 2 - input_width_decrement_single;
  const size_t output_width_increment_single = output_width_stride - (input_width + 1) / 8 * output_tuple_stride;

  const float* i0;
  const float* i1;
  const float* i2;
  const float* i3;
  const float* i4;

  if (padding_top == 1) {
    i0 = zero;
    i1 = input;
    i2 = (const float*) ((uintptr_t) i1 + input_width_stride);
    i3 = (const float*) ((uintptr_t) i2 + input_width_stride);
    i4 = (const float*) ((uintptr_t) i3 + input_width_stride);
    if (input_height <= 3) {
      i4 = zero;
    }
    if (input_height <= 2) {
      i3 = zero;
    }
    if (input_height == 1) {
      i2 = zero;
    }
  } else {
    i0 = zero;
    i1 = zero;
    i2 = input;
    i3 = (const float*) ((uintptr_t) i2 + input_width_stride);
    i4 = (const float*) ((uintptr_t) i3 + input_width_stride);
    if (input_height <= 2) {
      i4 = zero;
    }
    if (input_height == 1) {
      i3 = zero;
    }
  }

  float* output0 = output;

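  // weights[0] holds the bias (used to initialize the accumulator below); weights[1..25] hold the
  // 5x5 filter taps in row-major order, spread across vw0123..vwKLMN and the two-lane vwOP.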
  const float32x4_t vw0123 = vld1q_f32(weights);
  const float32x4_t vw4567 = vld1q_f32(weights + 4);
  const float32x4_t vw89AB = vld1q_f32(weights + 8);
  const float32x4_t vwCDEF = vld1q_f32(weights + 12);
  const float32x4_t vwGHIJ = vld1q_f32(weights + 16);
  const float32x4_t vwKLMN = vld1q_f32(weights + 20);
  const float32x2_t vwOP = vld1_f32(weights + 24);

  size_t m = output_height;
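  // Each iteration of the outer loop reads five input rows (i0..i4) and produces one output row.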
  do {
    float32x4_t vi0x0123 = vmovq_n_f32(0.0f);
    float32x4_t vi1x0123 = vmovq_n_f32(0.0f);
    float32x4_t vi2x0123 = vmovq_n_f32(0.0f);
    float32x4_t vi3x0123 = vmovq_n_f32(0.0f);
    float32x4_t vi4x0123 = vmovq_n_f32(0.0f);
    float32x4_t vi0x4567 = vld1q_f32(i0); i0 = (const float*) ((uintptr_t) i0 + input_tuple_stride);
    float32x4_t vi1x4567 = vld1q_f32(i1); i1 = (const float*) ((uintptr_t) i1 + input_tuple_stride);
    float32x4_t vi2x4567 = vld1q_f32(i2); i2 = (const float*) ((uintptr_t) i2 + input_tuple_stride);
    float32x4_t vi3x4567 = vld1q_f32(i3); i3 = (const float*) ((uintptr_t) i3 + input_tuple_stride);
    float32x4_t vi4x4567 = vld1q_f32(i4); i4 = (const float*) ((uintptr_t) i4 + input_tuple_stride);

    size_t k = input_width;
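    // Main loop: consume 8 input columns (two 4-element tuples) per iteration and produce 4 outputs.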
    for (; k > 8; k -= 8) {
      float32x4_t vo468Ap00 = vdupq_laneq_f32(vw0123, 0);

      float32x4_t vi0x89AB;
      float32x4_t vi1x89AB;
      float32x4_t vi2x89AB;
      float32x4_t vi3x89AB;
      float32x4_t vi4x89AB;

      vi0x89AB = vld1q_f32(i0); i0 = (const float*) ((uintptr_t) i0 + input_tuple_stride);
      vi1x89AB = vld1q_f32(i1); i1 = (const float*) ((uintptr_t) i1 + input_tuple_stride);
      vi2x89AB = vld1q_f32(i2); i2 = (const float*) ((uintptr_t) i2 + input_tuple_stride);
      vi3x89AB = vld1q_f32(i3); i3 = (const float*) ((uintptr_t) i3 + input_tuple_stride);
      vi4x89AB = vld1q_f32(i4); i4 = (const float*) ((uintptr_t) i4 + input_tuple_stride);

      float32x4_t vi0xCDEF;
      float32x4_t vi1xCDEF;
      float32x4_t vi2xCDEF;
      float32x4_t vi3xCDEF;
      float32x4_t vi4xCDEF;

      vi0xCDEF = vld1q_f32(i0); i0 = (const float*) ((uintptr_t) i0 + input_tuple_stride);
      vi1xCDEF = vld1q_f32(i1); i1 = (const float*) ((uintptr_t) i1 + input_tuple_stride);
      vi2xCDEF = vld1q_f32(i2); i2 = (const float*) ((uintptr_t) i2 + input_tuple_stride);
      vi3xCDEF = vld1q_f32(i3); i3 = (const float*) ((uintptr_t) i3 + input_tuple_stride);
      vi4xCDEF = vld1q_f32(i4); i4 = (const float*) ((uintptr_t) i4 + input_tuple_stride);

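      // Deinterleave columns 4..B into even (4,6,8,A) and odd (5,7,9,B) vectors.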
      float32x4_t vi0x468A = vuzp1q_f32(vi0x4567, vi0x89AB);
      float32x4_t vi0x579B = vuzp2q_f32(vi0x4567, vi0x89AB);
      float32x4_t vi1x468A = vuzp1q_f32(vi1x4567, vi1x89AB);
      float32x4_t vi1x579B = vuzp2q_f32(vi1x4567, vi1x89AB);
      float32x4_t vi2x468A = vuzp1q_f32(vi2x4567, vi2x89AB);
      float32x4_t vi2x579B = vuzp2q_f32(vi2x4567, vi2x89AB);
      float32x4_t vi3x468A = vuzp1q_f32(vi3x4567, vi3x89AB);
      float32x4_t vi3x579B = vuzp2q_f32(vi3x4567, vi3x89AB);
      float32x4_t vi4x468A = vuzp1q_f32(vi4x4567, vi4x89AB);
      float32x4_t vi4x579B = vuzp2q_f32(vi4x4567, vi4x89AB);

      // middle tap
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x468A, vw0123, 3);
      float32x4_t vo468Ap01 = vmulq_laneq_f32(vi1x468A, vw89AB, 0);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x468A, vwCDEF, 1);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x468A, vwGHIJ, 2);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi4x468A, vwKLMN, 3);

      // one left
      const float32x4_t vi0x3579 = vextq_f32(vi0x0123, vi0x579B, 3);
      const float32x4_t vi1x3579 = vextq_f32(vi1x0123, vi1x579B, 3);
      const float32x4_t vi2x3579 = vextq_f32(vi2x0123, vi2x579B, 3);
      const float32x4_t vi3x3579 = vextq_f32(vi3x0123, vi3x579B, 3);
      const float32x4_t vi4x3579 = vextq_f32(vi4x0123, vi4x579B, 3);

      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x3579, vw0123, 2);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi1x3579, vw4567, 3);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x3579, vwCDEF, 0);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x3579, vwGHIJ, 1);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi4x3579, vwKLMN, 2);

      // two left
      // Getting the vector for the far-left tap is annoying, as no ext of the vectors we
      // currently have produces it. To work around this, interpret the float32x4 vector as
      // uint64x2, shift left by 32, and reinterpret it as float32x4 again. The rightmost lanes
      // are then what the following ext needs.
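      // e.g. {x0, x1, x2, x3} becomes {0, x0, 0, x2}, and the ext below then yields {x2, x4, x6, x8}.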
      const float32x4_t vi0x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi0x0123), 32));
      const float32x4_t vi1x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi1x0123), 32));
      const float32x4_t vi2x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi2x0123), 32));
      const float32x4_t vi3x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi3x0123), 32));
      const float32x4_t vi4x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi4x0123), 32));

      const float32x4_t vi0x2468 = vextq_f32(vi0x0012, vi0x468A, 3);
      const float32x4_t vi1x2468 = vextq_f32(vi1x0012, vi1x468A, 3);
      const float32x4_t vi2x2468 = vextq_f32(vi2x0012, vi2x468A, 3);
      const float32x4_t vi3x2468 = vextq_f32(vi3x0012, vi3x468A, 3);
      const float32x4_t vi4x2468 = vextq_f32(vi4x0012, vi4x468A, 3);

      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x2468, vw0123, 1);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi1x2468, vw4567, 2);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x2468, vw89AB, 3);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x2468, vwGHIJ, 0);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi4x2468, vwKLMN, 1);

      vi0x0123 = vi0x89AB;
      vi1x0123 = vi1x89AB;
      vi2x0123 = vi2x89AB;
      vi3x0123 = vi3x89AB;
      vi4x0123 = vi4x89AB;

      // one right
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x579B, vw4567, 0);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi1x579B, vw89AB, 1);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x579B, vwCDEF, 2);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x579B, vwGHIJ, 3);
      vo468Ap00 = vfmaq_lane_f32( vo468Ap00, vi4x579B, vwOP, 0);

      // two right
      const float32x4_t vi0x68AC = vextq_f32(vi0x468A, vi0xCDEF, 1);
      const float32x4_t vi1x68AC = vextq_f32(vi1x468A, vi1xCDEF, 1);
      const float32x4_t vi2x68AC = vextq_f32(vi2x468A, vi2xCDEF, 1);
      const float32x4_t vi3x68AC = vextq_f32(vi3x468A, vi3xCDEF, 1);
      const float32x4_t vi4x68AC = vextq_f32(vi4x468A, vi4xCDEF, 1);

      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x68AC, vw4567, 1);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi1x68AC, vw89AB, 2);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x68AC, vwCDEF, 3);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x68AC, vwKLMN, 0);
      vo468Ap00 = vfmaq_lane_f32( vo468Ap00, vi4x68AC, vwOP, 1);

      vi0x4567 = vi0xCDEF;
      vi1x4567 = vi1xCDEF;
      vi2x4567 = vi2xCDEF;
      vi3x4567 = vi3xCDEF;
      vi4x4567 = vi4xCDEF;

      float32x4_t vo0 = vaddq_f32(vo468Ap00, vo468Ap01);

      vo0 = vmaxq_f32(vo0, vmin);
      vo0 = vminq_f32(vo0, vmax);

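      // Note: with k > 8 inside this loop, (k + 1) / 2 >= 5, so the full 4-element store is always taken here.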
      size_t k_tmp = (k + 1) / 2;
      if XNN_LIKELY(k_tmp >= 4) {
        vst1q_f32(output0, vo0);
        output0 = (float*) ((uintptr_t) output0 + output_tuple_stride);
      } else {
        float* output0_lo = output0;
        float32x2_t vo0_lo = vget_low_f32(vo0);
        if (k_tmp & 2) {
          vst1_f32(output0_lo, vo0_lo); output0_lo += 2;
          vo0_lo = vget_high_f32(vo0);
        }
        if (k_tmp & 1) {
          vst1_lane_f32(output0_lo, vo0_lo, 0);
        }
      }
    }

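    // Remainder: handle the last 1..8 input columns of the row.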
    {
      float32x4_t vo468Ap00 = vdupq_laneq_f32(vw0123, 0);

      float32x4_t vi0x89AB;
      float32x4_t vi1x89AB;
      float32x4_t vi2x89AB;
      float32x4_t vi3x89AB;
      float32x4_t vi4x89AB;

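      // Load the next tuple of columns only if it exists; otherwise substitute zeros.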
      if XNN_LIKELY(k > 4) {
        vi0x89AB = vld1q_f32(i0); i0 = (const float*) ((uintptr_t) i0 + input_tuple_stride);
        vi1x89AB = vld1q_f32(i1); i1 = (const float*) ((uintptr_t) i1 + input_tuple_stride);
        vi2x89AB = vld1q_f32(i2); i2 = (const float*) ((uintptr_t) i2 + input_tuple_stride);
        vi3x89AB = vld1q_f32(i3); i3 = (const float*) ((uintptr_t) i3 + input_tuple_stride);
        vi4x89AB = vld1q_f32(i4); i4 = (const float*) ((uintptr_t) i4 + input_tuple_stride);
      } else {
        vi0x89AB = vmovq_n_f32(0.f);
        vi1x89AB = vmovq_n_f32(0.f);
        vi2x89AB = vmovq_n_f32(0.f);
        vi3x89AB = vmovq_n_f32(0.f);
        vi4x89AB = vmovq_n_f32(0.f);
      }

      float32x4_t vi0xCDEF;
      float32x4_t vi1xCDEF;
      float32x4_t vi2xCDEF;
      float32x4_t vi3xCDEF;
      float32x4_t vi4xCDEF;

      if XNN_LIKELY(k > 8) {
        vi0xCDEF = vld1q_f32(i0); i0 = (const float*) ((uintptr_t) i0 + input_tuple_stride);
        vi1xCDEF = vld1q_f32(i1); i1 = (const float*) ((uintptr_t) i1 + input_tuple_stride);
        vi2xCDEF = vld1q_f32(i2); i2 = (const float*) ((uintptr_t) i2 + input_tuple_stride);
        vi3xCDEF = vld1q_f32(i3); i3 = (const float*) ((uintptr_t) i3 + input_tuple_stride);
        vi4xCDEF = vld1q_f32(i4); i4 = (const float*) ((uintptr_t) i4 + input_tuple_stride);
      } else {
        vi0xCDEF = vmovq_n_f32(0.f);
        vi1xCDEF = vmovq_n_f32(0.f);
        vi2xCDEF = vmovq_n_f32(0.f);
        vi3xCDEF = vmovq_n_f32(0.f);
        vi4xCDEF = vmovq_n_f32(0.f);
      }
      float32x4_t vi0x468A = vuzp1q_f32(vi0x4567, vi0x89AB);
      float32x4_t vi0x579B = vuzp2q_f32(vi0x4567, vi0x89AB);
      float32x4_t vi1x468A = vuzp1q_f32(vi1x4567, vi1x89AB);
      float32x4_t vi1x579B = vuzp2q_f32(vi1x4567, vi1x89AB);
      float32x4_t vi2x468A = vuzp1q_f32(vi2x4567, vi2x89AB);
      float32x4_t vi2x579B = vuzp2q_f32(vi2x4567, vi2x89AB);
      float32x4_t vi3x468A = vuzp1q_f32(vi3x4567, vi3x89AB);
      float32x4_t vi3x579B = vuzp2q_f32(vi3x4567, vi3x89AB);
      float32x4_t vi4x468A = vuzp1q_f32(vi4x4567, vi4x89AB);
      float32x4_t vi4x579B = vuzp2q_f32(vi4x4567, vi4x89AB);

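      // Mask off lanes that fall past the end of the row so they do not contribute to the outputs.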
      vi0x468A = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi0x468A)));
      vi1x468A = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi1x468A)));
      vi2x468A = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi2x468A)));
      vi3x468A = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi3x468A)));
      vi4x468A = vreinterpretq_f32_u32(vandq_u32(vmask_even, vreinterpretq_u32_f32(vi4x468A)));

      vi0x579B = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi0x579B)));
      vi1x579B = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi1x579B)));
      vi2x579B = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi2x579B)));
      vi3x579B = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi3x579B)));
      vi4x579B = vreinterpretq_f32_u32(vandq_u32(vmask_odd, vreinterpretq_u32_f32(vi4x579B)));

      // middle tap
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x468A, vw0123, 3);
      float32x4_t vo468Ap01 = vmulq_laneq_f32(vi1x468A, vw89AB, 0);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x468A, vwCDEF, 1);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x468A, vwGHIJ, 2);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi4x468A, vwKLMN, 3);

      // one left
      const float32x4_t vi0x3579 = vextq_f32(vi0x0123, vi0x579B, 3);
      const float32x4_t vi1x3579 = vextq_f32(vi1x0123, vi1x579B, 3);
      const float32x4_t vi2x3579 = vextq_f32(vi2x0123, vi2x579B, 3);
      const float32x4_t vi3x3579 = vextq_f32(vi3x0123, vi3x579B, 3);
      const float32x4_t vi4x3579 = vextq_f32(vi4x0123, vi4x579B, 3);

      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x3579, vw0123, 2);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi1x3579, vw4567, 3);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x3579, vwCDEF, 0);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x3579, vwGHIJ, 1);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi4x3579, vwKLMN, 2);

      // two left
      // Getting the vector for the far-left tap is annoying, as no ext of the vectors we
      // currently have produces it. To work around this, interpret the float32x4 vector as
      // uint64x2, shift left by 32, and reinterpret it as float32x4 again. The rightmost lanes
      // are then what the following ext needs.
      const float32x4_t vi0x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi0x0123), 32));
      const float32x4_t vi1x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi1x0123), 32));
      const float32x4_t vi2x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi2x0123), 32));
      const float32x4_t vi3x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi3x0123), 32));
      const float32x4_t vi4x0012 = vreinterpretq_f32_u64(vshlq_n_u64(vreinterpretq_u64_f32(vi4x0123), 32));

      const float32x4_t vi0x2468 = vextq_f32(vi0x0012, vi0x468A, 3);
      const float32x4_t vi1x2468 = vextq_f32(vi1x0012, vi1x468A, 3);
      const float32x4_t vi2x2468 = vextq_f32(vi2x0012, vi2x468A, 3);
      const float32x4_t vi3x2468 = vextq_f32(vi3x0012, vi3x468A, 3);
      const float32x4_t vi4x2468 = vextq_f32(vi4x0012, vi4x468A, 3);

      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x2468, vw0123, 1);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi1x2468, vw4567, 2);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x2468, vw89AB, 3);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x2468, vwGHIJ, 0);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi4x2468, vwKLMN, 1);

      vi0x0123 = vi0x89AB;
      vi1x0123 = vi1x89AB;
      vi2x0123 = vi2x89AB;
      vi3x0123 = vi3x89AB;
      vi4x0123 = vi4x89AB;

      // one right
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x579B, vw4567, 0);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi1x579B, vw89AB, 1);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x579B, vwCDEF, 2);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x579B, vwGHIJ, 3);
      vo468Ap00 = vfmaq_lane_f32( vo468Ap00, vi4x579B, vwOP, 0);

      // two right
      const float32x4_t vi0x68AC = vextq_f32(vi0x468A, vi0xCDEF, 1);
      const float32x4_t vi1x68AC = vextq_f32(vi1x468A, vi1xCDEF, 1);
      const float32x4_t vi2x68AC = vextq_f32(vi2x468A, vi2xCDEF, 1);
      const float32x4_t vi3x68AC = vextq_f32(vi3x468A, vi3xCDEF, 1);
      const float32x4_t vi4x68AC = vextq_f32(vi4x468A, vi4xCDEF, 1);

      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi0x68AC, vw4567, 1);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi1x68AC, vw89AB, 2);
      vo468Ap00 = vfmaq_laneq_f32(vo468Ap00, vi2x68AC, vwCDEF, 3);
      vo468Ap01 = vfmaq_laneq_f32(vo468Ap01, vi3x68AC, vwKLMN, 0);
      vo468Ap00 = vfmaq_lane_f32( vo468Ap00, vi4x68AC, vwOP, 1);

      vi0x4567 = vi0xCDEF;
      vi1x4567 = vi1xCDEF;
      vi2x4567 = vi2xCDEF;
      vi3x4567 = vi3xCDEF;
      vi4x4567 = vi4xCDEF;

      float32x4_t vo0 = vaddq_f32(vo468Ap00, vo468Ap01);

      vo0 = vmaxq_f32(vo0, vmin);
      vo0 = vminq_f32(vo0, vmax);

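      // Store the remaining (k + 1) / 2 outputs: a full tuple of 4 when possible, otherwise 2 and/or 1 trailing elements.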
      size_t k_tmp = (k + 1) / 2;
      if XNN_LIKELY(k_tmp >= 4) {
        vst1q_f32(output0, vo0);
        output0 = (float*) ((uintptr_t) output0 + output_tuple_stride);
      } else {
        float* output0_lo = output0;
        float32x2_t vo0_lo = vget_low_f32(vo0);
        if (k_tmp & 2) {
          vst1_f32(output0_lo, vo0_lo); output0_lo += 2;
          vo0_lo = vget_high_f32(vo0);
        }
        if (k_tmp & 1) {
          vst1_lane_f32(output0_lo, vo0_lo, 0);
        }
      }
    }

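    // Advance to the next output row: rewind i2 to the start of its row to become the new i0, and
    // move the remaining pointers down two input rows (the vertical stride).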
    i0 = (const float*) ((uintptr_t) i2 - input_width_decrement_single);
    i1 = (const float*) ((uintptr_t) i2 + input_width_increment_single);
    i2 = (const float*) ((uintptr_t) i2 + input_width_increment_double);
    i3 = (const float*) ((uintptr_t) i3 + input_width_increment_double);
    i4 = (const float*) ((uintptr_t) i4 + input_width_increment_double);
    output0 = (float*) ((uintptr_t) output0 + output_width_increment_single);
    m -= 1;
    if (m == 1) {
      i4 = zero;
      // This mimics the following logic:
      // if (padding_top == 2 && input_height % 2 == 1) {
      //   i3 = zero;
      // } else if (padding_top == 1 && input_height % 2 == 0) {
      //   i3 = zero;
      // }
      // i.e. padding_top - 1 == input_height % 2.
      if (padding_top - 1 == input_height % 2) {
        i3 = zero;
      }
    }
  } while (m > 0);
}