// Auto-generated file. Do not edit!
//   Template: src/f32-dwconv/up-sse.c.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <xmmintrin.h>

#include <xnnpack/dwconv.h>

17void xnn_f32_dwconv_ukernel_up8x9__sse(
18 size_t channels,
19 size_t output_width,
20 const float** input,
21 const float* weights,
22 float* output,
23 size_t input_stride,
24 size_t output_increment,
Marat Dukhaneb09a6b2020-04-08 17:34:32 -070025 const union xnn_f32_minmax_params params[restrict static 1])
Marat Dukhan5098c3e2019-11-07 12:01:19 -080026{
27 assert(channels != 0);
28 assert(output_width != 0);
29
30 const __m128 vmax = _mm_load_ps(params->sse.max);
31 const __m128 vmin = _mm_load_ps(params->sse.min);
32 do {
33 const float* i0 = input[0];
Marat Dukhan68660992020-02-03 13:31:12 -080034 assert(i0 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080035 const float* i1 = input[1];
Marat Dukhan68660992020-02-03 13:31:12 -080036 assert(i1 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080037 const float* i2 = input[2];
Marat Dukhan68660992020-02-03 13:31:12 -080038 assert(i2 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080039 const float* i3 = input[3];
Marat Dukhan68660992020-02-03 13:31:12 -080040 assert(i3 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080041 const float* i4 = input[4];
Marat Dukhan68660992020-02-03 13:31:12 -080042 assert(i4 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080043 const float* i5 = input[5];
Marat Dukhan68660992020-02-03 13:31:12 -080044 assert(i5 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080045 const float* i6 = input[6];
Marat Dukhan68660992020-02-03 13:31:12 -080046 assert(i6 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080047 const float* i7 = input[7];
Marat Dukhan68660992020-02-03 13:31:12 -080048 assert(i7 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080049 const float* i8 = input[8];
Marat Dukhan68660992020-02-03 13:31:12 -080050 assert(i8 != NULL);
Marat Dukhan5098c3e2019-11-07 12:01:19 -080051 input = (const float**) ((uintptr_t) input + input_stride);
52
53 size_t c = channels;
54 const float* w = weights;
55 for (; c >= 8; c -= 8) {
56 __m128 vacc0123p0 = _mm_load_ps(w);
57 __m128 vacc4567p0 = _mm_load_ps(w + 4);
58
59
60 const __m128 vi0x0123 = _mm_loadu_ps(i0);
61 const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
62 i0 += 8;
63
64 const __m128 vk0x0123 = _mm_load_ps(w + 8);
65 const __m128 vk0x4567 = _mm_load_ps(w + 12);
66 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
67 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));
68
69 const __m128 vi1x0123 = _mm_loadu_ps(i1);
70 const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
71 i1 += 8;
72
73 const __m128 vk1x0123 = _mm_load_ps(w + 16);
74 const __m128 vk1x4567 = _mm_load_ps(w + 20);
75 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
76 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi1x4567, vk1x4567));
77
78 const __m128 vi2x0123 = _mm_loadu_ps(i2);
79 const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
80 i2 += 8;
81
82 const __m128 vk2x0123 = _mm_load_ps(w + 24);
83 const __m128 vk2x4567 = _mm_load_ps(w + 28);
84 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
85 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));
86
87 const __m128 vi3x0123 = _mm_loadu_ps(i3);
88 const __m128 vi3x4567 = _mm_loadu_ps(i3 + 4);
89 i3 += 8;
90
91 const __m128 vk3x0123 = _mm_load_ps(w + 32);
92 const __m128 vk3x4567 = _mm_load_ps(w + 36);
93 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
94 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi3x4567, vk3x4567));
95
96 const __m128 vi4x0123 = _mm_loadu_ps(i4);
97 const __m128 vi4x4567 = _mm_loadu_ps(i4 + 4);
98 i4 += 8;
99
100 const __m128 vk4x0123 = _mm_load_ps(w + 40);
101 const __m128 vk4x4567 = _mm_load_ps(w + 44);
102 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
103 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi4x4567, vk4x4567));
104
105 const __m128 vi5x0123 = _mm_loadu_ps(i5);
106 const __m128 vi5x4567 = _mm_loadu_ps(i5 + 4);
107 i5 += 8;
108
109 const __m128 vk5x0123 = _mm_load_ps(w + 48);
110 const __m128 vk5x4567 = _mm_load_ps(w + 52);
111 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
112 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi5x4567, vk5x4567));
113
114 const __m128 vi6x0123 = _mm_loadu_ps(i6);
115 const __m128 vi6x4567 = _mm_loadu_ps(i6 + 4);
116 i6 += 8;
117
118 const __m128 vk6x0123 = _mm_load_ps(w + 56);
119 const __m128 vk6x4567 = _mm_load_ps(w + 60);
120 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
121 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi6x4567, vk6x4567));
122
123 const __m128 vi7x0123 = _mm_loadu_ps(i7);
124 const __m128 vi7x4567 = _mm_loadu_ps(i7 + 4);
125 i7 += 8;
126
127 const __m128 vk7x0123 = _mm_load_ps(w + 64);
128 const __m128 vk7x4567 = _mm_load_ps(w + 68);
129 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
130 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi7x4567, vk7x4567));
131
132 const __m128 vi8x0123 = _mm_loadu_ps(i8);
133 const __m128 vi8x4567 = _mm_loadu_ps(i8 + 4);
134 i8 += 8;
135
136 const __m128 vk8x0123 = _mm_load_ps(w + 72);
137 const __m128 vk8x4567 = _mm_load_ps(w + 76);
138 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
139 vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi8x4567, vk8x4567));
140
141 w += 80;
142
143
144 __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
145 __m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
146 vacc0123 = _mm_min_ps(vacc0123, vmax);
147 vacc4567 = _mm_min_ps(vacc4567, vmax);
148
149 _mm_storeu_ps(output, vacc0123);
150 _mm_storeu_ps(output + 4, vacc4567);
151 output += 8;
152 }
153 for (; c >= 4; c -= 4) {
154 __m128 vacc0123p0 = _mm_load_ps(w);
155
156 const __m128 vi0x0123 = _mm_loadu_ps(i0);
157 i0 += 4;
158
159 const __m128 vk0x0123 = _mm_load_ps(w + 8);
160 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
161
162 const __m128 vi1x0123 = _mm_loadu_ps(i1);
163 i1 += 4;
164
165 const __m128 vk1x0123 = _mm_load_ps(w + 16);
166 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
167
168 const __m128 vi2x0123 = _mm_loadu_ps(i2);
169 i2 += 4;
170
171 const __m128 vk2x0123 = _mm_load_ps(w + 24);
172 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
173
174 const __m128 vi3x0123 = _mm_loadu_ps(i3);
175 i3 += 4;
176
177 const __m128 vk3x0123 = _mm_load_ps(w + 32);
178 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
179
180 const __m128 vi4x0123 = _mm_loadu_ps(i4);
181 i4 += 4;
182
183 const __m128 vk4x0123 = _mm_load_ps(w + 40);
184 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
185
186 const __m128 vi5x0123 = _mm_loadu_ps(i5);
187 i5 += 4;
188
189 const __m128 vk5x0123 = _mm_load_ps(w + 48);
190 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
191
192 const __m128 vi6x0123 = _mm_loadu_ps(i6);
193 i6 += 4;
194
195 const __m128 vk6x0123 = _mm_load_ps(w + 56);
196 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
197
198 const __m128 vi7x0123 = _mm_loadu_ps(i7);
199 i7 += 4;
200
201 const __m128 vk7x0123 = _mm_load_ps(w + 64);
202 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
203
204 const __m128 vi8x0123 = _mm_loadu_ps(i8);
205 i8 += 4;
206
207 const __m128 vk8x0123 = _mm_load_ps(w + 72);
208 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
209
210 w += 4;
211
212
213 __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
214 vacc0123 = _mm_min_ps(vacc0123, vmax);
215
216 _mm_storeu_ps(output, vacc0123);
217 output += 4;
218 }
219 if XNN_UNLIKELY(c != 0) {
220 __m128 vacc0123p0 = _mm_load_ps(w);
221
222 const __m128 vi0x0123 = _mm_loadu_ps(i0);
223 const __m128 vk0x0123 = _mm_load_ps(w + 8);
224 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
225
226 const __m128 vi1x0123 = _mm_loadu_ps(i1);
227 const __m128 vk1x0123 = _mm_load_ps(w + 16);
228 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
229
230 const __m128 vi2x0123 = _mm_loadu_ps(i2);
231 const __m128 vk2x0123 = _mm_load_ps(w + 24);
232 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
233
234 const __m128 vi3x0123 = _mm_loadu_ps(i3);
235 const __m128 vk3x0123 = _mm_load_ps(w + 32);
236 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
237
238 const __m128 vi4x0123 = _mm_loadu_ps(i4);
239 const __m128 vk4x0123 = _mm_load_ps(w + 40);
240 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
241
242 const __m128 vi5x0123 = _mm_loadu_ps(i5);
243 const __m128 vk5x0123 = _mm_load_ps(w + 48);
244 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
245
246 const __m128 vi6x0123 = _mm_loadu_ps(i6);
247 const __m128 vk6x0123 = _mm_load_ps(w + 56);
248 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
249
250 const __m128 vi7x0123 = _mm_loadu_ps(i7);
251 const __m128 vk7x0123 = _mm_load_ps(w + 64);
252 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
253
254 const __m128 vi8x0123 = _mm_loadu_ps(i8);
255 const __m128 vk8x0123 = _mm_load_ps(w + 72);
256 vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
257
258
259 __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
260 vacc0123 = _mm_min_ps(vacc0123, vmax);
261
262 if (c & 2) {
263 _mm_storel_pi((__m64*) output, vacc0123);
264 vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
265 output += 2;
266 }
267 if (c & 1) {
268 _mm_store_ss(output, vacc0123);
269 output += 1;
270 }
271 }
272
273 output = (float*) ((uintptr_t) output + output_increment);
274 } while (--output_width != 0);
275}