// Auto-generated file. Do not edit!
//   Template: src/f32-dwconv2d-chw/3x3s2p1-sse.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <xmmintrin.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>


Marat Dukhanbf715f92020-10-23 20:17:00 -070018void xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3(
Erich Elseneda9c112020-05-11 04:40:25 -070019 size_t input_height,
20 size_t input_width,
XNNPACK Teamb455b122019-09-27 18:10:33 -070021 const float* input,
22 const float* weights,
Erich Elsen4e5db3d2020-05-07 08:57:47 -070023 const float* zero,
XNNPACK Teamb455b122019-09-27 18:10:33 -070024 float* output,
Erich Elsen4e5db3d2020-05-07 08:57:47 -070025 uint32_t padding_top,
Marat Dukhan1f29b802020-05-15 23:46:39 -070026 const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
XNNPACK Teamb455b122019-09-27 18:10:33 -070027{
Marat Dukhan75157772020-10-21 01:46:28 -070028 assert(input_height != 0);
Erich Elseneda9c112020-05-11 04:40:25 -070029 assert(input_width != 0);
Marat Dukhan75157772020-10-21 01:46:28 -070030 assert(input_width % sizeof(float) == 0);
Marat Dukhana690c092020-10-20 19:01:06 -070031 assert(padding_top >= 0);
32 assert(padding_top <= 1);
XNNPACK Teamb455b122019-09-27 18:10:33 -070033
34 const __m128 vmask_even = _mm_load_ps((const float*) params->sse.mask_even);
35 const __m128 vmask_odd = _mm_load_ps((const float*) params->sse.mask_odd);
36 const __m128 vmax = _mm_load_ps(params->sse.max);
37 const __m128 vmin = _mm_load_ps(params->sse.min);
38
XNNPACK Teamb455b122019-09-27 18:10:33 -070039 const __m128 vbias = _mm_load1_ps(weights);
40 const __m128 vk00 = _mm_load1_ps(weights + 1);
41 const __m128 vk01 = _mm_load1_ps(weights + 2);
42 const __m128 vk02 = _mm_load1_ps(weights + 3);
43 const __m128 vk10 = _mm_load1_ps(weights + 4);
44 const __m128 vk11 = _mm_load1_ps(weights + 5);
45 const __m128 vk12 = _mm_load1_ps(weights + 6);
46 const __m128 vk20 = _mm_load1_ps(weights + 7);
47 const __m128 vk21 = _mm_load1_ps(weights + 8);
48 const __m128 vk22 = _mm_load1_ps(weights + 9);
49
Marat Dukhan75157772020-10-21 01:46:28 -070050 const size_t input_decrement = round_down_po2(input_width, 4 /* SIMD output width */ * 2 /* subsampling */ * sizeof(float));
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -070051
Marat Dukhan75157772020-10-21 01:46:28 -070052 const float* i0 = (const float*) ((uintptr_t) input - ((-padding_top) & input_width));
53 const float* i1 = (const float*) ((uintptr_t) i0 + input_width);
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -070054 if XNN_UNPREDICTABLE(padding_top != 0) {
55 i0 = zero;
56 }
Marat Dukhan75157772020-10-21 01:46:28 -070057 const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -070058
Marat Dukhan0ff97182020-10-25 19:14:03 -070059 float* o0 = output;
60
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -070061 size_t padded_input_height = input_height + padding_top + 1 /* padding bottom */;
62 size_t output_height = (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2;
XNNPACK Teamb455b122019-09-27 18:10:33 -070063 do {
Marat Dukhan0ff97182020-10-25 19:14:03 -070064 if XNN_UNPREDICTABLE(padded_input_height < 4) {
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -070065 i2 = zero;
66 }
67
XNNPACK Teamb455b122019-09-27 18:10:33 -070068 __m128 vi0x7531 = _mm_setzero_ps();
69 __m128 vi1x7531 = _mm_setzero_ps();
70 __m128 vi2x7531 = _mm_setzero_ps();
71
Marat Dukhana690c092020-10-20 19:01:06 -070072 size_t w = input_width;
Marat Dukhan75157772020-10-21 01:46:28 -070073 for (; w >= 8 * sizeof(float); w -= 8 * sizeof(float)) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070074 const __m128 vi0x89AB = _mm_loadu_ps(i0);
Marat Dukhanbc967c72020-10-20 18:18:40 -070075 const __m128 vi0xCDEF = _mm_loadu_ps(i0 + 4);
76 i0 += 8;
Marat Dukhan0ff97182020-10-25 19:14:03 -070077 const __m128 vi1x89AB = _mm_loadu_ps(i1);
Marat Dukhanbc967c72020-10-20 18:18:40 -070078 const __m128 vi1xCDEF = _mm_loadu_ps(i1 + 4);
79 i1 += 8;
Marat Dukhan0ff97182020-10-25 19:14:03 -070080 const __m128 vi2x89AB = _mm_loadu_ps(i2);
Marat Dukhanbc967c72020-10-20 18:18:40 -070081 const __m128 vi2xCDEF = _mm_loadu_ps(i2 + 4);
82 i2 += 8;
XNNPACK Teamb455b122019-09-27 18:10:33 -070083
84 const __m128 vi0x8ACE = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
85 const __m128 vi0x9BDF = _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
86 const __m128 vi1x8ACE = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
87 const __m128 vi1x9BDF = _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
88 const __m128 vi2x8ACE = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
89 const __m128 vi2x9BDF = _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
90
Marat Dukhan0ff97182020-10-25 19:14:03 -070091 __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk01));
92 __m128 vo0p1 = _mm_mul_ps(vi1x8ACE, vk11);
93 __m128 vo0p2 = _mm_mul_ps(vi2x8ACE, vk21);
XNNPACK Teamb455b122019-09-27 18:10:33 -070094
95 const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
96 const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
97 const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
98
Marat Dukhan0ff97182020-10-25 19:14:03 -070099 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk02));
100 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x9BDF, vk12));
101 vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x9BDF, vk22));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700102
103 const __m128 vi0x7BDF = _mm_move_ss(vi0xF9BD, vi0x7531);
104 const __m128 vi1x7BDF = _mm_move_ss(vi1xF9BD, vi1x7531);
105 const __m128 vi2x7BDF = _mm_move_ss(vi2xF9BD, vi2x7531);
106
107 vi0x7531 = vi0xF9BD;
108 vi1x7531 = vi1xF9BD;
109 vi2x7531 = vi2xF9BD;
110
Marat Dukhan0ff97182020-10-25 19:14:03 -0700111 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x7BDF, vk00));
112 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x7BDF, vk10));
113 vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x7BDF, vk20));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700114
Marat Dukhan0ff97182020-10-25 19:14:03 -0700115 vo0p0 = _mm_add_ps(vo0p0, vo0p1);
116 vo0p0 = _mm_add_ps(vo0p0, vo0p2);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700117
Marat Dukhan0ff97182020-10-25 19:14:03 -0700118 __m128 vo0 = _mm_max_ps(vo0p0, vmin);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700119
Marat Dukhan0ff97182020-10-25 19:14:03 -0700120 vo0 = _mm_min_ps(vo0, vmax);
121
122 _mm_storeu_ps(o0, vo0);
123 o0 += 4;
XNNPACK Teamb455b122019-09-27 18:10:33 -0700124 }
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -0700125 // Potentially process the last block of 0..7 pixels.
Marat Dukhan75157772020-10-21 01:46:28 -0700126 assert(w < 8 * sizeof(float));
Marat Dukhana690c092020-10-20 19:01:06 -0700127 if XNN_LIKELY(w != 0) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700128 const __m128 vi0x89AB = _mm_loadu_ps(i0);
Marat Dukhanbc967c72020-10-20 18:18:40 -0700129 const __m128 vi0xCDEF = _mm_loadu_ps(i0 + 4);
Marat Dukhan0ff97182020-10-25 19:14:03 -0700130 const __m128 vi1x89AB = _mm_loadu_ps(i1);
Marat Dukhanbc967c72020-10-20 18:18:40 -0700131 const __m128 vi1xCDEF = _mm_loadu_ps(i1 + 4);
Marat Dukhan0ff97182020-10-25 19:14:03 -0700132 const __m128 vi2x89AB = _mm_loadu_ps(i2);
Marat Dukhanbc967c72020-10-20 18:18:40 -0700133 const __m128 vi2xCDEF = _mm_loadu_ps(i2 + 4);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700134
135 const __m128 vi0x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
136 const __m128 vi0x9BDF = _mm_and_ps(vmask_odd, _mm_shuffle_ps(vi0x89AB, vi0xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
137 const __m128 vi1x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
138 const __m128 vi1x9BDF = _mm_and_ps(vmask_odd, _mm_shuffle_ps(vi1x89AB, vi1xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
139 const __m128 vi2x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
140 const __m128 vi2x9BDF = _mm_and_ps(vmask_odd, _mm_shuffle_ps(vi2x89AB, vi2xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
141
Marat Dukhan0ff97182020-10-25 19:14:03 -0700142 __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x8ACE, vk01));
143 __m128 vo0p1 = _mm_mul_ps(vi1x8ACE, vk11);
144 __m128 vo0p2 = _mm_mul_ps(vi2x8ACE, vk21);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700145
146 const __m128 vi0xF9BD = _mm_shuffle_ps(vi0x9BDF, vi0x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
147 const __m128 vi1xF9BD = _mm_shuffle_ps(vi1x9BDF, vi1x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
148 const __m128 vi2xF9BD = _mm_shuffle_ps(vi2x9BDF, vi2x9BDF, _MM_SHUFFLE(2, 1, 0, 3));
149
Marat Dukhan0ff97182020-10-25 19:14:03 -0700150 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x9BDF, vk02));
151 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x9BDF, vk12));
152 vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x9BDF, vk22));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700153
154 const __m128 vi0x7BDF = _mm_move_ss(vi0xF9BD, vi0x7531);
155 const __m128 vi1x7BDF = _mm_move_ss(vi1xF9BD, vi1x7531);
156 const __m128 vi2x7BDF = _mm_move_ss(vi2xF9BD, vi2x7531);
157
Marat Dukhan0ff97182020-10-25 19:14:03 -0700158 vi0x7531 = vi0xF9BD;
159 vi1x7531 = vi1xF9BD;
160 vi2x7531 = vi2xF9BD;
XNNPACK Teamb455b122019-09-27 18:10:33 -0700161
Marat Dukhan0ff97182020-10-25 19:14:03 -0700162 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x7BDF, vk00));
163 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi1x7BDF, vk10));
164 vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x7BDF, vk20));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700165
Marat Dukhan0ff97182020-10-25 19:14:03 -0700166 vo0p0 = _mm_add_ps(vo0p0, vo0p1);
167 vo0p0 = _mm_add_ps(vo0p0, vo0p2);
168
169 __m128 vo0 = _mm_max_ps(vo0p0, vmin);
170
171 vo0 = _mm_min_ps(vo0, vmax);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700172
Marat Dukhan75157772020-10-21 01:46:28 -0700173 if (w == 7 * sizeof(float)) {
Marat Dukhan0ff97182020-10-25 19:14:03 -0700174 _mm_storeu_ps(o0, vo0);
175 o0 += 4;
XNNPACK Teamb455b122019-09-27 18:10:33 -0700176 } else {
Marat Dukhan75157772020-10-21 01:46:28 -0700177 w += 1 * sizeof(float);
178 if (w & (4 * sizeof(float))) {
Marat Dukhan0ff97182020-10-25 19:14:03 -0700179 _mm_storel_pi((__m64*) o0, vo0);
180 o0 += 2;
181
182 vo0 = _mm_movehl_ps(vo0, vo0);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700183 }
Marat Dukhan75157772020-10-21 01:46:28 -0700184 if (w & (2 * sizeof(float))) {
Marat Dukhan0ff97182020-10-25 19:14:03 -0700185 _mm_store_ss(o0, vo0);
186 o0 += 1;
XNNPACK Teamb455b122019-09-27 18:10:33 -0700187 }
188 }
189 }
190
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -0700191 i0 = (const float*) ((uintptr_t) i2 - input_decrement);
Marat Dukhan75157772020-10-21 01:46:28 -0700192 i1 = (const float*) ((uintptr_t) i0 + input_width);
193 i2 = (const float*) ((uintptr_t) i1 + input_width);
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -0700194
Marat Dukhan0ff97182020-10-25 19:14:03 -0700195
Marat Dukhan04d91c82020-10-20 18:35:44 -0700196 output_height -= 1;
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -0700197 padded_input_height -= 2;
Marat Dukhan04d91c82020-10-20 18:35:44 -0700198 } while (output_height != 0);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700199}