blob: b9f1cedfaff28be65a447322775d8b0d5f99af96 [file] [log] [blame]
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
6#include <assert.h>
7
8#include <psimd.h>
9
10#include <xnnpack/dwconv.h>
11#include <xnnpack/math.h>
12
13PSIMD_INTRINSIC psimd_f32 rotright_f32(psimd_f32 a) {
14 #if defined(__clang__)
15 return __builtin_shufflevector(a, a, 3, 0, 1, 2);
16 #else
17 return __builtin_shuffle(a, (psimd_s32){3, 0, 1, 2});
18 #endif // defined(__clang__)
19}
20
21PSIMD_INTRINSIC psimd_f32 movess_f32(psimd_f32 a, psimd_f32 b) {
22 #if defined(__clang__)
23 return __builtin_shufflevector(a, b, 4, 1, 2, 3);
24 #else
25 return __builtin_shuffle(a, b, (psimd_s32){4, 1, 2, 3});
26 #endif // defined(__clang__)
27}
28
29
Marat Dukhan6f469a52020-10-21 20:02:52 -070030void xnn_f32_dwconv_chw_ukernel_3x3s2p1__psimd_1x4_acc3(
Erich Elsenfd7a6e32020-06-11 12:04:44 -070031 size_t input_height,
32 size_t input_width,
33 const float* input,
34 const float* weights,
35 const float* zero,
36 float* output,
37 uint32_t padding_top,
Erich Elsenfd7a6e32020-06-11 12:04:44 -070038 const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
39{
Marat Dukhan75157772020-10-21 01:46:28 -070040 assert(input_height != 0);
Erich Elsenfd7a6e32020-06-11 12:04:44 -070041 assert(input_width != 0);
Marat Dukhan75157772020-10-21 01:46:28 -070042 assert(input_width % sizeof(float) == 0);
Marat Dukhana690c092020-10-20 19:01:06 -070043 assert(padding_top >= 0);
44 assert(padding_top <= 1);
Erich Elsenfd7a6e32020-06-11 12:04:44 -070045
Erich Elsenfd7a6e32020-06-11 12:04:44 -070046 const psimd_s32 vmask_even = psimd_load_s32(params->scalar.mask_even);
47 const psimd_s32 vmask_odd = psimd_load_s32(params->scalar.mask_odd);
48 const psimd_f32 vmax = psimd_load_splat_f32(&params->scalar.max);
49 const psimd_f32 vmin = psimd_load_splat_f32(&params->scalar.min);
50
Erich Elsenfd7a6e32020-06-11 12:04:44 -070051 const psimd_f32 vbias = psimd_load_splat_f32(weights);
52 const psimd_f32 vk00 = psimd_load_splat_f32(weights + 1);
53 const psimd_f32 vk01 = psimd_load_splat_f32(weights + 2);
54 const psimd_f32 vk02 = psimd_load_splat_f32(weights + 3);
55 const psimd_f32 vk10 = psimd_load_splat_f32(weights + 4);
56 const psimd_f32 vk11 = psimd_load_splat_f32(weights + 5);
57 const psimd_f32 vk12 = psimd_load_splat_f32(weights + 6);
58 const psimd_f32 vk20 = psimd_load_splat_f32(weights + 7);
59 const psimd_f32 vk21 = psimd_load_splat_f32(weights + 8);
60 const psimd_f32 vk22 = psimd_load_splat_f32(weights + 9);
61
Marat Dukhan75157772020-10-21 01:46:28 -070062 const size_t input_decrement = round_down_po2(input_width, 4 /* SIMD output width */ * 2 /* subsampling */ * sizeof(float));
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -070063
Marat Dukhan75157772020-10-21 01:46:28 -070064 const float* i0 = (const float*) ((uintptr_t) input - ((-padding_top) & input_width));
65 const float* i1 = (const float*) ((uintptr_t) i0 + input_width);
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -070066 if XNN_UNPREDICTABLE(padding_top != 0) {
67 i0 = zero;
68 }
Marat Dukhan75157772020-10-21 01:46:28 -070069 const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -070070
71 size_t padded_input_height = input_height + padding_top + 1 /* padding bottom */;
72 size_t output_height = (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2;
Erich Elsenfd7a6e32020-06-11 12:04:44 -070073 do {
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -070074 if XNN_UNPREDICTABLE(padded_input_height <= 3) {
75 i2 = zero;
76 }
77
Erich Elsenfd7a6e32020-06-11 12:04:44 -070078 psimd_f32 vi0x7531 = psimd_zero_f32();
79 psimd_f32 vi1x7531 = psimd_zero_f32();
80 psimd_f32 vi2x7531 = psimd_zero_f32();
81
Marat Dukhana690c092020-10-20 19:01:06 -070082 size_t w = input_width;
Marat Dukhan75157772020-10-21 01:46:28 -070083 for (; w >= 8 * sizeof(float); w -= 8 * sizeof(float)) {
Erich Elsenfd7a6e32020-06-11 12:04:44 -070084 psimd_f32 vo8ACEp0 = vbias;
85
86 const psimd_f32 vi0x89AB = psimd_load_f32(i0);
Erich Elsenfd7a6e32020-06-11 12:04:44 -070087 const psimd_f32 vi1x89AB = psimd_load_f32(i1);
Erich Elsenfd7a6e32020-06-11 12:04:44 -070088 const psimd_f32 vi2x89AB = psimd_load_f32(i2);
Erich Elsenfd7a6e32020-06-11 12:04:44 -070089
Marat Dukhanbc967c72020-10-20 18:18:40 -070090 const psimd_f32 vi0xCDEF = psimd_load_f32(i0 + 4);
91 i0 += 8;
92 const psimd_f32 vi1xCDEF = psimd_load_f32(i1 + 4);
93 i1 += 8;
94 const psimd_f32 vi2xCDEF = psimd_load_f32(i2 + 4);
95 i2 += 8;
Erich Elsenfd7a6e32020-06-11 12:04:44 -070096
97 const psimd_f32 vi0x8ACE = psimd_concat_even_f32(vi0x89AB, vi0xCDEF);
98 const psimd_f32 vi0x9BDF = psimd_concat_odd_f32(vi0x89AB, vi0xCDEF);
99 const psimd_f32 vi1x8ACE = psimd_concat_even_f32(vi1x89AB, vi1xCDEF);
100 const psimd_f32 vi1x9BDF = psimd_concat_odd_f32(vi1x89AB, vi1xCDEF);
101 const psimd_f32 vi2x8ACE = psimd_concat_even_f32(vi2x89AB, vi2xCDEF);
102 const psimd_f32 vi2x9BDF = psimd_concat_odd_f32(vi2x89AB, vi2xCDEF);
103
104 vo8ACEp0 = psimd_add_f32(vo8ACEp0, psimd_mul_f32(vi0x8ACE, vk01));
105 psimd_f32 vo8ACEp1 = psimd_mul_f32(vi1x8ACE, vk11);
106 psimd_f32 vo8ACEp2 = psimd_mul_f32(vi2x8ACE, vk21);
107
108 const psimd_f32 vi0xF9BD = rotright_f32(vi0x9BDF);
109 const psimd_f32 vi1xF9BD = rotright_f32(vi1x9BDF);
110 const psimd_f32 vi2xF9BD = rotright_f32(vi2x9BDF);
111
112 vo8ACEp0 = psimd_add_f32(vo8ACEp0, psimd_mul_f32(vi0x9BDF, vk02));
113 vo8ACEp1 = psimd_add_f32(vo8ACEp1, psimd_mul_f32(vi1x9BDF, vk12));
114 vo8ACEp2 = psimd_add_f32(vo8ACEp2, psimd_mul_f32(vi2x9BDF, vk22));
115
116 const psimd_f32 vi0x7BDF = movess_f32(vi0xF9BD, vi0x7531);
117 const psimd_f32 vi1x7BDF = movess_f32(vi1xF9BD, vi1x7531);
118 const psimd_f32 vi2x7BDF = movess_f32(vi2xF9BD, vi2x7531);
119
120 vi0x7531 = vi0xF9BD;
121 vi1x7531 = vi1xF9BD;
122 vi2x7531 = vi2xF9BD;
123
124 vo8ACEp0 = psimd_add_f32(vo8ACEp0, psimd_mul_f32(vi0x7BDF, vk00));
125 vo8ACEp1 = psimd_add_f32(vo8ACEp1, psimd_mul_f32(vi1x7BDF, vk10));
126 vo8ACEp2 = psimd_add_f32(vo8ACEp2, psimd_mul_f32(vi2x7BDF, vk20));
127
128 psimd_f32 vo = psimd_add_f32(vo8ACEp0, vo8ACEp1);
129 vo = psimd_add_f32(vo, vo8ACEp2);
130
131 vo = psimd_max_f32(vo, vmin);
132 vo = psimd_min_f32(vo, vmax);
133
134 psimd_store_f32(output, vo);
Marat Dukhanbc967c72020-10-20 18:18:40 -0700135 output += 4;
Erich Elsenfd7a6e32020-06-11 12:04:44 -0700136 }
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -0700137 // Potentially process the last block of 0..7 pixels.
Marat Dukhan75157772020-10-21 01:46:28 -0700138 assert(w < 8 * sizeof(float));
Marat Dukhana690c092020-10-20 19:01:06 -0700139 if XNN_LIKELY(w != 0) {
Erich Elsenfd7a6e32020-06-11 12:04:44 -0700140 psimd_f32 vo8ACEp0 = vbias;
141
142 const psimd_f32 vi0x89AB = psimd_load_f32(i0);
143 const psimd_f32 vi1x89AB = psimd_load_f32(i1);
144 const psimd_f32 vi2x89AB = psimd_load_f32(i2);
145
Marat Dukhanbc967c72020-10-20 18:18:40 -0700146 const psimd_f32 vi0xCDEF = psimd_load_f32(i0 + 4);
147 const psimd_f32 vi1xCDEF = psimd_load_f32(i1 + 4);
148 const psimd_f32 vi2xCDEF = psimd_load_f32(i2 + 4);
Erich Elsenfd7a6e32020-06-11 12:04:44 -0700149
150 const psimd_f32 vi0x8ACE = psimd_andmask_f32(vmask_even, psimd_concat_even_f32(vi0x89AB, vi0xCDEF));
151 const psimd_f32 vi0x9BDF = psimd_andmask_f32(vmask_odd, psimd_concat_odd_f32(vi0x89AB, vi0xCDEF));
152 const psimd_f32 vi1x8ACE = psimd_andmask_f32(vmask_even, psimd_concat_even_f32(vi1x89AB, vi1xCDEF));
153 const psimd_f32 vi1x9BDF = psimd_andmask_f32(vmask_odd, psimd_concat_odd_f32(vi1x89AB, vi1xCDEF));
154 const psimd_f32 vi2x8ACE = psimd_andmask_f32(vmask_even, psimd_concat_even_f32(vi2x89AB, vi2xCDEF));
155 const psimd_f32 vi2x9BDF = psimd_andmask_f32(vmask_odd, psimd_concat_odd_f32(vi2x89AB, vi2xCDEF));
156
157 vo8ACEp0 = psimd_add_f32(vo8ACEp0, psimd_mul_f32(vi0x8ACE, vk01));
158 psimd_f32 vo8ACEp1 = psimd_mul_f32(vi1x8ACE, vk11);
159 psimd_f32 vo8ACEp2 = psimd_mul_f32(vi2x8ACE, vk21);
160
161 const psimd_f32 vi0xF9BD = rotright_f32(vi0x9BDF);
162 const psimd_f32 vi1xF9BD = rotright_f32(vi1x9BDF);
163 const psimd_f32 vi2xF9BD = rotright_f32(vi2x9BDF);
164
165 vo8ACEp0 = psimd_add_f32(vo8ACEp0, psimd_mul_f32(vi0x9BDF, vk02));
166 vo8ACEp1 = psimd_add_f32(vo8ACEp1, psimd_mul_f32(vi1x9BDF, vk12));
167 vo8ACEp2 = psimd_add_f32(vo8ACEp2, psimd_mul_f32(vi2x9BDF, vk22));
168
169 const psimd_f32 vi0x7BDF = movess_f32(vi0xF9BD, vi0x7531);
170 const psimd_f32 vi1x7BDF = movess_f32(vi1xF9BD, vi1x7531);
171 const psimd_f32 vi2x7BDF = movess_f32(vi2xF9BD, vi2x7531);
172
173 vo8ACEp0 = psimd_add_f32(vo8ACEp0, psimd_mul_f32(vi0x7BDF, vk00));
174 vo8ACEp1 = psimd_add_f32(vo8ACEp1, psimd_mul_f32(vi1x7BDF, vk10));
175 vo8ACEp2 = psimd_add_f32(vo8ACEp2, psimd_mul_f32(vi2x7BDF, vk20));
176
177 psimd_f32 vo = psimd_add_f32(vo8ACEp0, vo8ACEp1);
178 vo = psimd_add_f32(vo, vo8ACEp2);
179
180 vo = psimd_max_f32(vo, vmin);
181 vo = psimd_min_f32(vo, vmax);
182
Marat Dukhan75157772020-10-21 01:46:28 -0700183 if (w == 7 * sizeof(float)) {
Erich Elsenfd7a6e32020-06-11 12:04:44 -0700184 psimd_store_f32(output, vo);
Marat Dukhanc808c972020-10-20 19:39:41 -0700185 output += 4;
Erich Elsenfd7a6e32020-06-11 12:04:44 -0700186 } else {
Marat Dukhan75157772020-10-21 01:46:28 -0700187 w += 1 * sizeof(float);
188 if (w & (4 * sizeof(float))) {
Marat Dukhanc808c972020-10-20 19:39:41 -0700189 psimd_store2_f32(output, vo);
190 output += 2;
Erich Elsenfd7a6e32020-06-11 12:04:44 -0700191 vo = psimd_concat_hi_f32(vo, vo);
192 }
Marat Dukhan75157772020-10-21 01:46:28 -0700193 if (w & (2 * sizeof(float))) {
Marat Dukhanc808c972020-10-20 19:39:41 -0700194 psimd_store1_f32(output, vo);
195 output += 1;
Erich Elsenfd7a6e32020-06-11 12:04:44 -0700196 }
197 }
198 }
199
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -0700200 i0 = (const float*) ((uintptr_t) i2 - input_decrement);
Marat Dukhan75157772020-10-21 01:46:28 -0700201 i1 = (const float*) ((uintptr_t) i0 + input_width);
202 i2 = (const float*) ((uintptr_t) i1 + input_width);
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -0700203
Marat Dukhan04d91c82020-10-20 18:35:44 -0700204 output_height -= 1;
Marat Dukhan7ed0e3c2020-10-21 00:41:31 -0700205 padded_input_height -= 2;
Marat Dukhan04d91c82020-10-20 18:35:44 -0700206 } while (output_height != 0);
Erich Elsenfd7a6e32020-06-11 12:04:44 -0700207}