// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <psimd.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>

| 13 | PSIMD_INTRINSIC psimd_f32 rotright_f32(psimd_f32 a) { |
| 14 | #if defined(__clang__) |
| 15 | return __builtin_shufflevector(a, a, 3, 0, 1, 2); |
| 16 | #else |
| 17 | return __builtin_shuffle(a, (psimd_s32){3, 0, 1, 2}); |
| 18 | #endif // defined(__clang__) |
| 19 | } |
| 20 | |
| 21 | PSIMD_INTRINSIC psimd_f32 movess_f32(psimd_f32 a, psimd_f32 b) { |
| 22 | #if defined(__clang__) |
| 23 | return __builtin_shufflevector(a, b, 4, 1, 2, 3); |
| 24 | #else |
| 25 | return __builtin_shuffle(a, b, (psimd_s32){4, 1, 2, 3}); |
| 26 | #endif // defined(__clang__) |
| 27 | } |
| 28 | |
| 29 | |
Marat Dukhan | 6f469a5 | 2020-10-21 20:02:52 -0700 | [diff] [blame] | 30 | void xnn_f32_dwconv_chw_ukernel_3x3s2p1__psimd_1x4_acc3( |
Erich Elsen | fd7a6e3 | 2020-06-11 12:04:44 -0700 | [diff] [blame] | 31 | size_t input_height, |
| 32 | size_t input_width, |
| 33 | const float* input, |
| 34 | const float* weights, |
| 35 | const float* zero, |
| 36 | float* output, |
| 37 | uint32_t padding_top, |
Erich Elsen | fd7a6e3 | 2020-06-11 12:04:44 -0700 | [diff] [blame] | 38 | const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) |
| 39 | { |
Marat Dukhan | 7515777 | 2020-10-21 01:46:28 -0700 | [diff] [blame] | 40 | assert(input_height != 0); |
Erich Elsen | fd7a6e3 | 2020-06-11 12:04:44 -0700 | [diff] [blame] | 41 | assert(input_width != 0); |
Marat Dukhan | 7515777 | 2020-10-21 01:46:28 -0700 | [diff] [blame] | 42 | assert(input_width % sizeof(float) == 0); |
Marat Dukhan | a690c09 | 2020-10-20 19:01:06 -0700 | [diff] [blame] | 43 | assert(padding_top >= 0); |
| 44 | assert(padding_top <= 1); |
Erich Elsen | fd7a6e3 | 2020-06-11 12:04:44 -0700 | [diff] [blame] | 45 | |
Erich Elsen | fd7a6e3 | 2020-06-11 12:04:44 -0700 | [diff] [blame] | 46 | const psimd_s32 vmask_even = psimd_load_s32(params->scalar.mask_even); |
| 47 | const psimd_s32 vmask_odd = psimd_load_s32(params->scalar.mask_odd); |
| 48 | const psimd_f32 vmax = psimd_load_splat_f32(¶ms->scalar.max); |
| 49 | const psimd_f32 vmin = psimd_load_splat_f32(¶ms->scalar.min); |
| 50 | |
Erich Elsen | fd7a6e3 | 2020-06-11 12:04:44 -0700 | [diff] [blame] | 51 | const psimd_f32 vbias = psimd_load_splat_f32(weights); |
| 52 | const psimd_f32 vk00 = psimd_load_splat_f32(weights + 1); |
| 53 | const psimd_f32 vk01 = psimd_load_splat_f32(weights + 2); |
| 54 | const psimd_f32 vk02 = psimd_load_splat_f32(weights + 3); |
| 55 | const psimd_f32 vk10 = psimd_load_splat_f32(weights + 4); |
| 56 | const psimd_f32 vk11 = psimd_load_splat_f32(weights + 5); |
| 57 | const psimd_f32 vk12 = psimd_load_splat_f32(weights + 6); |
| 58 | const psimd_f32 vk20 = psimd_load_splat_f32(weights + 7); |
| 59 | const psimd_f32 vk21 = psimd_load_splat_f32(weights + 8); |
| 60 | const psimd_f32 vk22 = psimd_load_splat_f32(weights + 9); |
| 61 | |
Marat Dukhan | 7515777 | 2020-10-21 01:46:28 -0700 | [diff] [blame] | 62 | const size_t input_decrement = round_down_po2(input_width, 4 /* SIMD output width */ * 2 /* subsampling */ * sizeof(float)); |
Marat Dukhan | 7ed0e3c | 2020-10-21 00:41:31 -0700 | [diff] [blame] | 63 | |
Marat Dukhan | 7515777 | 2020-10-21 01:46:28 -0700 | [diff] [blame] | 64 | const float* i0 = (const float*) ((uintptr_t) input - ((-padding_top) & input_width)); |
| 65 | const float* i1 = (const float*) ((uintptr_t) i0 + input_width); |
Marat Dukhan | 7ed0e3c | 2020-10-21 00:41:31 -0700 | [diff] [blame] | 66 | if XNN_UNPREDICTABLE(padding_top != 0) { |
| 67 | i0 = zero; |
| 68 | } |
Marat Dukhan | 7515777 | 2020-10-21 01:46:28 -0700 | [diff] [blame] | 69 | const float* i2 = (const float*) ((uintptr_t) i1 + input_width); |
Marat Dukhan | 7ed0e3c | 2020-10-21 00:41:31 -0700 | [diff] [blame] | 70 | |
| 71 | size_t padded_input_height = input_height + padding_top + 1 /* padding bottom */; |
| 72 | size_t output_height = (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2; |
Erich Elsen | fd7a6e3 | 2020-06-11 12:04:44 -0700 | [diff] [blame] | 73 | do { |
Marat Dukhan | 7ed0e3c | 2020-10-21 00:41:31 -0700 | [diff] [blame] | 74 | if XNN_UNPREDICTABLE(padded_input_height <= 3) { |
| 75 | i2 = zero; |
| 76 | } |
| 77 | |
Erich Elsen | fd7a6e3 | 2020-06-11 12:04:44 -0700 | [diff] [blame] | 78 | psimd_f32 vi0x7531 = psimd_zero_f32(); |
| 79 | psimd_f32 vi1x7531 = psimd_zero_f32(); |
| 80 | psimd_f32 vi2x7531 = psimd_zero_f32(); |
| 81 | |
Marat Dukhan | a690c09 | 2020-10-20 19:01:06 -0700 | [diff] [blame] | 82 | size_t w = input_width; |
Marat Dukhan | 7515777 | 2020-10-21 01:46:28 -0700 | [diff] [blame] | 83 | for (; w >= 8 * sizeof(float); w -= 8 * sizeof(float)) { |
Erich Elsen | fd7a6e3 | 2020-06-11 12:04:44 -0700 | [diff] [blame] | 84 | psimd_f32 vo8ACEp0 = vbias; |
| 85 | |
| 86 | const psimd_f32 vi0x89AB = psimd_load_f32(i0); |
Erich Elsen | fd7a6e3 | 2020-06-11 12:04:44 -0700 | [diff] [blame] | 87 | const psimd_f32 vi1x89AB = psimd_load_f32(i1); |
Erich Elsen | fd7a6e3 | 2020-06-11 12:04:44 -0700 | [diff] [blame] | 88 | const psimd_f32 vi2x89AB = psimd_load_f32(i2); |
Erich Elsen | fd7a6e3 | 2020-06-11 12:04:44 -0700 | [diff] [blame] | 89 | |
Marat Dukhan | bc967c7 | 2020-10-20 18:18:40 -0700 | [diff] [blame] | 90 | const psimd_f32 vi0xCDEF = psimd_load_f32(i0 + 4); |
| 91 | i0 += 8; |
| 92 | const psimd_f32 vi1xCDEF = psimd_load_f32(i1 + 4); |
| 93 | i1 += 8; |
| 94 | const psimd_f32 vi2xCDEF = psimd_load_f32(i2 + 4); |
| 95 | i2 += 8; |
Erich Elsen | fd7a6e3 | 2020-06-11 12:04:44 -0700 | [diff] [blame] | 96 | |
| 97 | const psimd_f32 vi0x8ACE = psimd_concat_even_f32(vi0x89AB, vi0xCDEF); |
| 98 | const psimd_f32 vi0x9BDF = psimd_concat_odd_f32(vi0x89AB, vi0xCDEF); |
| 99 | const psimd_f32 vi1x8ACE = psimd_concat_even_f32(vi1x89AB, vi1xCDEF); |
| 100 | const psimd_f32 vi1x9BDF = psimd_concat_odd_f32(vi1x89AB, vi1xCDEF); |
| 101 | const psimd_f32 vi2x8ACE = psimd_concat_even_f32(vi2x89AB, vi2xCDEF); |
| 102 | const psimd_f32 vi2x9BDF = psimd_concat_odd_f32(vi2x89AB, vi2xCDEF); |
| 103 | |
| 104 | vo8ACEp0 = psimd_add_f32(vo8ACEp0, psimd_mul_f32(vi0x8ACE, vk01)); |
| 105 | psimd_f32 vo8ACEp1 = psimd_mul_f32(vi1x8ACE, vk11); |
| 106 | psimd_f32 vo8ACEp2 = psimd_mul_f32(vi2x8ACE, vk21); |
| 107 | |
| 108 | const psimd_f32 vi0xF9BD = rotright_f32(vi0x9BDF); |
| 109 | const psimd_f32 vi1xF9BD = rotright_f32(vi1x9BDF); |
| 110 | const psimd_f32 vi2xF9BD = rotright_f32(vi2x9BDF); |
| 111 | |
| 112 | vo8ACEp0 = psimd_add_f32(vo8ACEp0, psimd_mul_f32(vi0x9BDF, vk02)); |
| 113 | vo8ACEp1 = psimd_add_f32(vo8ACEp1, psimd_mul_f32(vi1x9BDF, vk12)); |
| 114 | vo8ACEp2 = psimd_add_f32(vo8ACEp2, psimd_mul_f32(vi2x9BDF, vk22)); |
| 115 | |
| 116 | const psimd_f32 vi0x7BDF = movess_f32(vi0xF9BD, vi0x7531); |
| 117 | const psimd_f32 vi1x7BDF = movess_f32(vi1xF9BD, vi1x7531); |
| 118 | const psimd_f32 vi2x7BDF = movess_f32(vi2xF9BD, vi2x7531); |
| 119 | |
| 120 | vi0x7531 = vi0xF9BD; |
| 121 | vi1x7531 = vi1xF9BD; |
| 122 | vi2x7531 = vi2xF9BD; |
| 123 | |
| 124 | vo8ACEp0 = psimd_add_f32(vo8ACEp0, psimd_mul_f32(vi0x7BDF, vk00)); |
| 125 | vo8ACEp1 = psimd_add_f32(vo8ACEp1, psimd_mul_f32(vi1x7BDF, vk10)); |
| 126 | vo8ACEp2 = psimd_add_f32(vo8ACEp2, psimd_mul_f32(vi2x7BDF, vk20)); |
| 127 | |
| 128 | psimd_f32 vo = psimd_add_f32(vo8ACEp0, vo8ACEp1); |
| 129 | vo = psimd_add_f32(vo, vo8ACEp2); |
| 130 | |
| 131 | vo = psimd_max_f32(vo, vmin); |
| 132 | vo = psimd_min_f32(vo, vmax); |
| 133 | |
| 134 | psimd_store_f32(output, vo); |
Marat Dukhan | bc967c7 | 2020-10-20 18:18:40 -0700 | [diff] [blame] | 135 | output += 4; |
Erich Elsen | fd7a6e3 | 2020-06-11 12:04:44 -0700 | [diff] [blame] | 136 | } |
Marat Dukhan | 7ed0e3c | 2020-10-21 00:41:31 -0700 | [diff] [blame] | 137 | // Potentially process the last block of 0..7 pixels. |
Marat Dukhan | 7515777 | 2020-10-21 01:46:28 -0700 | [diff] [blame] | 138 | assert(w < 8 * sizeof(float)); |
Marat Dukhan | a690c09 | 2020-10-20 19:01:06 -0700 | [diff] [blame] | 139 | if XNN_LIKELY(w != 0) { |
Erich Elsen | fd7a6e3 | 2020-06-11 12:04:44 -0700 | [diff] [blame] | 140 | psimd_f32 vo8ACEp0 = vbias; |
| 141 | |
| 142 | const psimd_f32 vi0x89AB = psimd_load_f32(i0); |
| 143 | const psimd_f32 vi1x89AB = psimd_load_f32(i1); |
| 144 | const psimd_f32 vi2x89AB = psimd_load_f32(i2); |
| 145 | |
Marat Dukhan | bc967c7 | 2020-10-20 18:18:40 -0700 | [diff] [blame] | 146 | const psimd_f32 vi0xCDEF = psimd_load_f32(i0 + 4); |
| 147 | const psimd_f32 vi1xCDEF = psimd_load_f32(i1 + 4); |
| 148 | const psimd_f32 vi2xCDEF = psimd_load_f32(i2 + 4); |
Erich Elsen | fd7a6e3 | 2020-06-11 12:04:44 -0700 | [diff] [blame] | 149 | |
| 150 | const psimd_f32 vi0x8ACE = psimd_andmask_f32(vmask_even, psimd_concat_even_f32(vi0x89AB, vi0xCDEF)); |
| 151 | const psimd_f32 vi0x9BDF = psimd_andmask_f32(vmask_odd, psimd_concat_odd_f32(vi0x89AB, vi0xCDEF)); |
| 152 | const psimd_f32 vi1x8ACE = psimd_andmask_f32(vmask_even, psimd_concat_even_f32(vi1x89AB, vi1xCDEF)); |
| 153 | const psimd_f32 vi1x9BDF = psimd_andmask_f32(vmask_odd, psimd_concat_odd_f32(vi1x89AB, vi1xCDEF)); |
| 154 | const psimd_f32 vi2x8ACE = psimd_andmask_f32(vmask_even, psimd_concat_even_f32(vi2x89AB, vi2xCDEF)); |
| 155 | const psimd_f32 vi2x9BDF = psimd_andmask_f32(vmask_odd, psimd_concat_odd_f32(vi2x89AB, vi2xCDEF)); |
| 156 | |
| 157 | vo8ACEp0 = psimd_add_f32(vo8ACEp0, psimd_mul_f32(vi0x8ACE, vk01)); |
| 158 | psimd_f32 vo8ACEp1 = psimd_mul_f32(vi1x8ACE, vk11); |
| 159 | psimd_f32 vo8ACEp2 = psimd_mul_f32(vi2x8ACE, vk21); |
| 160 | |
| 161 | const psimd_f32 vi0xF9BD = rotright_f32(vi0x9BDF); |
| 162 | const psimd_f32 vi1xF9BD = rotright_f32(vi1x9BDF); |
| 163 | const psimd_f32 vi2xF9BD = rotright_f32(vi2x9BDF); |
| 164 | |
| 165 | vo8ACEp0 = psimd_add_f32(vo8ACEp0, psimd_mul_f32(vi0x9BDF, vk02)); |
| 166 | vo8ACEp1 = psimd_add_f32(vo8ACEp1, psimd_mul_f32(vi1x9BDF, vk12)); |
| 167 | vo8ACEp2 = psimd_add_f32(vo8ACEp2, psimd_mul_f32(vi2x9BDF, vk22)); |
| 168 | |
| 169 | const psimd_f32 vi0x7BDF = movess_f32(vi0xF9BD, vi0x7531); |
| 170 | const psimd_f32 vi1x7BDF = movess_f32(vi1xF9BD, vi1x7531); |
| 171 | const psimd_f32 vi2x7BDF = movess_f32(vi2xF9BD, vi2x7531); |
| 172 | |
| 173 | vo8ACEp0 = psimd_add_f32(vo8ACEp0, psimd_mul_f32(vi0x7BDF, vk00)); |
| 174 | vo8ACEp1 = psimd_add_f32(vo8ACEp1, psimd_mul_f32(vi1x7BDF, vk10)); |
| 175 | vo8ACEp2 = psimd_add_f32(vo8ACEp2, psimd_mul_f32(vi2x7BDF, vk20)); |
| 176 | |
| 177 | psimd_f32 vo = psimd_add_f32(vo8ACEp0, vo8ACEp1); |
| 178 | vo = psimd_add_f32(vo, vo8ACEp2); |
| 179 | |
| 180 | vo = psimd_max_f32(vo, vmin); |
| 181 | vo = psimd_min_f32(vo, vmax); |
| 182 | |
Marat Dukhan | 7515777 | 2020-10-21 01:46:28 -0700 | [diff] [blame] | 183 | if (w == 7 * sizeof(float)) { |
Erich Elsen | fd7a6e3 | 2020-06-11 12:04:44 -0700 | [diff] [blame] | 184 | psimd_store_f32(output, vo); |
Marat Dukhan | c808c97 | 2020-10-20 19:39:41 -0700 | [diff] [blame] | 185 | output += 4; |
Erich Elsen | fd7a6e3 | 2020-06-11 12:04:44 -0700 | [diff] [blame] | 186 | } else { |
Marat Dukhan | 7515777 | 2020-10-21 01:46:28 -0700 | [diff] [blame] | 187 | w += 1 * sizeof(float); |
| 188 | if (w & (4 * sizeof(float))) { |
Marat Dukhan | c808c97 | 2020-10-20 19:39:41 -0700 | [diff] [blame] | 189 | psimd_store2_f32(output, vo); |
| 190 | output += 2; |
Erich Elsen | fd7a6e3 | 2020-06-11 12:04:44 -0700 | [diff] [blame] | 191 | vo = psimd_concat_hi_f32(vo, vo); |
| 192 | } |
Marat Dukhan | 7515777 | 2020-10-21 01:46:28 -0700 | [diff] [blame] | 193 | if (w & (2 * sizeof(float))) { |
Marat Dukhan | c808c97 | 2020-10-20 19:39:41 -0700 | [diff] [blame] | 194 | psimd_store1_f32(output, vo); |
| 195 | output += 1; |
Erich Elsen | fd7a6e3 | 2020-06-11 12:04:44 -0700 | [diff] [blame] | 196 | } |
| 197 | } |
| 198 | } |
| 199 | |
Marat Dukhan | 7ed0e3c | 2020-10-21 00:41:31 -0700 | [diff] [blame] | 200 | i0 = (const float*) ((uintptr_t) i2 - input_decrement); |
Marat Dukhan | 7515777 | 2020-10-21 01:46:28 -0700 | [diff] [blame] | 201 | i1 = (const float*) ((uintptr_t) i0 + input_width); |
| 202 | i2 = (const float*) ((uintptr_t) i1 + input_width); |
Marat Dukhan | 7ed0e3c | 2020-10-21 00:41:31 -0700 | [diff] [blame] | 203 | |
Marat Dukhan | 04d91c8 | 2020-10-20 18:35:44 -0700 | [diff] [blame] | 204 | output_height -= 1; |
Marat Dukhan | 7ed0e3c | 2020-10-21 00:41:31 -0700 | [diff] [blame] | 205 | padded_input_height -= 2; |
Marat Dukhan | 04d91c8 | 2020-10-20 18:35:44 -0700 | [diff] [blame] | 206 | } while (output_height != 0); |
Erich Elsen | fd7a6e3 | 2020-06-11 12:04:44 -0700 | [diff] [blame] | 207 | } |