Frank Barchard | 1a95305 | 2020-11-16 18:44:58 -0800 | [diff] [blame] | 1 | // Copyright 2020 Google LLC |
| 2 | // |
| 3 | // This source code is licensed under the BSD-style license found in the |
| 4 | // LICENSE file in the root directory of this source tree. |
| 5 | |
| 6 | #include <assert.h> |
| 7 | |
| 8 | #include <wasm_simd128.h> |
| 9 | |
| 10 | |
| 11 | #include <xnnpack/dwconv.h> |
| 12 | #include <xnnpack/math.h> |
| 13 | |
Frank Barchard | 1a95305 | 2020-11-16 18:44:58 -0800 | [diff] [blame] | 14 | |
Frank Barchard | db5c32d | 2020-11-16 23:58:42 -0800 | [diff] [blame] | 15 | $ARCH_SUFFIX = "_x86" if X86 else "_arm" |
Frank Barchard | 1a95305 | 2020-11-16 18:44:58 -0800 | [diff] [blame] | 16 | |
// Depthwise 2D convolution, CHW layout, 3x3 kernel, stride 2, padding 1 (top/left),
// WAsm SIMD: 1 output row x 4 output pixels per iteration, 3 partial accumulators.
// Template: ${ARCH_SUFFIX} ("_x86"/"_arm") selects the min/max lowering in the clamp step.
Frank Barchard | db5c32d | 2020-11-16 23:58:42 -0800 | [diff] [blame] | 17 | void xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd${ARCH_SUFFIX}_1x4_acc3( |
Frank Barchard | 1a95305 | 2020-11-16 18:44:58 -0800 | [diff] [blame] | 18 | size_t input_height, |
| 19 | size_t input_width, |
| 20 | const float* input, |
| 21 | const float* weights, |
| 22 | const float* zero, |
| 23 | float* output, |
| 24 | uint32_t padding_top, |
| 25 | const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)]) |
| 26 | { |
| 27 | assert(input_height != 0); |
| 28 | assert(input_width != 0); |
| 29 | assert(input_width % sizeof(float) == 0); |
// NOTE(review): padding_top is uint32_t, so the >= 0 assert below is always true
// (tautological-compare warning on some compilers); kept unchanged for symmetry.
| 30 | assert(padding_top >= 0); |
| 31 | assert(padding_top <= 1); |
| 32 | |
// Lane masks zero out input lanes past the last valid pixel in the remainder block.
| 33 | const v128_t vmask_even = wasm_v128_load(params->scalar.mask_even); |
| 34 | const v128_t vmask_odd = wasm_v128_load(params->scalar.mask_odd); |
// FIX(review): "¶ms" was HTML-entity mojibake ("&para" rendered as the pilcrow sign);
// restored the intended address-of expression "&params->scalar.max/min".
| 35 | const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max); |
| 36 | const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min); |
| 37 | |
// Bias followed by the nine 3x3 kernel taps, each splatted across all 4 lanes.
| 38 | const v128_t vbias = wasm_v32x4_load_splat(weights); |
| 39 | const v128_t vk00 = wasm_v32x4_load_splat(weights + 1); |
| 40 | const v128_t vk01 = wasm_v32x4_load_splat(weights + 2); |
| 41 | const v128_t vk02 = wasm_v32x4_load_splat(weights + 3); |
| 42 | const v128_t vk10 = wasm_v32x4_load_splat(weights + 4); |
| 43 | const v128_t vk11 = wasm_v32x4_load_splat(weights + 5); |
| 44 | const v128_t vk12 = wasm_v32x4_load_splat(weights + 6); |
| 45 | const v128_t vk20 = wasm_v32x4_load_splat(weights + 7); |
| 46 | const v128_t vk21 = wasm_v32x4_load_splat(weights + 8); |
| 47 | const v128_t vk22 = wasm_v32x4_load_splat(weights + 9); |
| 48 | |
Frank Barchard | 2af73ac | 2020-11-23 11:45:10 -0800 | [diff] [blame] | 49 | const v128_t vzero = wasm_f32x4_splat(0.0f); |
| 50 | |
// Bytes to step the row pointers back after finishing a row (full 8-pixel blocks only).
Frank Barchard | 1a95305 | 2020-11-16 18:44:58 -0800 | [diff] [blame] | 51 | const size_t input_decrement = round_down_po2(input_width, 4 /* SIMD output width */ * 2 /* subsampling */ * sizeof(float)); |
| 52 | |
// Row pointers: i0/i1/i2 are the three input rows of the 3x3 window;
// i0 is redirected to the zero row when the top row is padding.
| 53 | const float* i0 = (const float*) ((uintptr_t) input - ((-padding_top) & input_width)); |
| 54 | const float* i1 = (const float*) ((uintptr_t) i0 + input_width); |
| 55 | if XNN_UNPREDICTABLE(padding_top != 0) { |
| 56 | i0 = zero; |
| 57 | } |
| 58 | const float* i2 = (const float*) ((uintptr_t) i1 + input_width); |
| 59 | |
// Stride-2 vertical: each output row consumes 2 input rows; 1 implicit zero row pads the bottom.
| 60 | size_t padded_input_height = input_height + padding_top + 1 /* padding bottom */; |
| 61 | size_t output_height = (padded_input_height - 3 /* kernel size */ + 2 /* subsampling */) / 2; |
| 62 | do { |
// Bottom row of the window falls into bottom padding -> read from the zero row.
| 63 | if XNN_UNPREDICTABLE(padded_input_height <= 3) { |
| 64 | i2 = zero; |
| 65 | } |
| 66 | |
// Carried odd columns (7/5/3/1) from the previous block; zero at the left edge (left padding).
Frank Barchard | 2af73ac | 2020-11-23 11:45:10 -0800 | [diff] [blame] | 67 | v128_t vi0x7531 = vzero; |
| 68 | v128_t vi1x7531 = vzero; |
| 69 | v128_t vi2x7531 = vzero; |
Frank Barchard | 1a95305 | 2020-11-16 18:44:58 -0800 | [diff] [blame] | 70 | |
// Main loop: 8 input pixels (columns 8..F) -> 4 output pixels (8, A, C, E) per iteration.
| 71 | size_t w = input_width; |
| 72 | for (; w >= 8 * sizeof(float); w -= 8 * sizeof(float)) { |
| 73 | v128_t vo8ACEp0 = vbias; |
| 74 | |
| 75 | const v128_t vi0x89AB = wasm_v128_load(i0); |
| 76 | const v128_t vi1x89AB = wasm_v128_load(i1); |
| 77 | const v128_t vi2x89AB = wasm_v128_load(i2); |
| 78 | |
| 79 | const v128_t vi0xCDEF = wasm_v128_load(i0 + 4); |
| 80 | i0 += 8; |
| 81 | const v128_t vi1xCDEF = wasm_v128_load(i1 + 4); |
| 82 | i1 += 8; |
| 83 | const v128_t vi2xCDEF = wasm_v128_load(i2 + 4); |
| 84 | i2 += 8; |
| 85 | |
// De-interleave into even columns (8ACE, the window centers) and odd columns (9BDF).
Frank Barchard | 9cef5ea | 2020-11-18 14:52:08 -0800 | [diff] [blame] | 86 | const v128_t vi0x8ACE = wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 0, 2, 4 + 0, 4 + 2); |
| 87 | const v128_t vi0x9BDF = wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 1, 3, 4 + 1, 4 + 3); |
| 88 | const v128_t vi1x8ACE = wasm_v32x4_shuffle(vi1x89AB, vi1xCDEF, 0, 2, 4 + 0, 4 + 2); |
| 89 | const v128_t vi1x9BDF = wasm_v32x4_shuffle(vi1x89AB, vi1xCDEF, 1, 3, 4 + 1, 4 + 3); |
| 90 | const v128_t vi2x8ACE = wasm_v32x4_shuffle(vi2x89AB, vi2xCDEF, 0, 2, 4 + 0, 4 + 2); |
| 91 | const v128_t vi2x9BDF = wasm_v32x4_shuffle(vi2x89AB, vi2xCDEF, 1, 3, 4 + 1, 4 + 3); |
Frank Barchard | 1a95305 | 2020-11-16 18:44:58 -0800 | [diff] [blame] | 92 | |
// Center-column taps (k01/k11/k21), one independent accumulator per input row.
| 93 | vo8ACEp0 = wasm_f32x4_add(vo8ACEp0, wasm_f32x4_mul(vi0x8ACE, vk01)); |
| 94 | v128_t vo8ACEp1 = wasm_f32x4_mul(vi1x8ACE, vk11); |
| 95 | v128_t vo8ACEp2 = wasm_f32x4_mul(vi2x8ACE, vk21); |
| 96 | |
// Rotate odd columns right by one lane (9BDF -> F9BD) so lane 0 can take the carried neighbor.
Frank Barchard | 9cef5ea | 2020-11-18 14:52:08 -0800 | [diff] [blame] | 97 | const v128_t vi0xF9BD = wasm_v32x4_shuffle(vi0x9BDF, vi0x9BDF, 3, 0, 1, 2); |
| 98 | const v128_t vi1xF9BD = wasm_v32x4_shuffle(vi1x9BDF, vi1x9BDF, 3, 0, 1, 2); |
| 99 | const v128_t vi2xF9BD = wasm_v32x4_shuffle(vi2x9BDF, vi2x9BDF, 3, 0, 1, 2); |
Frank Barchard | 1a95305 | 2020-11-16 18:44:58 -0800 | [diff] [blame] | 100 | |
// Right-column taps (k02/k12/k22) on the odd columns.
| 101 | vo8ACEp0 = wasm_f32x4_add(vo8ACEp0, wasm_f32x4_mul(vi0x9BDF, vk02)); |
| 102 | vo8ACEp1 = wasm_f32x4_add(vo8ACEp1, wasm_f32x4_mul(vi1x9BDF, vk12)); |
| 103 | vo8ACEp2 = wasm_f32x4_add(vo8ACEp2, wasm_f32x4_mul(vi2x9BDF, vk22)); |
| 104 | |
// Splice the carried column 7 into lane 0: 7BDF are the left neighbors of 8ACE.
Frank Barchard | 9cef5ea | 2020-11-18 14:52:08 -0800 | [diff] [blame] | 105 | const v128_t vi0x7BDF = wasm_v32x4_shuffle(vi0xF9BD, vi0x7531, 4, 1, 2, 3); |
| 106 | const v128_t vi1x7BDF = wasm_v32x4_shuffle(vi1xF9BD, vi1x7531, 4, 1, 2, 3); |
| 107 | const v128_t vi2x7BDF = wasm_v32x4_shuffle(vi2xF9BD, vi2x7531, 4, 1, 2, 3); |
Frank Barchard | 1a95305 | 2020-11-16 18:44:58 -0800 | [diff] [blame] | 108 | |
// Column F becomes the carried left neighbor for the next block.
| 109 | vi0x7531 = vi0xF9BD; |
| 110 | vi1x7531 = vi1xF9BD; |
| 111 | vi2x7531 = vi2xF9BD; |
| 112 | |
// Left-column taps (k00/k10/k20).
| 113 | vo8ACEp0 = wasm_f32x4_add(vo8ACEp0, wasm_f32x4_mul(vi0x7BDF, vk00)); |
| 114 | vo8ACEp1 = wasm_f32x4_add(vo8ACEp1, wasm_f32x4_mul(vi1x7BDF, vk10)); |
| 115 | vo8ACEp2 = wasm_f32x4_add(vo8ACEp2, wasm_f32x4_mul(vi2x7BDF, vk20)); |
| 116 | |
// Reduce the 3 partial accumulators.
| 117 | v128_t vo = wasm_f32x4_add(vo8ACEp0, vo8ACEp1); |
| 118 | vo = wasm_f32x4_add(vo, vo8ACEp2); |
| 119 | |
// Clamp to [min, max]; the x86 variant uses compare+bitselect instead of
// f32x4.max/min (presumably a faster lowering on x86 engines -- confirm).
Frank Barchard | db5c32d | 2020-11-16 23:58:42 -0800 | [diff] [blame] | 120 | $if X86: |
Frank Barchard | 2af73ac | 2020-11-23 11:45:10 -0800 | [diff] [blame] | 121 | vo = wasm_v128_bitselect(vmin, vo, wasm_f32x4_lt(vo, vmin)); |
| 122 | vo = wasm_v128_bitselect(vo, vmax, wasm_f32x4_le(vo, vmax)); |
Frank Barchard | db5c32d | 2020-11-16 23:58:42 -0800 | [diff] [blame] | 123 | $else: |
Frank Barchard | 2af73ac | 2020-11-23 11:45:10 -0800 | [diff] [blame] | 124 | vo = wasm_f32x4_max(vo, vmin); |
| 125 | vo = wasm_f32x4_min(vo, vmax); |
Frank Barchard | 1a95305 | 2020-11-16 18:44:58 -0800 | [diff] [blame] | 126 | |
| 127 | wasm_v128_store(output, vo); |
| 128 | output += 4; |
| 129 | } |
| 130 | // Potentially process the last block of 0..7 pixels. |
| 131 | assert(w < 8 * sizeof(float)); |
| 132 | if XNN_LIKELY(w != 0) { |
| 133 | v128_t vo8ACEp0 = vbias; |
| 134 | |
// Loads may read past the row end; invalid lanes are masked off below.
| 135 | const v128_t vi0x89AB = wasm_v128_load(i0); |
| 136 | const v128_t vi1x89AB = wasm_v128_load(i1); |
| 137 | const v128_t vi2x89AB = wasm_v128_load(i2); |
| 138 | |
| 139 | const v128_t vi0xCDEF = wasm_v128_load(i0 + 4); |
| 140 | const v128_t vi1xCDEF = wasm_v128_load(i1 + 4); |
| 141 | const v128_t vi2xCDEF = wasm_v128_load(i2 + 4); |
| 142 | |
// Same even/odd de-interleave as the main loop, with lanes past the row end zeroed.
Frank Barchard | 9cef5ea | 2020-11-18 14:52:08 -0800 | [diff] [blame] | 143 | const v128_t vi0x8ACE = wasm_v128_and(vmask_even, wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 0, 2, 4 + 0, 4 + 2)); |
| 144 | const v128_t vi0x9BDF = wasm_v128_and(vmask_odd, wasm_v32x4_shuffle(vi0x89AB, vi0xCDEF, 1, 3, 4 + 1, 4 + 3)); |
| 145 | const v128_t vi1x8ACE = wasm_v128_and(vmask_even, wasm_v32x4_shuffle(vi1x89AB, vi1xCDEF, 0, 2, 4 + 0, 4 + 2)); |
| 146 | const v128_t vi1x9BDF = wasm_v128_and(vmask_odd, wasm_v32x4_shuffle(vi1x89AB, vi1xCDEF, 1, 3, 4 + 1, 4 + 3)); |
| 147 | const v128_t vi2x8ACE = wasm_v128_and(vmask_even, wasm_v32x4_shuffle(vi2x89AB, vi2xCDEF, 0, 2, 4 + 0, 4 + 2)); |
| 148 | const v128_t vi2x9BDF = wasm_v128_and(vmask_odd, wasm_v32x4_shuffle(vi2x89AB, vi2xCDEF, 1, 3, 4 + 1, 4 + 3)); |
Frank Barchard | 1a95305 | 2020-11-16 18:44:58 -0800 | [diff] [blame] | 149 | |
| 150 | vo8ACEp0 = wasm_f32x4_add(vo8ACEp0, wasm_f32x4_mul(vi0x8ACE, vk01)); |
| 151 | v128_t vo8ACEp1 = wasm_f32x4_mul(vi1x8ACE, vk11); |
| 152 | v128_t vo8ACEp2 = wasm_f32x4_mul(vi2x8ACE, vk21); |
| 153 | |
Frank Barchard | 9cef5ea | 2020-11-18 14:52:08 -0800 | [diff] [blame] | 154 | const v128_t vi0xF9BD = wasm_v32x4_shuffle(vi0x9BDF, vi0x9BDF, 3, 0, 1, 2); |
| 155 | const v128_t vi1xF9BD = wasm_v32x4_shuffle(vi1x9BDF, vi1x9BDF, 3, 0, 1, 2); |
| 156 | const v128_t vi2xF9BD = wasm_v32x4_shuffle(vi2x9BDF, vi2x9BDF, 3, 0, 1, 2); |
Frank Barchard | 1a95305 | 2020-11-16 18:44:58 -0800 | [diff] [blame] | 157 | |
| 158 | vo8ACEp0 = wasm_f32x4_add(vo8ACEp0, wasm_f32x4_mul(vi0x9BDF, vk02)); |
| 159 | vo8ACEp1 = wasm_f32x4_add(vo8ACEp1, wasm_f32x4_mul(vi1x9BDF, vk12)); |
| 160 | vo8ACEp2 = wasm_f32x4_add(vo8ACEp2, wasm_f32x4_mul(vi2x9BDF, vk22)); |
| 161 | |
Frank Barchard | 9cef5ea | 2020-11-18 14:52:08 -0800 | [diff] [blame] | 162 | const v128_t vi0x7BDF = wasm_v32x4_shuffle(vi0xF9BD, vi0x7531, 4, 1, 2, 3); |
| 163 | const v128_t vi1x7BDF = wasm_v32x4_shuffle(vi1xF9BD, vi1x7531, 4, 1, 2, 3); |
| 164 | const v128_t vi2x7BDF = wasm_v32x4_shuffle(vi2xF9BD, vi2x7531, 4, 1, 2, 3); |
Frank Barchard | 1a95305 | 2020-11-16 18:44:58 -0800 | [diff] [blame] | 165 | |
| 166 | vo8ACEp0 = wasm_f32x4_add(vo8ACEp0, wasm_f32x4_mul(vi0x7BDF, vk00)); |
| 167 | vo8ACEp1 = wasm_f32x4_add(vo8ACEp1, wasm_f32x4_mul(vi1x7BDF, vk10)); |
| 168 | vo8ACEp2 = wasm_f32x4_add(vo8ACEp2, wasm_f32x4_mul(vi2x7BDF, vk20)); |
| 169 | |
| 170 | v128_t vo = wasm_f32x4_add(vo8ACEp0, vo8ACEp1); |
| 171 | vo = wasm_f32x4_add(vo, vo8ACEp2); |
| 172 | |
Frank Barchard | db5c32d | 2020-11-16 23:58:42 -0800 | [diff] [blame] | 173 | $if X86: |
Frank Barchard | 2af73ac | 2020-11-23 11:45:10 -0800 | [diff] [blame] | 174 | vo = wasm_v128_bitselect(vmin, vo, wasm_f32x4_lt(vo, vmin)); |
| 175 | vo = wasm_v128_bitselect(vo, vmax, wasm_f32x4_le(vo, vmax)); |
Frank Barchard | db5c32d | 2020-11-16 23:58:42 -0800 | [diff] [blame] | 176 | $else: |
Frank Barchard | 2af73ac | 2020-11-23 11:45:10 -0800 | [diff] [blame] | 177 | vo = wasm_f32x4_max(vo, vmin); |
| 178 | vo = wasm_f32x4_min(vo, vmax); |
Frank Barchard | 1a95305 | 2020-11-16 18:44:58 -0800 | [diff] [blame] | 179 | |
// Store 4, 2, and/or 1 outputs depending on the remaining width
// (7 input pixels still yield a full vector of 4 outputs).
| 180 | if (w == 7 * sizeof(float)) { |
| 181 | wasm_v128_store(output, vo); |
| 182 | output += 4; |
| 183 | } else { |
| 184 | w += 1 * sizeof(float); |
| 185 | if (w & (4 * sizeof(float))) { |
| 186 | *((double*) output) = wasm_f64x2_extract_lane(vo, 0); |
| 187 | output += 2; |
Frank Barchard | 2af73ac | 2020-11-23 11:45:10 -0800 | [diff] [blame] | 188 | vo = wasm_v32x4_shuffle(vo, vo, 2, 3, 0, 1); |
Frank Barchard | 1a95305 | 2020-11-16 18:44:58 -0800 | [diff] [blame] | 189 | } |
| 190 | if (w & (2 * sizeof(float))) { |
| 191 | *output = wasm_f32x4_extract_lane(vo, 0); |
| 192 | output += 1; |
| 193 | } |
| 194 | } |
| 195 | } |
| 196 | |
// Advance two input rows (stride 2): the old i2 becomes the next window's i0.
| 197 | i0 = (const float*) ((uintptr_t) i2 - input_decrement); |
| 198 | i1 = (const float*) ((uintptr_t) i0 + input_width); |
| 199 | i2 = (const float*) ((uintptr_t) i1 + input_width); |
| 200 | |
| 201 | output_height -= 1; |
| 202 | padded_input_height -= 2; |
| 203 | } while (output_height != 0); |
| 204 | } |