Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 1 | /* |
| 2 | * Copyright 2018 Google Inc. |
| 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license that can be |
| 5 | * found in the LICENSE file. |
| 6 | */ |
| 7 | |
| 8 | #ifndef SkBitmapProcState_opts_DEFINED |
| 9 | #define SkBitmapProcState_opts_DEFINED |
| 10 | |
Mike Klein | b83cc76 | 2019-09-26 15:34:34 -0500 | [diff] [blame] | 11 | #include "include/private/SkVx.h" |
Mike Klein | c0bd9f9 | 2019-04-23 12:05:21 -0500 | [diff] [blame] | 12 | #include "src/core/SkBitmapProcState.h" |
Mike Klein | b83cc76 | 2019-09-26 15:34:34 -0500 | [diff] [blame] | 13 | #include "src/core/SkMSAN.h" |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 14 | |
// SkBitmapProcState optimized Shader, Sample, or Matrix procs.
//
// S32_alpha_D32_filter_DX is the proc that exploits instructions beyond
// our common baseline SSE2/NEON instruction sets, which is why it lives
// here; S32_alpha_D32_filter_DXDY (specialized only for NEON) sits alongside it.
//
// The rest are scattershot at the moment, but I want to get them
// all migrated to be normal code inside SkBitmapProcState.cpp.
| 23 | |
| 24 | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| 25 | #include <immintrin.h> |
| 26 | #elif defined(SK_ARM_HAS_NEON) |
| 27 | #include <arm_neon.h> |
| 28 | #endif |
| 29 | |
| 30 | namespace SK_OPTS_NS { |
| 31 | |
// Splits one packed sample descriptor into its three fields.  Every proc in this
// file consumes coordinates in this layout (shown for a 32-bit `packed`):
//
//     bits 18 and up : first integer coordinate  (x0 or y0)  -> *v0
//     bits 14..17    : 4-bit lerp weight                      -> *w
//     bits  0..13    : second integer coordinate (x1 or y1)  -> *v1
//
// *w is the weight given to the sample at *v1; the sample at *v0 gets 16 - *w.
// U32 may be a scalar or a vector type; Out only needs to be assignable from the
// result of shifting/masking a U32.
template <typename U32, typename Out>
static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
    const auto first_coord  =  packed >> 18;
    const auto second_coord =  packed & 0x3fff;
    const auto lerp_weight  = (packed >> 14) & 0xf;

    *v0 = first_coord;
    *v1 = second_coord;
    *w  = lerp_weight;
}
| 39 | |
Mike Klein | b83cc76 | 2019-09-26 15:34:34 -0500 | [diff] [blame] | 40 | #if 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 |
| 41 | /*not static*/ inline |
| 42 | void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, |
| 43 | const uint32_t* xy, int count, uint32_t* colors) { |
| 44 | SkASSERT(count > 0 && colors != nullptr); |
Mike Reed | bbaad02 | 2020-11-24 16:04:05 -0500 | [diff] [blame] | 45 | SkASSERT(s.fBilerp); |
Mike Klein | b83cc76 | 2019-09-26 15:34:34 -0500 | [diff] [blame] | 46 | SkASSERT(kN32_SkColorType == s.fPixmap.colorType()); |
| 47 | SkASSERT(s.fAlphaScale <= 256); |
| 48 | |
| 49 | // In a _DX variant only X varies; all samples share y0/y1 coordinates and wy weight. |
| 50 | int y0, y1, wy; |
| 51 | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
| 52 | |
Mike Klein | 666e4da | 2019-09-27 11:21:20 -0500 | [diff] [blame] | 53 | const uint32_t* row0 = s.fPixmap.addr32(0,y0); |
| 54 | const uint32_t* row1 = s.fPixmap.addr32(0,y1); |
Mike Klein | b83cc76 | 2019-09-26 15:34:34 -0500 | [diff] [blame] | 55 | |
| 56 | auto bilerp = [&](skvx::Vec<8,uint32_t> packed_x_coordinates) -> skvx::Vec<8,uint32_t> { |
| 57 | // Decode up to 8 output pixels' x-coordinates and weights. |
| 58 | skvx::Vec<8,uint32_t> x0,x1,wx; |
| 59 | decode_packed_coordinates_and_weight(packed_x_coordinates, &x0, &x1, &wx); |
| 60 | |
| 61 | // Splat wx to each color channel. |
| 62 | wx = (wx << 0) |
| 63 | | (wx << 8) |
| 64 | | (wx << 16) |
| 65 | | (wx << 24); |
| 66 | |
Mike Klein | 666e4da | 2019-09-27 11:21:20 -0500 | [diff] [blame] | 67 | auto gather = [](const uint32_t* ptr, skvx::Vec<8,uint32_t> ix) { |
| 68 | #if 1 |
| 69 | // Drop into AVX2 intrinsics for vpgatherdd. |
| 70 | return skvx::bit_pun<skvx::Vec<8,uint32_t>>( |
| 71 | _mm256_i32gather_epi32((const int*)ptr, skvx::bit_pun<__m256i>(ix), 4)); |
| 72 | #else |
| 73 | // Portable version... sometimes I don't trust vpgatherdd. |
| 74 | return skvx::Vec<8,uint32_t>{ |
| 75 | ptr[ix[0]], ptr[ix[1]], ptr[ix[2]], ptr[ix[3]], |
| 76 | ptr[ix[4]], ptr[ix[5]], ptr[ix[6]], ptr[ix[7]], |
| 77 | }; |
| 78 | #endif |
| 79 | }; |
| 80 | |
Mike Klein | b83cc76 | 2019-09-26 15:34:34 -0500 | [diff] [blame] | 81 | // Gather the 32 32-bit pixels that we'll bilerp into our 8 output pixels. |
Mike Klein | 666e4da | 2019-09-27 11:21:20 -0500 | [diff] [blame] | 82 | skvx::Vec<8,uint32_t> tl = gather(row0, x0), tr = gather(row0, x1), |
| 83 | bl = gather(row1, x0), br = gather(row1, x1); |
Mike Klein | b83cc76 | 2019-09-26 15:34:34 -0500 | [diff] [blame] | 84 | |
Mike Klein | ac353cb | 2019-09-27 12:58:01 -0500 | [diff] [blame] | 85 | #if 1 |
| 86 | // We'll use _mm256_maddubs_epi16() to lerp much like in the SSSE3 code. |
| 87 | auto lerp_x = [&](skvx::Vec<8,uint32_t> L, skvx::Vec<8,uint32_t> R) { |
| 88 | __m256i l = skvx::bit_pun<__m256i>(L), |
| 89 | r = skvx::bit_pun<__m256i>(R), |
| 90 | wr = skvx::bit_pun<__m256i>(wx), |
| 91 | wl = _mm256_sub_epi8(_mm256_set1_epi8(16), wr); |
| 92 | |
| 93 | // Interlace l,r bytewise and line them up with their weights, then lerp. |
| 94 | __m256i lo = _mm256_maddubs_epi16(_mm256_unpacklo_epi8( l, r), |
| 95 | _mm256_unpacklo_epi8(wl,wr)); |
| 96 | __m256i hi = _mm256_maddubs_epi16(_mm256_unpackhi_epi8( l, r), |
| 97 | _mm256_unpackhi_epi8(wl,wr)); |
| 98 | |
| 99 | // Those _mm256_unpack??_epi8() calls left us in a bit of an odd order: |
| 100 | // |
| 101 | // if l = a b c d | e f g h |
| 102 | // and r = A B C D | E F G H |
| 103 | // |
| 104 | // then lo = a A b B | e E f F (low half of each input) |
| 105 | // and hi = c C d D | g G h H (high half of each input) |
| 106 | // |
| 107 | // To get everything back in original order we need to transpose that. |
| 108 | __m256i abcd = _mm256_permute2x128_si256(lo, hi, 0x20), |
| 109 | efgh = _mm256_permute2x128_si256(lo, hi, 0x31); |
| 110 | |
| 111 | return skvx::join(skvx::bit_pun<skvx::Vec<16,uint16_t>>(abcd), |
| 112 | skvx::bit_pun<skvx::Vec<16,uint16_t>>(efgh)); |
| 113 | }; |
| 114 | |
| 115 | skvx::Vec<32, uint16_t> top = lerp_x(tl, tr), |
| 116 | bot = lerp_x(bl, br), |
| 117 | sum = 16*top + (bot-top)*wy; |
| 118 | #else |
Mike Klein | b83cc76 | 2019-09-26 15:34:34 -0500 | [diff] [blame] | 119 | // Treat 32-bit pixels as 4 8-bit values, and expand to 16-bit for room to multiply. |
| 120 | auto to_16x4 = [](auto v) -> skvx::Vec<32, uint16_t> { |
| 121 | return skvx::cast<uint16_t>(skvx::bit_pun<skvx::Vec<32, uint8_t>>(v)); |
| 122 | }; |
| 123 | |
| 124 | // Sum up weighted sample pixels. The naive, redundant math would be, |
| 125 | // |
| 126 | // sum = tl * (16-wy) * (16-wx) |
| 127 | // + bl * ( wy) * (16-wx) |
| 128 | // + tr * (16-wy) * ( wx) |
| 129 | // + br * ( wy) * ( wx) |
| 130 | // |
| 131 | // But we refactor to eliminate a bunch of those common factors. |
| 132 | auto lerp = [](auto lo, auto hi, auto w) { |
| 133 | return 16*lo + (hi-lo)*w; |
| 134 | }; |
| 135 | skvx::Vec<32, uint16_t> sum = lerp(lerp(to_16x4(tl), to_16x4(bl), wy), |
| 136 | lerp(to_16x4(tr), to_16x4(br), wy), to_16x4(wx)); |
Mike Klein | ac353cb | 2019-09-27 12:58:01 -0500 | [diff] [blame] | 137 | #endif |
Mike Klein | b83cc76 | 2019-09-26 15:34:34 -0500 | [diff] [blame] | 138 | |
| 139 | // Get back to [0,255] by dividing by maximum weight 16x16 = 256. |
| 140 | sum >>= 8; |
| 141 | |
jiepan | e947efb | 2020-05-07 14:09:05 +0800 | [diff] [blame] | 142 | // Scale by alpha if needed. |
| 143 | if(s.fAlphaScale < 256) { |
| 144 | sum *= s.fAlphaScale; |
| 145 | sum >>= 8; |
| 146 | } |
Mike Klein | b83cc76 | 2019-09-26 15:34:34 -0500 | [diff] [blame] | 147 | |
| 148 | // Pack back to 8-bit channels, undoing to_16x4(). |
| 149 | return skvx::bit_pun<skvx::Vec<8,uint32_t>>(skvx::cast<uint8_t>(sum)); |
| 150 | }; |
| 151 | |
| 152 | while (count >= 8) { |
| 153 | bilerp(skvx::Vec<8,uint32_t>::Load(xy)).store(colors); |
| 154 | xy += 8; |
| 155 | colors += 8; |
| 156 | count -= 8; |
| 157 | } |
| 158 | if (count > 0) { |
| 159 | __m256i active = skvx::bit_pun<__m256i>( count > skvx::Vec<8,int>{0,1,2,3, 4,5,6,7} ), |
| 160 | coords = _mm256_maskload_epi32((const int*)xy, active), |
| 161 | pixels; |
| 162 | |
| 163 | bilerp(skvx::bit_pun<skvx::Vec<8,uint32_t>>(coords)).store(&pixels); |
| 164 | _mm256_maskstore_epi32((int*)colors, active, pixels); |
| 165 | |
| 166 | sk_msan_mark_initialized(colors, colors+count, |
| 167 | "MSAN still doesn't understand AVX2 mask loads and stores."); |
| 168 | } |
| 169 | } |
| 170 | |
| 171 | #elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 172 | |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 173 | /*not static*/ inline |
| 174 | void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, |
| 175 | const uint32_t* xy, int count, uint32_t* colors) { |
| 176 | SkASSERT(count > 0 && colors != nullptr); |
Mike Reed | bbaad02 | 2020-11-24 16:04:05 -0500 | [diff] [blame] | 177 | SkASSERT(s.fBilerp); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 178 | SkASSERT(kN32_SkColorType == s.fPixmap.colorType()); |
Mike Klein | c3f6314 | 2019-09-25 13:56:44 -0500 | [diff] [blame] | 179 | SkASSERT(s.fAlphaScale <= 256); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 180 | |
Mike Klein | c3f6314 | 2019-09-25 13:56:44 -0500 | [diff] [blame] | 181 | // interpolate_in_x() is the crux of the SSSE3 implementation, |
| 182 | // interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16(). |
| 183 | auto interpolate_in_x = [](uint32_t A0, uint32_t A1, |
| 184 | uint32_t B0, uint32_t B1, |
| 185 | __m128i interlaced_x_weights) { |
| 186 | // _mm_maddubs_epi16() is a little idiosyncratic, but great as the core of a lerp. |
| 187 | // |
| 188 | // It takes two arguments interlaced byte-wise: |
| 189 | // - first arg: [ l,r, ... 7 more pairs of unsigned 8-bit values ...] |
| 190 | // - second arg: [ w,W, ... 7 more pairs of signed 8-bit values ...] |
| 191 | // and returns 8 signed 16-bit values: [ l*w + r*W, ... 7 more ... ]. |
| 192 | // |
| 193 | // That's why we go to all this trouble to make interlaced_x_weights, |
| 194 | // and here we're about to interlace A0 with A1 and B0 with B1 to match. |
| 195 | // |
| 196 | // Our interlaced_x_weights are all in [0,16], and so we need not worry about |
| 197 | // the signedness of that input nor about the signedness of the output. |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 198 | |
Mike Klein | c3f6314 | 2019-09-25 13:56:44 -0500 | [diff] [blame] | 199 | __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)), |
| 200 | interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1)); |
| 201 | |
| 202 | return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B), |
| 203 | interlaced_x_weights); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 204 | }; |
| 205 | |
Mike Klein | c3f6314 | 2019-09-25 13:56:44 -0500 | [diff] [blame] | 206 | // Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B. |
| 207 | // Returns two pixels, with each color channel in a 16-bit lane of the __m128i. |
| 208 | auto interpolate_in_x_and_y = [&](uint32_t A0, uint32_t A1, |
| 209 | uint32_t A2, uint32_t A3, |
| 210 | uint32_t B0, uint32_t B1, |
| 211 | uint32_t B2, uint32_t B3, |
| 212 | __m128i interlaced_x_weights, |
| 213 | int wy) { |
| 214 | // Interpolate each row in X, leaving 16-bit lanes scaled by interlaced_x_weights. |
| 215 | __m128i top = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights), |
| 216 | bot = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights); |
| 217 | |
| 218 | // Interpolate in Y. As in the SSE2 code, we calculate top*(16-wy) + bot*wy |
| 219 | // as 16*top + (bot-top)*wy to save a multiply. |
| 220 | __m128i px = _mm_add_epi16(_mm_slli_epi16(top, 4), |
| 221 | _mm_mullo_epi16(_mm_sub_epi16(bot, top), |
| 222 | _mm_set1_epi16(wy))); |
| 223 | |
| 224 | // Scale down by total max weight 16x16 = 256. |
| 225 | px = _mm_srli_epi16(px, 8); |
| 226 | |
| 227 | // Scale by alpha if needed. |
| 228 | if (s.fAlphaScale < 256) { |
| 229 | px = _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(s.fAlphaScale)), 8); |
| 230 | } |
| 231 | return px; |
| 232 | }; |
| 233 | |
| 234 | // We're in _DX mode here, so we're only varying in X. |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 235 | // That means the first entry of xy is our constant pair of Y coordinates and weight in Y. |
| 236 | // All the other entries in xy will be pairs of X coordinates and the X weight. |
| 237 | int y0, y1, wy; |
| 238 | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
| 239 | |
| 240 | auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()), |
| 241 | row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes()); |
| 242 | |
| 243 | while (count >= 4) { |
Mike Klein | c3f6314 | 2019-09-25 13:56:44 -0500 | [diff] [blame] | 244 | // We can really get going, loading 4 X-pairs at a time to produce 4 output pixels. |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 245 | int x0[4], |
| 246 | x1[4]; |
| 247 | __m128i wx; |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 248 | |
Mike Klein | c3f6314 | 2019-09-25 13:56:44 -0500 | [diff] [blame] | 249 | // decode_packed_coordinates_and_weight(), 4x. |
| 250 | __m128i packed = _mm_loadu_si128((const __m128i*)xy); |
| 251 | _mm_storeu_si128((__m128i*)x0, _mm_srli_epi32(packed, 18)); |
| 252 | _mm_storeu_si128((__m128i*)x1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff))); |
| 253 | wx = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf)); // [0,15] |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 254 | |
Mike Klein | c3f6314 | 2019-09-25 13:56:44 -0500 | [diff] [blame] | 255 | // Splat each x weight 4x (for each color channel) as wr for pixels on the right at x1, |
| 256 | // and sixteen minus that as wl for pixels on the left at x0. |
| 257 | __m128i wr = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)), |
| 258 | wl = _mm_sub_epi8(_mm_set1_epi8(16), wr); |
| 259 | |
| 260 | // We need to interlace wl and wr for _mm_maddubs_epi16(). |
| 261 | __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wl,wr), |
| 262 | interlaced_x_weights_CD = _mm_unpackhi_epi8(wl,wr); |
| 263 | |
| 264 | enum { A,B,C,D }; |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 265 | |
| 266 | // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time |
| 267 | // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each. |
Mike Klein | c3f6314 | 2019-09-25 13:56:44 -0500 | [diff] [blame] | 268 | __m128i AB = interpolate_in_x_and_y(row0[x0[A]], row0[x1[A]], |
| 269 | row1[x0[A]], row1[x1[A]], |
| 270 | row0[x0[B]], row0[x1[B]], |
| 271 | row1[x0[B]], row1[x1[B]], |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 272 | interlaced_x_weights_AB, wy); |
| 273 | |
| 274 | // Once more with the other half of the x-weights for two more pixels C,D. |
Mike Klein | c3f6314 | 2019-09-25 13:56:44 -0500 | [diff] [blame] | 275 | __m128i CD = interpolate_in_x_and_y(row0[x0[C]], row0[x1[C]], |
| 276 | row1[x0[C]], row1[x1[C]], |
| 277 | row0[x0[D]], row0[x1[D]], |
| 278 | row1[x0[D]], row1[x1[D]], |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 279 | interlaced_x_weights_CD, wy); |
| 280 | |
| 281 | // Scale by alpha, pack back together to 8-bit lanes, and write out four pixels! |
Mike Klein | c3f6314 | 2019-09-25 13:56:44 -0500 | [diff] [blame] | 282 | _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(AB, CD)); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 283 | xy += 4; |
| 284 | colors += 4; |
| 285 | count -= 4; |
| 286 | } |
| 287 | |
| 288 | while (count --> 0) { |
| 289 | // This is exactly the same flow as the count >= 4 loop above, but writing one pixel. |
| 290 | int x0, x1, wx; |
| 291 | decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx); |
| 292 | |
Mike Klein | c3f6314 | 2019-09-25 13:56:44 -0500 | [diff] [blame] | 293 | // As above, splat out wx four times as wr, and sixteen minus that as wl. |
| 294 | __m128i wr = _mm_set1_epi8(wx), // This splats it out 16 times, but that's fine. |
| 295 | wl = _mm_sub_epi8(_mm_set1_epi8(16), wr); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 296 | |
Mike Klein | c3f6314 | 2019-09-25 13:56:44 -0500 | [diff] [blame] | 297 | __m128i interlaced_x_weights = _mm_unpacklo_epi8(wl, wr); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 298 | |
| 299 | __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1], |
| 300 | row1[x0], row1[x1], |
| 301 | 0, 0, |
| 302 | 0, 0, |
Mike Klein | c3f6314 | 2019-09-25 13:56:44 -0500 | [diff] [blame] | 303 | interlaced_x_weights, wy); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 304 | |
Mike Klein | c3f6314 | 2019-09-25 13:56:44 -0500 | [diff] [blame] | 305 | *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(A, _mm_setzero_si128())); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 306 | } |
| 307 | } |
| 308 | |
| 309 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 310 | #elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 311 | |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 312 | /*not static*/ inline |
| 313 | void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, |
| 314 | const uint32_t* xy, int count, uint32_t* colors) { |
| 315 | SkASSERT(count > 0 && colors != nullptr); |
Mike Reed | bbaad02 | 2020-11-24 16:04:05 -0500 | [diff] [blame] | 316 | SkASSERT(s.fBilerp); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 317 | SkASSERT(kN32_SkColorType == s.fPixmap.colorType()); |
| 318 | SkASSERT(s.fAlphaScale <= 256); |
| 319 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 320 | int y0, y1, wy; |
| 321 | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 322 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 323 | auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ), |
| 324 | row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() ); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 325 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 326 | // We'll put one pixel in the low 4 16-bit lanes to line up with wy, |
| 327 | // and another in the upper 4 16-bit lanes to line up with 16 - wy. |
Mike Klein | 0ec56fc | 2019-09-25 11:38:46 -0500 | [diff] [blame] | 328 | const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16( wy), // Bottom pixel goes here. |
| 329 | _mm_set1_epi16(16-wy)); // Top pixel goes here. |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 330 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 331 | while (count --> 0) { |
| 332 | int x0, x1, wx; |
| 333 | decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 334 | |
Mike Klein | 0ec56fc | 2019-09-25 11:38:46 -0500 | [diff] [blame] | 335 | // Load the 4 pixels we're interpolating, in this grid: |
| 336 | // | tl tr | |
| 337 | // | bl br | |
| 338 | const __m128i tl = _mm_cvtsi32_si128(row0[x0]), tr = _mm_cvtsi32_si128(row0[x1]), |
| 339 | bl = _mm_cvtsi32_si128(row1[x0]), br = _mm_cvtsi32_si128(row1[x1]); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 340 | |
Mike Klein | 1cb5743 | 2019-09-25 12:14:28 -0500 | [diff] [blame] | 341 | // We want to calculate a sum of 4 pixels weighted in two directions: |
| 342 | // |
| 343 | // sum = tl * (16-wy) * (16-wx) |
| 344 | // + bl * ( wy) * (16-wx) |
| 345 | // + tr * (16-wy) * ( wx) |
| 346 | // + br * ( wy) * ( wx) |
| 347 | // |
| 348 | // (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.) |
| 349 | // |
| 350 | // We've already prepared allY as a vector containing [wy, 16-wy] as a way |
| 351 | // to apply those y-direction weights. So we'll start on the x-direction |
| 352 | // first, grouping into left and right halves, lined up with allY: |
| 353 | // |
| 354 | // L = [bl, tl] |
| 355 | // R = [br, tr] |
| 356 | // |
| 357 | // sum = horizontalSum( allY * (L*(16-wx) + R*wx) ) |
| 358 | // |
| 359 | // Rewriting that one more step, we can replace a multiply with a shift: |
| 360 | // |
| 361 | // sum = horizontalSum( allY * (16*L + (R-L)*wx) ) |
| 362 | // |
| 363 | // That's how we'll actually do this math. |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 364 | |
Mike Klein | 1cb5743 | 2019-09-25 12:14:28 -0500 | [diff] [blame] | 365 | __m128i L = _mm_unpacklo_epi8(_mm_unpacklo_epi32(bl, tl), _mm_setzero_si128()), |
| 366 | R = _mm_unpacklo_epi8(_mm_unpacklo_epi32(br, tr), _mm_setzero_si128()); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 367 | |
Mike Klein | 1cb5743 | 2019-09-25 12:14:28 -0500 | [diff] [blame] | 368 | __m128i inner = _mm_add_epi16(_mm_slli_epi16(L, 4), |
| 369 | _mm_mullo_epi16(_mm_sub_epi16(R,L), _mm_set1_epi16(wx))); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 370 | |
Mike Klein | 1cb5743 | 2019-09-25 12:14:28 -0500 | [diff] [blame] | 371 | __m128i sum_in_x = _mm_mullo_epi16(inner, allY); |
| 372 | |
| 373 | // sum = horizontalSum( ... ) |
Mike Klein | 0ec56fc | 2019-09-25 11:38:46 -0500 | [diff] [blame] | 374 | __m128i sum = _mm_add_epi16(sum_in_x, _mm_srli_si128(sum_in_x, 8)); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 375 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 376 | // Get back to [0,255] by dividing by maximum weight 16x16 = 256. |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 377 | sum = _mm_srli_epi16(sum, 8); |
| 378 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 379 | if (s.fAlphaScale < 256) { |
| 380 | // Scale by alpha, which is in [0,256]. |
| 381 | sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale)); |
| 382 | sum = _mm_srli_epi16(sum, 8); |
| 383 | } |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 384 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 385 | // Pack back into 8-bit values and store. |
| 386 | *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128())); |
| 387 | } |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 388 | } |
| 389 | |
| 390 | #else |
| 391 | |
| 392 | // The NEON code only actually differs from the portable code in the |
| 393 | // filtering step after we've loaded all four pixels we want to bilerp. |
| 394 | |
| 395 | #if defined(SK_ARM_HAS_NEON) |
| 396 | static void filter_and_scale_by_alpha(unsigned x, unsigned y, |
| 397 | SkPMColor a00, SkPMColor a01, |
| 398 | SkPMColor a10, SkPMColor a11, |
| 399 | SkPMColor *dst, |
| 400 | uint16_t scale) { |
| 401 | uint8x8_t vy, vconst16_8, v16_y, vres; |
| 402 | uint16x4_t vx, vconst16_16, v16_x, tmp, vscale; |
| 403 | uint32x2_t va0, va1; |
| 404 | uint16x8_t tmp1, tmp2; |
| 405 | |
| 406 | vy = vdup_n_u8(y); // duplicate y into vy |
| 407 | vconst16_8 = vmov_n_u8(16); // set up constant in vconst16_8 |
| 408 | v16_y = vsub_u8(vconst16_8, vy); // v16_y = 16-y |
| 409 | |
| 410 | va0 = vdup_n_u32(a00); // duplicate a00 |
| 411 | va1 = vdup_n_u32(a10); // duplicate a10 |
| 412 | va0 = vset_lane_u32(a01, va0, 1); // set top to a01 |
| 413 | va1 = vset_lane_u32(a11, va1, 1); // set top to a11 |
| 414 | |
| 415 | tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y) |
| 416 | tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy); // tmp2 = [a11|a10] * y |
| 417 | |
| 418 | vx = vdup_n_u16(x); // duplicate x into vx |
| 419 | vconst16_16 = vmov_n_u16(16); // set up constant in vconst16_16 |
| 420 | v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x |
| 421 | |
| 422 | tmp = vmul_u16(vget_high_u16(tmp1), vx); // tmp = a01 * x |
| 423 | tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx); // tmp += a11 * x |
| 424 | tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x) |
| 425 | tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x) |
| 426 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 427 | if (scale < 256) { |
| 428 | vscale = vdup_n_u16(scale); // duplicate scale |
| 429 | tmp = vshr_n_u16(tmp, 8); // shift down result by 8 |
| 430 | tmp = vmul_u16(tmp, vscale); // multiply result by scale |
| 431 | } |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 432 | |
Jian Cai | 9e0afb7 | 2019-12-18 14:42:49 -0800 | [diff] [blame] | 433 | vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16((uint64_t)0)), 8); // shift down result by 8 |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 434 | vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result |
| 435 | } |
| 436 | #else |
| 437 | static void filter_and_scale_by_alpha(unsigned x, unsigned y, |
| 438 | SkPMColor a00, SkPMColor a01, |
| 439 | SkPMColor a10, SkPMColor a11, |
| 440 | SkPMColor* dstColor, |
| 441 | unsigned alphaScale) { |
| 442 | SkASSERT((unsigned)x <= 0xF); |
| 443 | SkASSERT((unsigned)y <= 0xF); |
| 444 | SkASSERT(alphaScale <= 256); |
| 445 | |
| 446 | int xy = x * y; |
| 447 | const uint32_t mask = 0xFF00FF; |
| 448 | |
| 449 | int scale = 256 - 16*y - 16*x + xy; |
| 450 | uint32_t lo = (a00 & mask) * scale; |
| 451 | uint32_t hi = ((a00 >> 8) & mask) * scale; |
| 452 | |
| 453 | scale = 16*x - xy; |
| 454 | lo += (a01 & mask) * scale; |
| 455 | hi += ((a01 >> 8) & mask) * scale; |
| 456 | |
| 457 | scale = 16*y - xy; |
| 458 | lo += (a10 & mask) * scale; |
| 459 | hi += ((a10 >> 8) & mask) * scale; |
| 460 | |
| 461 | lo += (a11 & mask) * xy; |
| 462 | hi += ((a11 >> 8) & mask) * xy; |
| 463 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 464 | if (alphaScale < 256) { |
| 465 | lo = ((lo >> 8) & mask) * alphaScale; |
| 466 | hi = ((hi >> 8) & mask) * alphaScale; |
| 467 | } |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 468 | |
| 469 | *dstColor = ((lo >> 8) & mask) | (hi & ~mask); |
| 470 | } |
| 471 | #endif |
| 472 | |
| 473 | |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 474 | /*not static*/ inline |
| 475 | void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, |
| 476 | const uint32_t* xy, int count, SkPMColor* colors) { |
| 477 | SkASSERT(count > 0 && colors != nullptr); |
Mike Reed | bbaad02 | 2020-11-24 16:04:05 -0500 | [diff] [blame] | 478 | SkASSERT(s.fBilerp); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 479 | SkASSERT(4 == s.fPixmap.info().bytesPerPixel()); |
| 480 | SkASSERT(s.fAlphaScale <= 256); |
| 481 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 482 | int y0, y1, wy; |
| 483 | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 484 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 485 | auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ), |
| 486 | row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() ); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 487 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 488 | while (count --> 0) { |
| 489 | int x0, x1, wx; |
| 490 | decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 491 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 492 | filter_and_scale_by_alpha(wx, wy, |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 493 | row0[x0], row0[x1], |
| 494 | row1[x0], row1[x1], |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 495 | colors++, |
| 496 | s.fAlphaScale); |
| 497 | } |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 498 | } |
| 499 | |
| 500 | #endif |
| 501 | |
Mike Klein | 37bc8f9 | 2019-10-21 13:10:07 -0500 | [diff] [blame] | 502 | #if defined(SK_ARM_HAS_NEON) |
| 503 | /*not static*/ inline |
| 504 | void S32_alpha_D32_filter_DXDY(const SkBitmapProcState& s, |
| 505 | const uint32_t* xy, int count, SkPMColor* colors) { |
| 506 | SkASSERT(count > 0 && colors != nullptr); |
Mike Reed | bbaad02 | 2020-11-24 16:04:05 -0500 | [diff] [blame] | 507 | SkASSERT(s.fBilerp); |
Mike Klein | 37bc8f9 | 2019-10-21 13:10:07 -0500 | [diff] [blame] | 508 | SkASSERT(4 == s.fPixmap.info().bytesPerPixel()); |
| 509 | SkASSERT(s.fAlphaScale <= 256); |
| 510 | |
| 511 | auto src = (const char*)s.fPixmap.addr(); |
| 512 | size_t rb = s.fPixmap.rowBytes(); |
| 513 | |
| 514 | while (count --> 0) { |
| 515 | int y0, y1, wy, |
| 516 | x0, x1, wx; |
| 517 | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
| 518 | decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx); |
| 519 | |
| 520 | auto row0 = (const uint32_t*)(src + y0*rb), |
| 521 | row1 = (const uint32_t*)(src + y1*rb); |
| 522 | |
| 523 | filter_and_scale_by_alpha(wx, wy, |
| 524 | row0[x0], row0[x1], |
| 525 | row1[x0], row1[x1], |
| 526 | colors++, |
| 527 | s.fAlphaScale); |
| 528 | } |
| 529 | } |
| 530 | #else |
| 531 | // It's not yet clear whether it's worthwhile specializing for SSE2/SSSE3/AVX2. |
| 532 | constexpr static void (*S32_alpha_D32_filter_DXDY)(const SkBitmapProcState&, |
| 533 | const uint32_t*, int, SkPMColor*) = nullptr; |
| 534 | #endif |
| 535 | |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 536 | } // namespace SK_OPTS_NS |
| 537 | |
| 538 | #endif |