Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 1 | /* |
| 2 | * Copyright 2018 Google Inc. |
| 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license that can be |
| 5 | * found in the LICENSE file. |
| 6 | */ |
| 7 | |
| 8 | #ifndef SkBitmapProcState_opts_DEFINED |
| 9 | #define SkBitmapProcState_opts_DEFINED |
| 10 | |
| 11 | #include "SkBitmapProcState.h" |
| 12 | |
| 13 | // SkBitmapProcState optimized Shader, Sample, or Matrix procs. |
| 14 | // |
| 15 | // Only S32_alpha_D32_filter_DX exploits instructions beyond |
| 16 | // our common baseline SSE2/NEON instruction sets, so that's |
| 17 | // all that lives here. |
| 18 | // |
| 19 | // The rest are scattershot at the moment but I want to get them |
| 20 | // all migrated to be normal code inside SkBitmapProcState.cpp. |
| 21 | |
| 22 | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| 23 | #include <immintrin.h> |
| 24 | #elif defined(SK_ARM_HAS_NEON) |
| 25 | #include <arm_neon.h> |
| 26 | #endif |
| 27 | |
| 28 | namespace SK_OPTS_NS { |
| 29 | |
// This same basic packing scheme is used throughout the file.
// Each packed entry is laid out high-to-low as [ v0:14 | w:4 | v1:14 ]:
// two integer coordinates and a 4-bit interpolation weight applied to v1.
static void decode_packed_coordinates_and_weight(uint32_t packed, int* v0, int* v1, int* w) {
    const uint32_t kCoordMask  = (1u << 14) - 1;   // 14-bit coordinate field
    const uint32_t kWeightMask = (1u <<  4) - 1;   //  4-bit weight field

    *v1 = (int)( packed        & kCoordMask );     // bottom 14 bits: x1 or y1
    *w  = (int)((packed >> 14) & kWeightMask);     // middle 4 bits: weight for v1
    *v0 = (int)( packed >> 18               );     // top 14 bits: x0 or y0
}
| 41 | |
| 42 | #if 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 43 | |
// As above, but decoding four packed entries at once.
// The coordinates spill to memory (they're used as scalar array indices below);
// the four weights stay in a vector, one per 32-bit lane of *w.
static void decode_packed_coordinates_and_weight(__m128i packed,
                                                 int v0[4], int v1[4], __m128i* w) {
    _mm_storeu_si128((__m128i*)v0, _mm_srli_epi32(packed, 18));                      // top 14 bits
    _mm_storeu_si128((__m128i*)v1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff)));  // bottom 14 bits
    *w = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf));             // middle 4 bits
}
| 51 | |
// This is the crux of the SSSE3 implementation,
// interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16().
// Returns eight 16-bit lanes: pixel A's four channels in the low half, B's in the
// high half, each channel scaled up by its [0,16] x-weights (sum of the pair).
static inline __m128i interpolate_in_x(uint32_t A0, uint32_t A1,
                                       uint32_t B0, uint32_t B1,
                                       const __m128i& interlaced_x_weights) {
    // _mm_maddubs_epi16() is a little idiosyncratic, but very helpful as the core of a lerp.
    //
    // It takes two arguments interlaced byte-wise:
    //    - first  arg: [ x,y, ... 7 more pairs of 8-bit values ...]
    //    - second arg: [ z,w, ... 7 more pairs of 8-bit values ...]
    // and returns 8 16-bit values: [ x*z + y*w, ... 7 more 16-bit values ... ].
    //
    // That's why we go to all this trouble to make interlaced_x_weights,
    // and here we're interlacing A0 with A1, B0 with B1 to match.

    // Interlace the two pixels of each pair channel-by-channel: [A0.b,A1.b, A0.g,A1.g, ...].
    __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),
            interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));

    return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),
                             interlaced_x_weights);
}
| 73 | |
// Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
// Each quad is a 2x2 grid: {0,1} from the top row (y0), {2,3} from the bottom row (y1).
// Returns two pixels, with each channel in a 16-bit lane of the __m128i,
// already divided back down to [0,255] range.
static inline __m128i interpolate_in_x_and_y(uint32_t A0, uint32_t A1,
                                             uint32_t A2, uint32_t A3,
                                             uint32_t B0, uint32_t B1,
                                             uint32_t B2, uint32_t B3,
                                             const __m128i& interlaced_x_weights,
                                             int wy) {
    // The stored Y weight wy is for y1, and y0 gets a weight 16-wy.
    const __m128i wy1 = _mm_set1_epi16(wy),
                  wy0 = _mm_sub_epi16(_mm_set1_epi16(16), wy1);

    // First interpolate in X,
    // leaving the values in 16-bit lanes scaled up by those [0,16] interlaced_x_weights.
    __m128i row0 = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
            row1 = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);

    // Interpolate in Y across the two rows,
    // then scale everything down by the maximum total weight 16x16 = 256.
    return _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(row0, wy0),
                                        _mm_mullo_epi16(row1, wy1)), 8);
}
| 96 | |
// Bilinearly filter `count` N32 pixels along a row (DX mode: the Y pair/weight is
// constant for the whole span, taken from the first entry of xy), scale each result
// by s.fAlphaScale, and write them to colors.  SSSE3 path, four pixels per iteration.
/*not static*/ inline
void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                             const uint32_t* xy, int count, uint32_t* colors) {
    SkASSERT(count > 0 && colors != nullptr);
    SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
    SkASSERT(kN32_SkColorType == s.fPixmap.colorType());

    int alpha = s.fAlphaScale;

    // Return (px * s.fAlphaScale) / 256.   (s.fAlphaScale is in [0,256].)
    // When alpha == 256 this is an exact no-op, so skip the multiply entirely.
    auto scale_by_alpha = [alpha](const __m128i& px) {
        return alpha == 256 ? px
                            : _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(alpha)), 8);
    };

    // We're in _DX_ mode here, so we're only varying in X.
    // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
    // All the other entries in xy will be pairs of X coordinates and the X weight.
    int y0, y1, wy;
    decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

    auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
         row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());

    while (count >= 4) {
        // We can really get going, loading 4 X pairs at a time to produce 4 output pixels.
        const __m128i xx = _mm_loadu_si128((const __m128i*)xy);

        int x0[4],
            x1[4];
        __m128i wx;
        decode_packed_coordinates_and_weight(xx, x0, x1, &wx);

        // Splat out each x weight wx four times (one for each pixel channel) as wx1,
        // and sixteen minus that as the weight for x0, wx0.
        __m128i wx1 = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
                wx0 = _mm_sub_epi8(_mm_set1_epi8(16), wx1);

        // We need to interlace wx0 and wx1 for _mm_maddubs_epi16().
        __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wx0,wx1),
                interlaced_x_weights_CD = _mm_unpackhi_epi8(wx0,wx1);

        // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time
        // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each.
        __m128i AB = interpolate_in_x_and_y(row0[x0[0]], row0[x1[0]],
                                            row1[x0[0]], row1[x1[0]],
                                            row0[x0[1]], row0[x1[1]],
                                            row1[x0[1]], row1[x1[1]],
                                            interlaced_x_weights_AB, wy);

        // Once more with the other half of the x-weights for two more pixels C,D.
        __m128i CD = interpolate_in_x_and_y(row0[x0[2]], row0[x1[2]],
                                            row1[x0[2]], row1[x1[2]],
                                            row0[x0[3]], row0[x1[3]],
                                            row1[x0[3]], row1[x1[3]],
                                            interlaced_x_weights_CD, wy);

        // Scale by alpha, pack back together to 8-bit lanes, and write out four pixels!
        _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(scale_by_alpha(AB),
                                                            scale_by_alpha(CD)));
        xy     += 4;
        colors += 4;
        count  -= 4;
    }

    while (count --> 0) {
        // This is exactly the same flow as the count >= 4 loop above, but writing one pixel.
        int x0, x1, wx;
        decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

        // As above, splat out wx four times as wx1, and sixteen minus that as wx0.
        __m128i wx1 = _mm_set1_epi8(wx),    // This splats it out 16 times, but that's fine.
                wx0 = _mm_sub_epi8(_mm_set1_epi8(16), wx1);

        __m128i interlaced_x_weights_A = _mm_unpacklo_epi8(wx0, wx1);

        // The B pixel slots are fed zeros; only the low half (pixel A) is kept below.
        __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
                                           row1[x0], row1[x1],
                                           0, 0,
                                           0, 0,
                                           interlaced_x_weights_A, wy);

        *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(scale_by_alpha(A), _mm_setzero_si128()));
    }
}
| 182 | |
| 183 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 184 | #elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 185 | |
// TODO(mtklein): clean up this code.
// (It already shares decode_packed_coordinates_and_weight() with the SSSE3 path above.)

// Same contract as the SSSE3 version: bilinearly filter `count` N32 pixels in DX mode
// (constant Y pair/weight from the first xy entry), scale by s.fAlphaScale, store to
// colors.  SSE2 path, one pixel per iteration.
/*not static*/ inline
void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                             const uint32_t* xy, int count, uint32_t* colors) {
    SkASSERT(count > 0 && colors != nullptr);
    SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
    SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
    SkASSERT(s.fAlphaScale <= 256);

    int y0, y1, wy;
    decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

    auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
         row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );

    // We'll put one pixel in the low 4 16-bit lanes to line up with wy,
    // and another in the upper 4 16-bit lanes to line up with 16 - wy.
    const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16(   wy),
                                            _mm_set1_epi16(16-wy));

    while (count --> 0) {
        int x0, x1, wx;
        decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

        // Load the 4 pixels we're interpolating.
        const __m128i a00 = _mm_cvtsi32_si128(row0[x0]),
                      a01 = _mm_cvtsi32_si128(row0[x1]),
                      a10 = _mm_cvtsi32_si128(row1[x0]),
                      a11 = _mm_cvtsi32_si128(row1[x1]);

        // Line up low-x pixels a00 and a10 with allY:
        // a10 lands in the low lanes (weight wy), a00 in the high lanes (weight 16-wy).
        __m128i a00a10 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(a10, a00),
                                           _mm_setzero_si128());

        // Scale by allY and 16-wx.
        a00a10 = _mm_mullo_epi16(a00a10, allY);
        a00a10 = _mm_mullo_epi16(a00a10, _mm_set1_epi16(16-wx));


        // Line up high-x pixels a01 and a11 with allY (same low/high layout as above).
        __m128i a01a11 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(a11, a01),
                                           _mm_setzero_si128());

        // Scale by allY and wx.
        a01a11 = _mm_mullo_epi16(a01a11, allY);
        a01a11 = _mm_mullo_epi16(a01a11, _mm_set1_epi16(wx));


        // Add the two intermediates, summing across in one direction.
        __m128i halves = _mm_add_epi16(a00a10, a01a11);

        // Add the two halves to each other to sum in the other direction.
        __m128i sum = _mm_add_epi16(halves, _mm_srli_si128(halves, 8));

        // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
        sum = _mm_srli_epi16(sum, 8);

        if (s.fAlphaScale < 256) {
            // Scale by alpha, which is in [0,256].
            sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale));
            sum = _mm_srli_epi16(sum, 8);
        }

        // Pack back into 8-bit values and store.
        *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128()));
    }
}
| 254 | |
| 255 | #else |
| 256 | |
| 257 | // The NEON code only actually differs from the portable code in the |
| 258 | // filtering step after we've loaded all four pixels we want to bilerp. |
| 259 | |
| 260 | #if defined(SK_ARM_HAS_NEON) |
// Bilinearly blend four pixels with 4-bit weights: x weights the a01/a11 column,
// y weights the a10/a11 row (each corner gets weight totaling 16x16 = 256), then
// scale by alpha `scale` in [0,256] and store one premul pixel to *dst.
static void filter_and_scale_by_alpha(unsigned x, unsigned y,
                                      SkPMColor a00, SkPMColor a01,
                                      SkPMColor a10, SkPMColor a11,
                                      SkPMColor *dst,
                                      uint16_t scale) {
    uint8x8_t vy, vconst16_8, v16_y, vres;
    uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
    uint32x2_t va0, va1;
    uint16x8_t tmp1, tmp2;

    vy = vdup_n_u8(y);                // duplicate y into vy
    vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
    v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y

    va0 = vdup_n_u32(a00);            // duplicate a00
    va1 = vdup_n_u32(a10);            // duplicate a10
    va0 = vset_lane_u32(a01, va0, 1); // set top to a01
    va1 = vset_lane_u32(a11, va1, 1); // set top to a11

    // Widening multiplies interpolate in Y across both pixels of each row pair at once.
    tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
    tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y

    vx = vdup_n_u16(x);                // duplicate x into vx
    vconst16_16 = vmov_n_u16(16);      // set up constant in vconst16_16
    v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x

    // Now interpolate in X: accumulate the x1 column with weight x
    // and the x0 column with weight 16-x.
    tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = a01 * x
    tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11 * x
    tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)
    tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)

    if (scale < 256) {
        // Scale by alpha: divide by 256 first to keep headroom for the multiply.
        vscale = vdup_n_u16(scale);          // duplicate scale
        tmp = vshr_n_u16(tmp, 8);            // shift down result by 8
        tmp = vmul_u16(tmp, vscale);         // multiply result by scale
    }

    // Divide by the remaining weight/scale factor of 256 while narrowing to 8 bits.
    vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down result by 8
    vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);         // store result
}
| 301 | #else |
| 302 | static void filter_and_scale_by_alpha(unsigned x, unsigned y, |
| 303 | SkPMColor a00, SkPMColor a01, |
| 304 | SkPMColor a10, SkPMColor a11, |
| 305 | SkPMColor* dstColor, |
| 306 | unsigned alphaScale) { |
| 307 | SkASSERT((unsigned)x <= 0xF); |
| 308 | SkASSERT((unsigned)y <= 0xF); |
| 309 | SkASSERT(alphaScale <= 256); |
| 310 | |
| 311 | int xy = x * y; |
| 312 | const uint32_t mask = 0xFF00FF; |
| 313 | |
| 314 | int scale = 256 - 16*y - 16*x + xy; |
| 315 | uint32_t lo = (a00 & mask) * scale; |
| 316 | uint32_t hi = ((a00 >> 8) & mask) * scale; |
| 317 | |
| 318 | scale = 16*x - xy; |
| 319 | lo += (a01 & mask) * scale; |
| 320 | hi += ((a01 >> 8) & mask) * scale; |
| 321 | |
| 322 | scale = 16*y - xy; |
| 323 | lo += (a10 & mask) * scale; |
| 324 | hi += ((a10 >> 8) & mask) * scale; |
| 325 | |
| 326 | lo += (a11 & mask) * xy; |
| 327 | hi += ((a11 >> 8) & mask) * xy; |
| 328 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 329 | if (alphaScale < 256) { |
| 330 | lo = ((lo >> 8) & mask) * alphaScale; |
| 331 | hi = ((hi >> 8) & mask) * alphaScale; |
| 332 | } |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 333 | |
| 334 | *dstColor = ((lo >> 8) & mask) | (hi & ~mask); |
| 335 | } |
| 336 | #endif |
| 337 | |
| 338 | |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 339 | /*not static*/ inline |
| 340 | void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, |
| 341 | const uint32_t* xy, int count, SkPMColor* colors) { |
| 342 | SkASSERT(count > 0 && colors != nullptr); |
| 343 | SkASSERT(s.fFilterQuality != kNone_SkFilterQuality); |
| 344 | SkASSERT(4 == s.fPixmap.info().bytesPerPixel()); |
| 345 | SkASSERT(s.fAlphaScale <= 256); |
| 346 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 347 | int y0, y1, wy; |
| 348 | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 349 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 350 | auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ), |
| 351 | row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() ); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 352 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 353 | while (count --> 0) { |
| 354 | int x0, x1, wx; |
| 355 | decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 356 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 357 | filter_and_scale_by_alpha(wx, wy, |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 358 | row0[x0], row0[x1], |
| 359 | row1[x0], row1[x1], |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 360 | colors++, |
| 361 | s.fAlphaScale); |
| 362 | } |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 363 | } |
| 364 | |
| 365 | #endif |
| 366 | |
| 367 | } // namespace SK_OPTS_NS |
| 368 | |
| 369 | #endif |