/*
 * Copyright 2018 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
| 7 | |
| 8 | #ifndef SkBitmapProcState_opts_DEFINED |
| 9 | #define SkBitmapProcState_opts_DEFINED |
| 10 | |
| 11 | #include "SkBitmapProcState.h" |
| 12 | |
| 13 | // SkBitmapProcState optimized Shader, Sample, or Matrix procs. |
| 14 | // |
| 15 | // Only S32_alpha_D32_filter_DX exploits instructions beyond |
| 16 | // our common baseline SSE2/NEON instruction sets, so that's |
| 17 | // all that lives here. |
| 18 | // |
| 19 | // The rest are scattershot at the moment but I want to get them |
| 20 | // all migrated to be normal code inside SkBitmapProcState.cpp. |
| 21 | |
| 22 | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| 23 | #include <immintrin.h> |
| 24 | #elif defined(SK_ARM_HAS_NEON) |
| 25 | #include <arm_neon.h> |
| 26 | #endif |
| 27 | |
| 28 | namespace SK_OPTS_NS { |
| 29 | |
| 30 | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
// This same basic packing scheme is used throughout the file.
static void decode_packed_coordinates_and_weight(uint32_t packed, int* v0, int* v1, int* w) {
    // A packed entry lays out as [ v0:14 | w:4 | v1:14 ], where w is the
    // filter weight applied to v1 (and 16-w is the weight applied to v0).
    *v1 = packed & 0x3fff;         // low 14 bits: the second integer coordinate (x1 or y1)
    *w  = (packed >> 14) & 0xf;    // middle 4 bits: the interpolation weight for v1
    *v0 = packed >> 18;            // top 14 bits: the first integer coordinate (x0 or y0)
}
| 42 | |
// As above, but decoding four packed entries at once.
static void decode_packed_coordinates_and_weight(__m128i packed,
                                                 int v0[4], int v1[4], __m128i* w) {
    const __m128i coord_mask  = _mm_set1_epi32(0x3fff),
                  weight_mask = _mm_set1_epi32(0xf);
    *w = _mm_and_si128(_mm_srli_epi32(packed, 14), weight_mask);   // four 4-bit weights
    _mm_storeu_si128((__m128i*)v1, _mm_and_si128 (packed, coord_mask));  // four low coords
    _mm_storeu_si128((__m128i*)v0, _mm_srli_epi32(packed, 18));          // four high coords
}
| 50 | |
// This is the crux of the SSSE3 implementation,
// interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16().
//
// Returns 8 16-bit lanes: pixel A's four channels in the low half, pixel B's in the
// high half, each channel scaled up by its x-weight (weights sum to 16 per channel,
// so lane values stay within [0, 255*16] — see the callers, which build wx0 = 16-wx1).
static inline __m128i interpolate_in_x(uint32_t A0, uint32_t A1,
                                       uint32_t B0, uint32_t B1,
                                       const __m128i& interlaced_x_weights) {
    // _mm_maddubs_epi16() is a little idiosyncratic, but very helpful as the core of a lerp.
    //
    // It takes two arguments interlaced byte-wise:
    //    - first  arg: [ x,y, ... 7 more pairs of 8-bit values ...]
    //    - second arg: [ z,w, ... 7 more pairs of 8-bit values ...]
    // and returns 8 16-bit values: [ x*z + y*w, ... 7 more 16-bit values ... ].
    //
    // That's why we go to all this trouble to make interlaced_x_weights,
    // and here we're interlacing A0 with A1, B0 with B1 to match.

    // Byte-interlace the two source pixels of each pair: [ A0b0,A1b0, A0b1,A1b1, ... ].
    __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),
            interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));

    // Put A's interlaced bytes in the low 64 bits, B's in the high 64 bits,
    // then multiply-accumulate each byte pair against its weight pair.
    return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),
                             interlaced_x_weights);
}
| 72 | |
// Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
// Each quadruple is a 2x2 grid: {0,1} come from the top source row, {2,3} from the
// bottom row (see the caller, which reads row0[...] then row1[...]).
// Returns two pixels, with each channel in a 16-bit lane of the __m128i,
// already divided back down by the full 16*16 weight, i.e. in [0,255].
static inline __m128i interpolate_in_x_and_y(uint32_t A0, uint32_t A1,
                                             uint32_t A2, uint32_t A3,
                                             uint32_t B0, uint32_t B1,
                                             uint32_t B2, uint32_t B3,
                                             const __m128i& interlaced_x_weights,
                                             int wy) {
    // The stored Y weight wy is for y1, and y0 gets a weight 16-wy.
    const __m128i wy1 = _mm_set1_epi16(wy),
                  wy0 = _mm_sub_epi16(_mm_set1_epi16(16), wy1);

    // First interpolate in X,
    // leaving the values in 16-bit lanes scaled up by those [0,16] interlaced_x_weights.
    __m128i row0 = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
            row1 = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);

    // Interpolate in Y across the two rows,
    // then scale everything down by the maximum total weight 16x16 = 256.
    return _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(row0, wy0),
                                        _mm_mullo_epi16(row1, wy1)), 8);
}
| 95 | |
// Bilinear-filtered sampling for the _DX_ case: the matrix varies only in X along a
// scanline, so xy[0] packs the constant pair of source rows plus the Y weight, and
// every subsequent xy[] entry packs (x0, x-weight, x1) for one output pixel.
// Writes `count` 8888 pixels to `colors`, each also scaled by s.fAlphaScale.
/*not static*/ inline
void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                             const uint32_t* xy, int count, uint32_t* colors) {
    SkASSERT(count > 0 && colors != nullptr);
    SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
    SkASSERT(kN32_SkColorType == s.fPixmap.colorType());

    int alpha = s.fAlphaScale;

    // Return (px * s.fAlphaScale) / 256.  (s.fAlphaScale is in [0,256].)
    // When alpha == 256 this is the identity, so skip the multiply entirely.
    auto scale_by_alpha = [alpha](const __m128i& px) {
        return alpha == 256 ? px
                            : _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(alpha)), 8);
    };

    // We're in _DX_ mode here, so we're only varying in X.
    // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
    // All the other entries in xy will be pairs of X coordinates and the X weight.
    int y0, y1, wy;
    decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

    // The two source rows we'll bilerp between, constant for the whole scanline.
    auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
         row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());

    while (count >= 4) {
        // We can really get going, loading 4 X pairs at a time to produce 4 output pixels.
        const __m128i xx = _mm_loadu_si128((const __m128i*)xy);

        int x0[4],
            x1[4];
        __m128i wx;
        decode_packed_coordinates_and_weight(xx, x0, x1, &wx);

        // Splat out each x weight wx four times (one for each pixel channel) as wx1,
        // and sixteen minus that as the weight for x0, wx0.
        __m128i wx1 = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
                wx0 = _mm_sub_epi8(_mm_set1_epi8(16), wx1);

        // We need to interlace wx0 and wx1 for _mm_maddubs_epi16().
        __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wx0,wx1),
                interlaced_x_weights_CD = _mm_unpackhi_epi8(wx0,wx1);

        // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time
        // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each.
        __m128i AB = interpolate_in_x_and_y(row0[x0[0]], row0[x1[0]],
                                            row1[x0[0]], row1[x1[0]],
                                            row0[x0[1]], row0[x1[1]],
                                            row1[x0[1]], row1[x1[1]],
                                            interlaced_x_weights_AB, wy);

        // Once more with the other half of the x-weights for two more pixels C,D.
        __m128i CD = interpolate_in_x_and_y(row0[x0[2]], row0[x1[2]],
                                            row1[x0[2]], row1[x1[2]],
                                            row0[x0[3]], row0[x1[3]],
                                            row1[x0[3]], row1[x1[3]],
                                            interlaced_x_weights_CD, wy);

        // Scale by alpha, pack back together to 8-bit lanes, and write out four pixels!
        _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(scale_by_alpha(AB),
                                                            scale_by_alpha(CD)));
        xy += 4;
        colors += 4;
        count -= 4;
    }

    while (count --> 0) {
        // This is exactly the same flow as the count >= 4 loop above, but writing one pixel.
        int x0, x1, wx;
        decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

        // As above, splat out wx four times as wx1, and sixteen minus that as wx0.
        __m128i wx1 = _mm_set1_epi8(wx),   // This splats it out 16 times, but that's fine.
                wx0 = _mm_sub_epi8(_mm_set1_epi8(16), wx1);

        __m128i interlaced_x_weights_A = _mm_unpacklo_epi8(wx0, wx1);

        // Feed zeroes for the B half; only pixel A's lanes of the result are used.
        __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
                                           row1[x0], row1[x1],
                                           0, 0,
                                           0, 0,
                                           interlaced_x_weights_A, wy);

        // Pack A's 16-bit lanes down to bytes and store just that one 32-bit pixel.
        *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(scale_by_alpha(A), _mm_setzero_si128()));
    }
}
| 181 | |
| 182 | |
| 183 | #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| 184 | |
// TODO(mtklein): clean up this code, use decode_packed_coordinates_and_weight(), etc.

// SSE2-only bilinear-filtered sampler for the _DX_ case.
// xy[0] packs the two source rows and the Y weight ( y0:14 | wy:4 | y1:14 );
// each later entry packs ( x0:14 | wx:4 | x1:14 ) for one output pixel.
// Lane diagrams below are written high-to-low, matching the original comments.
/*not static*/ inline
void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                             const uint32_t* xy, int count, uint32_t* colors) {
    SkASSERT(count > 0 && colors != nullptr);
    SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
    SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
    SkASSERT(s.fAlphaScale <= 256);

    const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());
    size_t rb = s.fPixmap.rowBytes();
    uint32_t XY = *xy++;                    // y0:14 | wy:4 | y1:14
    unsigned y0 = XY >> 14;                 // still carries the 4 weight bits in its low end
    const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
    const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
    unsigned subY = y0 & 0xF;               // the Y filter weight, in [0,15]

    // ( 0, 0, 0, 0, 0, 0, 0, 16)
    __m128i sixteen = _mm_cvtsi32_si128(16);

    // ( 0, 0, 0, 0, 16, 16, 16, 16)
    sixteen = _mm_shufflelo_epi16(sixteen, 0);

    // ( 0, 0, 0, 0, 0, 0, 0, y)
    __m128i allY = _mm_cvtsi32_si128(subY);

    // ( 0, 0, 0, 0, y, y, y, y)
    allY = _mm_shufflelo_epi16(allY, 0);

    // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)
    __m128i negY = _mm_sub_epi16(sixteen, allY);

    // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
    allY = _mm_unpacklo_epi64(allY, negY);

    // (16, 16, 16, 16, 16, 16, 16, 16 )
    sixteen = _mm_shuffle_epi32(sixteen, 0);

    // ( 0, 0, 0, 0, 0, 0, 0, 0)
    __m128i zero = _mm_setzero_si128();

    // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
    __m128i alpha = _mm_set1_epi16(s.fAlphaScale);

    do {
        uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
        unsigned x0 = XX >> 18;
        unsigned x1 = XX & 0x3FFF;

        // (0, 0, 0, 0, 0, 0, 0, x)
        __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);

        // (0, 0, 0, 0, x, x, x, x)
        allX = _mm_shufflelo_epi16(allX, 0);

        // (x, x, x, x, x, x, x, x)
        allX = _mm_shuffle_epi32(allX, 0);

        // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
        __m128i negX = _mm_sub_epi16(sixteen, allX);

        // Load 4 samples (pixels): the 2x2 grid around the sample point.
        __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
        __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
        __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
        __m128i a11 = _mm_cvtsi32_si128(row1[x1]);

        // (0, 0, a00, a10)
        __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);

        // Expand to 16 bits per component.
        a00a10 = _mm_unpacklo_epi8(a00a10, zero);

        // ((a00 * (16-y)), (a10 * y)).
        a00a10 = _mm_mullo_epi16(a00a10, allY);

        // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
        a00a10 = _mm_mullo_epi16(a00a10, negX);

        // (0, 0, a01, a11)
        __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);

        // Expand to 16 bits per component.
        a01a11 = _mm_unpacklo_epi8(a01a11, zero);

        // (a01 * (16-y)), (a11 * y)
        a01a11 = _mm_mullo_epi16(a01a11, allY);

        // (a01 * (16-y) * x), (a11 * y * x)
        a01a11 = _mm_mullo_epi16(a01a11, allX);

        // (a00*w00 + a01*w01, a10*w10 + a11*w11)
        __m128i sum = _mm_add_epi16(a00a10, a01a11);

        // (DC, a00*w00 + a01*w01)  -- move the high 64 bits down next to the low.
        __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);

        // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
        sum = _mm_add_epi16(sum, shifted);

        // Divide each 16 bit component by 256 (the total bilinear weight 16*16).
        sum = _mm_srli_epi16(sum, 8);

        // Multiply by alpha.
        sum = _mm_mullo_epi16(sum, alpha);

        // Divide each 16 bit component by 256.
        sum = _mm_srli_epi16(sum, 8);

        // Pack lower 4 16 bit values of sum into lower 4 bytes.
        sum = _mm_packus_epi16(sum, zero);

        // Extract low int and store.
        *colors++ = _mm_cvtsi128_si32(sum);
    } while (--count > 0);
}
| 302 | |
| 303 | #else |
| 304 | |
| 305 | // The NEON code only actually differs from the portable code in the |
| 306 | // filtering step after we've loaded all four pixels we want to bilerp. |
| 307 | |
| 308 | #if defined(SK_ARM_HAS_NEON) |
// NEON bilerp of one pixel from its 2x2 neighborhood {a00,a01,a10,a11}, then a
// multiply by `scale` (in [0,256] per the portable version's contract) before
// narrowing back to 8888 and storing one pixel at *dst.
static void filter_and_scale_by_alpha(unsigned x, unsigned y,
                                      SkPMColor a00, SkPMColor a01,
                                      SkPMColor a10, SkPMColor a11,
                                      SkPMColor *dst,
                                      uint16_t scale) {
    uint8x8_t vy, vconst16_8, v16_y, vres;
    uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
    uint32x2_t va0, va1;
    uint16x8_t tmp1, tmp2;

    vy = vdup_n_u8(y);                // duplicate y into vy
    vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
    v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y

    va0 = vdup_n_u32(a00);            // duplicate a00
    va1 = vdup_n_u32(a10);            // duplicate a10
    va0 = vset_lane_u32(a01, va0, 1); // set top to a01
    va1 = vset_lane_u32(a11, va1, 1); // set top to a11

    tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
    tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y

    vx = vdup_n_u16(x);               // duplicate x into vx
    vconst16_16 = vmov_n_u16(16);     // set up constant in vconst16_16
    v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x

    // Weighted sum of the four corners; every row term already carries its y factor.
    tmp = vmul_u16(vget_high_u16(tmp1), vx);          // tmp  = (a01 * (16-y)) * x
    tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);     // tmp += (a11 * y)      * x
    tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x);   // tmp += (a00 * (16-y)) * (16-x)
    tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x);   // tmp += (a10 * y)      * (16-x)

    vscale = vdup_n_u16(scale);       // duplicate scale
    tmp = vshr_n_u16(tmp, 8);         // divide by 256 (the total bilinear weight 16*16)
    tmp = vmul_u16(tmp, vscale);      // multiply result by scale

    vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // divide by 256 and narrow to 8-bit
    vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result
}
| 347 | #else |
| 348 | static void filter_and_scale_by_alpha(unsigned x, unsigned y, |
| 349 | SkPMColor a00, SkPMColor a01, |
| 350 | SkPMColor a10, SkPMColor a11, |
| 351 | SkPMColor* dstColor, |
| 352 | unsigned alphaScale) { |
| 353 | SkASSERT((unsigned)x <= 0xF); |
| 354 | SkASSERT((unsigned)y <= 0xF); |
| 355 | SkASSERT(alphaScale <= 256); |
| 356 | |
| 357 | int xy = x * y; |
| 358 | const uint32_t mask = 0xFF00FF; |
| 359 | |
| 360 | int scale = 256 - 16*y - 16*x + xy; |
| 361 | uint32_t lo = (a00 & mask) * scale; |
| 362 | uint32_t hi = ((a00 >> 8) & mask) * scale; |
| 363 | |
| 364 | scale = 16*x - xy; |
| 365 | lo += (a01 & mask) * scale; |
| 366 | hi += ((a01 >> 8) & mask) * scale; |
| 367 | |
| 368 | scale = 16*y - xy; |
| 369 | lo += (a10 & mask) * scale; |
| 370 | hi += ((a10 >> 8) & mask) * scale; |
| 371 | |
| 372 | lo += (a11 & mask) * xy; |
| 373 | hi += ((a11 >> 8) & mask) * xy; |
| 374 | |
| 375 | // TODO: if (alphaScale < 256) ... |
| 376 | lo = ((lo >> 8) & mask) * alphaScale; |
| 377 | hi = ((hi >> 8) & mask) * alphaScale; |
| 378 | |
| 379 | *dstColor = ((lo >> 8) & mask) | (hi & ~mask); |
| 380 | } |
| 381 | #endif |
| 382 | |
| 383 | |
| 384 | // TODO(mtklein): clean up this code, use decode_packed_coordinates_and_weight(), etc. |
| 385 | |
| 386 | /*not static*/ inline |
| 387 | void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, |
| 388 | const uint32_t* xy, int count, SkPMColor* colors) { |
| 389 | SkASSERT(count > 0 && colors != nullptr); |
| 390 | SkASSERT(s.fFilterQuality != kNone_SkFilterQuality); |
| 391 | SkASSERT(4 == s.fPixmap.info().bytesPerPixel()); |
| 392 | SkASSERT(s.fAlphaScale <= 256); |
| 393 | |
| 394 | unsigned alphaScale = s.fAlphaScale; |
| 395 | |
| 396 | const char* srcAddr = (const char*)s.fPixmap.addr(); |
| 397 | size_t rb = s.fPixmap.rowBytes(); |
| 398 | unsigned subY; |
| 399 | const SkPMColor* row0; |
| 400 | const SkPMColor* row1; |
| 401 | |
| 402 | // setup row ptrs and update proc_table |
| 403 | { |
| 404 | uint32_t XY = *xy++; |
| 405 | unsigned y0 = XY >> 14; |
| 406 | row0 = (const SkPMColor*)(srcAddr + (y0 >> 4) * rb); |
| 407 | row1 = (const SkPMColor*)(srcAddr + (XY & 0x3FFF) * rb); |
| 408 | subY = y0 & 0xF; |
| 409 | } |
| 410 | |
| 411 | do { |
| 412 | uint32_t XX = *xy++; // x0:14 | 4 | x1:14 |
| 413 | unsigned x0 = XX >> 14; |
| 414 | unsigned x1 = XX & 0x3FFF; |
| 415 | unsigned subX = x0 & 0xF; |
| 416 | x0 >>= 4; |
| 417 | |
| 418 | filter_and_scale_by_alpha(subX, subY, |
| 419 | row0[x0], row0[x1], |
| 420 | row1[x0], row1[x1], |
| 421 | colors, |
| 422 | alphaScale); |
| 423 | colors += 1; |
| 424 | |
| 425 | } while (--count != 0); |
| 426 | } |
| 427 | |
| 428 | #endif |
| 429 | |
| 430 | } // namespace SK_OPTS_NS |
| 431 | |
| 432 | #endif |