mtklein | 4a37d08 | 2015-09-10 10:38:02 -0700 | [diff] [blame] | 1 | /* |
| 2 | * Copyright 2015 Google Inc. |
| 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license that can be |
| 5 | * found in the LICENSE file. |
| 6 | */ |
| 7 | |
| 8 | #ifndef SkBlitRow_opts_DEFINED |
| 9 | #define SkBlitRow_opts_DEFINED |
| 10 | |
Mike Klein | c0bd9f9 | 2019-04-23 12:05:21 -0500 | [diff] [blame] | 11 | #include "include/private/SkColorData.h" |
| 12 | #include "include/private/SkVx.h" |
| 13 | #include "src/core/SkMSAN.h" |
Mike Klein | 5d3314c | 2020-08-27 13:54:17 -0500 | [diff] [blame] | 14 | |
| 15 | // Helpers for blit_row_s32a_opaque(), |
| 16 | // then blit_row_s32a_opaque() itself, |
| 17 | // then unrelated blit_row_color32() at the bottom. |
| 18 | // |
| 19 | // To keep Skia resistant to timing attacks, it's important not to branch on pixel data. |
| 20 | // In particular, don't be tempted to [v]ptest, pmovmskb, etc. to branch on the source alpha. |
| 21 | |
Lingyun Cai | 56f23a1 | 2020-04-26 14:44:32 +0800 | [diff] [blame] | 22 | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX |
| 23 | #include <immintrin.h> |
| 24 | |
// Srcover-blends sixteen packed 32-bit pixels at once.  Every channel becomes
//     s + ((d * (256 - srcA)) >> 8)
// SkPMSrcOver_AVX2 carries the detailed explanation of this math.
static inline __m512i SkPMSrcOver_SKX(const __m512i& src, const __m512i& dst) {
    // Shuffle control that copies each pixel's alpha byte into the low byte of both 16-bit
    // halves of that pixel.  A control byte of Z fills a literal 0 byte.
    const uint8_t Z = -1;
    const uint8_t alphaToHalves[64] = {  3,Z, 3,Z,   7,Z, 7,Z,  11,Z,11,Z,  15,Z,15,Z,
                                        19,Z,19,Z,  23,Z,23,Z,  27,Z,27,Z,  31,Z,31,Z,
                                        35,Z,35,Z,  39,Z,39,Z,  43,Z,43,Z,  47,Z,47,Z,
                                        51,Z,51,Z,  55,Z,55,Z,  59,Z,59,Z,  63,Z,63,Z };
    __m512i alphaHalves = _mm512_shuffle_epi8(src, _mm512_loadu_si512(alphaToHalves));

    // 256 - srcA in every 16-bit lane.
    __m512i invAlpha = _mm512_sub_epi16(_mm512_set1_epi16(256), alphaHalves);

    // Selects the low byte of each 16-bit lane.
    const __m512i lowBytes = _mm512_set1_epi32(0x00ff00ff);

    // Red and blue: isolate them in the low bytes, scale, then shift the result back down
    // into the low byte of each 16-bit lane.
    __m512i redBlue = _mm512_mullo_epi16(_mm512_and_si512(lowBytes, dst), invAlpha);
    redBlue = _mm512_srli_epi16(redBlue, 8);

    // Green and alpha: shift them down to the low bytes, scale, and keep only the high byte
    // of each product, which is already where the channel belongs.
    __m512i greenAlpha = _mm512_mullo_epi16(_mm512_srli_epi16(dst, 8), invAlpha);
    greenAlpha = _mm512_andnot_si512(lowBytes, greenAlpha);

    return _mm512_add_epi32(src, _mm512_or_si512(redBlue, greenAlpha));
}
Mike Klein | 5d3314c | 2020-08-27 13:54:17 -0500 | [diff] [blame] | 51 | #endif |
Lingyun Cai | 56f23a1 | 2020-04-26 14:44:32 +0800 | [diff] [blame] | 52 | |
Mike Klein | 5d3314c | 2020-08-27 13:54:17 -0500 | [diff] [blame] | 53 | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 |
Zhenyu Shan | d2f2c04 | 2019-05-22 21:15:43 +0800 | [diff] [blame] | 54 | #include <immintrin.h> |
mtklein | b4a7dc9 | 2016-03-23 06:29:12 -0700 | [diff] [blame] | 55 | |
// Srcover-blends eight packed 32-bit pixels at once.
static inline __m256i SkPMSrcOver_AVX2(const __m256i& src, const __m256i& dst) {
    // In the abstract, srcover computes
    //     b = s + d*(1-srcA)
    //
    // Expressed on unorm8 bytes that is
    //     b = s + (d*(255-srcA) + 127) / 255
    //
    // which we approximate, to within a bit, as
    //     b = s + (d*(255-srcA) + d) / 256
    // i.e.
    //     b = s + (d*(256-srcA)) >> 8

    // The multiply is the bottleneck here, so it is done as narrowly as possible: inputs are
    // arranged in 16-bit lanes and multiplied with 16-bit multiplies.  That performs twice as
    // many multiplies per instruction as naive 32-bit multiplies would, and the 16-bit
    // multiplies are a couple of cycles quicker besides.

    // Two multiplies cover everything: one for dst red and blue, one for dst green and alpha
    // (the members of each pair sit 16 bits apart).  Apart from alpha, the src channels are
    // only needed for the final "s + " add, and they never need unpacking because those adds
    // cannot overflow.

    // Copy each pixel's srcA into the low byte of both 16-bit halves of that pixel.
    const int Z = -1;   // as a shuffle control byte, fills a literal 0 byte.
    const __m256i alphaToHalves = _mm256_setr_epi8(3,Z,3,Z, 7,Z,7,Z, 11,Z,11,Z, 15,Z,15,Z,
                                                   3,Z,3,Z, 7,Z,7,Z, 11,Z,11,Z, 15,Z,15,Z);
    __m256i alphaHalves = _mm256_shuffle_epi8(src, alphaToHalves);

    // 256 - srcA in every 16-bit lane.
    __m256i invAlpha = _mm256_sub_epi16(_mm256_set1_epi16(256), alphaHalves);

    // Selects the low byte of each 16-bit lane.
    const __m256i lowBytes = _mm256_set1_epi32(0x00ff00ff);

    // Red and blue: scale, then move the result down into the low byte of each 16-bit lane.
    __m256i redBlue = _mm256_mullo_epi16(_mm256_and_si256(lowBytes, dst), invAlpha);
    redBlue = _mm256_srli_epi16(redBlue, 8);

    // Green and alpha: scale, leaving the result in the high byte and masking off the low bits.
    __m256i greenAlpha = _mm256_mullo_epi16(_mm256_srli_epi16(dst, 8), invAlpha);
    greenAlpha = _mm256_andnot_si256(lowBytes, greenAlpha);

    return _mm256_add_epi32(src, _mm256_or_si256(redBlue, greenAlpha));
}
Mike Klein | 5d3314c | 2020-08-27 13:54:17 -0500 | [diff] [blame] | 100 | #endif |
Zhenyu Shan | d2f2c04 | 2019-05-22 21:15:43 +0800 | [diff] [blame] | 101 | |
Mike Klein | 5d3314c | 2020-08-27 13:54:17 -0500 | [diff] [blame] | 102 | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
Herbert Derby | d8e2b13 | 2017-11-29 11:02:07 -0500 | [diff] [blame] | 103 | #include <immintrin.h> |
Mike Klein | f3086f0 | 2018-12-04 15:14:28 -0500 | [diff] [blame] | 104 | |
// Srcover-blends four packed 32-bit pixels at once.  Every channel becomes
//     s + ((d * (256 - srcA)) >> 8)
// where srcA is the top byte of the corresponding src pixel.
static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) {
    // Selects the low byte of each 16-bit lane.
    const __m128i lowBytes = _mm_set1_epi32(0x00ff00ff);

    // 256 - srcA per pixel, then replicated into both 16-bit halves of the pixel so one
    // 16-bit multiply can scale two channels at a time.
    __m128i invAlpha = _mm_sub_epi32(_mm_set1_epi32(256), _mm_srli_epi32(src, 24));
    __m128i invAlphaHalves = _mm_or_si128(_mm_slli_epi32(invAlpha, 16), invAlpha);

    // Channels in the low byte of each 16-bit lane: scale, then shift the product back down.
    __m128i redBlue = _mm_mullo_epi16(_mm_and_si128(lowBytes, dst), invAlphaHalves);
    redBlue = _mm_srli_epi16(redBlue, 8);

    // Channels in the high byte of each 16-bit lane: shift down, scale, and keep only the
    // high byte of the product, which is already in its final position.
    __m128i greenAlpha = _mm_mullo_epi16(_mm_srli_epi16(dst, 8), invAlphaHalves);
    greenAlpha = _mm_andnot_si128(lowBytes, greenAlpha);

    return _mm_add_epi32(src, _mm_or_si128(redBlue, greenAlpha));
}
mtklein | b4a7dc9 | 2016-03-23 06:29:12 -0700 | [diff] [blame] | 120 | #endif |
mtklein | 4a37d08 | 2015-09-10 10:38:02 -0700 | [diff] [blame] | 121 | |
Mike Klein | 5d3314c | 2020-08-27 13:54:17 -0500 | [diff] [blame] | 122 | #if defined(SK_ARM_HAS_NEON) |
| 123 | #include <arm_neon.h> |
| 124 | |
// Lane-wise SkMulDiv255Round(): multiplies the eight byte pairs and divides each product by 255
// with rounding.
static inline uint8x8_t SkMulDiv255Round_neon8(uint8x8_t x, uint8x8_t y) {
    // Widening multiply so each product keeps all 16 bits.
    const uint16x8_t product = vmull_u8(x, y);

    // Rounding shift: (product + 128) >> 8.
    const uint16x8_t productOver256 = vrshrq_n_u16(product, 8);

    // Rounding add, then narrow to the high byte: (product + productOver256 + 128) >> 8.
    return vraddhn_u16(product, productOver256);
}
| 130 | |
// Srcover-blends eight pixels held as four separate channel vectors (val[3] is alpha):
//     result = src + SkMulDiv255Round(255 - srcA, dst)   for each channel.
// Note the argument order: dst first, then src.
static inline uint8x8x4_t SkPMSrcOver_neon8(uint8x8x4_t dst, uint8x8x4_t src) {
    // Bitwise NOT of a byte is 255 - byte, so this is 255 - alpha in every lane.
    const uint8x8_t invAlpha = vmvn_u8(src.val[3]);

    uint8x8x4_t blended;
    for (int channel = 0; channel < 4; ++channel) {
        blended.val[channel] = vadd_u8(src.val[channel],
                                       SkMulDiv255Round_neon8(invAlpha, dst.val[channel]));
    }
    return blended;
}
| 140 | |
// Two-pixel variant: dst and src each hold the eight interleaved channel bytes of two
// consecutive pixels.  Note the argument order: dst first, then src.
static inline uint8x8_t SkPMSrcOver_neon2(uint8x8_t dst, uint8x8_t src) {
    // Table indices 3,3,3,3,7,7,7,7: spread the first pixel's alpha (byte 3) over bytes 0-3
    // and the second pixel's alpha (byte 7) over bytes 4-7.
    const uint8x8_t alphaLookup = vcreate_u8(0x0707070703030303);
    const uint8x8_t srcAlphas   = vtbl1_u8(src, alphaLookup);

    // Bitwise NOT of a byte is 255 - byte.
    const uint8x8_t invAlphas = vmvn_u8(srcAlphas);

    const uint8x8_t scaledDst = SkMulDiv255Round_neon8(invAlphas, dst);
    return vadd_u8(src, scaledDst);
}
| 147 | |
| 148 | #endif |
| 149 | |
mtklein | 4a37d08 | 2015-09-10 10:38:02 -0700 | [diff] [blame] | 150 | namespace SK_OPTS_NS { |
| 151 | |
Mike Klein | 5d3314c | 2020-08-27 13:54:17 -0500 | [diff] [blame] | 152 | /*not static*/ |
| 153 | inline void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) { |
| 154 | SkASSERT(alpha == 0xFF); |
| 155 | sk_msan_assert_initialized(src, src+len); |
| 156 | |
| 157 | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX |
| 158 | while (len >= 16) { |
| 159 | _mm512_storeu_si512((__m512*)dst, |
| 160 | SkPMSrcOver_SKX(_mm512_loadu_si512((const __m512i*)src), |
| 161 | _mm512_loadu_si512((const __m512i*)dst))); |
| 162 | src += 16; |
| 163 | dst += 16; |
| 164 | len -= 16; |
| 165 | } |
| 166 | #endif |
| 167 | |
| 168 | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 |
| 169 | while (len >= 8) { |
| 170 | _mm256_storeu_si256((__m256i*)dst, |
| 171 | SkPMSrcOver_AVX2(_mm256_loadu_si256((const __m256i*)src), |
| 172 | _mm256_loadu_si256((const __m256i*)dst))); |
| 173 | src += 8; |
| 174 | dst += 8; |
| 175 | len -= 8; |
| 176 | } |
| 177 | #endif |
| 178 | |
| 179 | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| 180 | while (len >= 4) { |
| 181 | _mm_storeu_si128((__m128i*)dst, SkPMSrcOver_SSE2(_mm_loadu_si128((const __m128i*)src), |
| 182 | _mm_loadu_si128((const __m128i*)dst))); |
| 183 | src += 4; |
| 184 | dst += 4; |
| 185 | len -= 4; |
| 186 | } |
| 187 | #endif |
| 188 | |
| 189 | #if defined(SK_ARM_HAS_NEON) |
| 190 | while (len >= 8) { |
| 191 | vst4_u8((uint8_t*)dst, SkPMSrcOver_neon8(vld4_u8((const uint8_t*)dst), |
| 192 | vld4_u8((const uint8_t*)src))); |
| 193 | src += 8; |
| 194 | dst += 8; |
| 195 | len -= 8; |
| 196 | } |
| 197 | |
| 198 | while (len >= 2) { |
| 199 | vst1_u8((uint8_t*)dst, SkPMSrcOver_neon2(vld1_u8((const uint8_t*)dst), |
| 200 | vld1_u8((const uint8_t*)src))); |
| 201 | src += 2; |
| 202 | dst += 2; |
| 203 | len -= 2; |
| 204 | } |
| 205 | |
| 206 | if (len != 0) { |
| 207 | uint8x8_t result = SkPMSrcOver_neon2(vcreate_u8((uint64_t)*dst), |
| 208 | vcreate_u8((uint64_t)*src)); |
| 209 | vst1_lane_u32(dst, vreinterpret_u32_u8(result), 0); |
| 210 | } |
| 211 | return; |
| 212 | #endif |
| 213 | |
| 214 | while (len --> 0) { |
| 215 | *dst = SkPMSrcOver(*src, *dst); |
| 216 | src++; |
| 217 | dst++; |
| 218 | } |
| 219 | } |
| 220 | |
Mike Klein | c33e6dc | 2019-04-10 11:44:42 -0500 | [diff] [blame] | 221 | // Blend constant color over count src pixels, writing into dst. |
Mike Klein | 5d3314c | 2020-08-27 13:54:17 -0500 | [diff] [blame] | 222 | /*not static*/ |
Mike Klein | c33e6dc | 2019-04-10 11:44:42 -0500 | [diff] [blame] | 223 | inline void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) { |
Mike Klein | 3d50730 | 2019-04-15 08:56:06 -0500 | [diff] [blame] | 224 | constexpr int N = 4; // 8, 16 also reasonable choices |
Mike Klein | c33e6dc | 2019-04-10 11:44:42 -0500 | [diff] [blame] | 225 | using U32 = skvx::Vec< N, uint32_t>; |
| 226 | using U16 = skvx::Vec<4*N, uint16_t>; |
| 227 | using U8 = skvx::Vec<4*N, uint8_t>; |
| 228 | |
| 229 | auto kernel = [color](U32 src) { |
| 230 | unsigned invA = 255 - SkGetPackedA32(color); |
| 231 | invA += invA >> 7; |
| 232 | SkASSERT(0 < invA && invA < 256); // We handle alpha == 0 or alpha == 255 specially. |
| 233 | |
| 234 | // (src * invA + (color << 8) + 128) >> 8 |
| 235 | // Should all fit in 16 bits. |
Mike Klein | 3d50730 | 2019-04-15 08:56:06 -0500 | [diff] [blame] | 236 | U8 s = skvx::bit_pun<U8>(src), |
| 237 | a = U8(invA); |
| 238 | U16 c = skvx::cast<uint16_t>(skvx::bit_pun<U8>(U32(color))), |
| 239 | d = (mull(s,a) + (c << 8) + 128)>>8; |
Mike Klein | c33e6dc | 2019-04-10 11:44:42 -0500 | [diff] [blame] | 240 | return skvx::bit_pun<U32>(skvx::cast<uint8_t>(d)); |
| 241 | }; |
| 242 | |
| 243 | while (count >= N) { |
| 244 | kernel(U32::Load(src)).store(dst); |
| 245 | src += N; |
| 246 | dst += N; |
| 247 | count -= N; |
| 248 | } |
| 249 | while (count --> 0) { |
| 250 | *dst++ = kernel(U32{*src++})[0]; |
| 251 | } |
| 252 | } |
| 253 | |
John Stiles | a6841be | 2020-08-06 14:11:56 -0400 | [diff] [blame] | 254 | } // namespace SK_OPTS_NS |
mtklein | 4a37d08 | 2015-09-10 10:38:02 -0700 | [diff] [blame] | 255 | |
| 256 | #endif//SkBlitRow_opts_DEFINED |