| /* |
| * Copyright 2016 Google Inc. |
| * |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE file. |
| */ |
| |
| /* |
| ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench --samples 300 --nompd --match LinearSrcOver -q |
| */ |
| |
| #ifndef SkBlend_opts_DEFINED |
| #define SkBlend_opts_DEFINED |
| |
| #include "SkNx.h" |
| #include "SkPM4fPriv.h" |
| |
| #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| #include <immintrin.h> |
| #endif |
| |
| namespace SK_OPTS_NS { |
| |
// Scalar sRGB-over-sRGB srcover for one pixel.
static inline void srcover_srgb_srgb_1(uint32_t* dst, uint32_t src) {
    // A fully opaque source replaces the destination outright.
    if (src >= 0xFF000000) {
        *dst = src;
        return;
    }
    // Otherwise convert both sRGB pixels to linear floats, apply srcover
    // (s + d * (1 - sa)), and convert the result back to sRGB.
    auto d = Sk4f_fromS32(*dst),
         s = Sk4f_fromS32( src);
    *dst = Sk4f_toS32(s + d * (1.0f - s[3]));
}
| |
| static inline void srcover_srgb_srgb_4(uint32_t* dst, const uint32_t* src) { |
| srcover_srgb_srgb_1(dst++, *src++); |
| srcover_srgb_srgb_1(dst++, *src++); |
| srcover_srgb_srgb_1(dst++, *src++); |
| srcover_srgb_srgb_1(dst , *src ); |
| } |
| |
| #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| |
| static inline __m128i load(const uint32_t* p) { |
| return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p)); |
| } |
| |
| static inline void store(uint32_t* p, __m128i v) { |
| _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v); |
| } |
| |
| #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 |
| |
// Blend nsrc source pixels over ndst destination pixels, restarting from
// srcStart whenever the source is exhausted (i.e. src is tiled across dst).
// Each group of four source pixels is classified by alpha with the SSE4.1
// ptest instructions, so long runs of opaque pixels (plain copy),
// transparent pixels (skip), or mixed pixels (full blend) each stay in a
// tight loop.
static void srcover_srgb_srgb(
    uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {
    const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
    while (ndst > 0) {
        int count = SkTMin(ndst, nsrc);
        ndst -= count;
        const uint32_t* src = srcStart;
        const uint32_t* end = dst + (count & ~3);
        // src and dst advance in lockstep, so dst + delta is always the
        // source pixel matching dst.
        ptrdiff_t delta = src - dst;

        while (dst < end) {
            __m128i pixels = load(src);
            if (_mm_testc_si128(pixels, alphaMask)) {
                // All four alphas are 0xFF: copy the source pixels through.
                uint32_t* start = dst;
                do {
                    store(dst, pixels);
                    dst += 4;
                } while (dst < end
                         && _mm_testc_si128(pixels = load(dst + delta), alphaMask));
                src += dst - start;
            } else if (_mm_testz_si128(pixels, alphaMask)) {
                // All four alphas are 0x00: the destination is unchanged.
                do {
                    dst += 4;
                    src += 4;
                } while (dst < end
                         && _mm_testz_si128(pixels = load(src), alphaMask));
            } else {
                // Mixed alphas: blend all four pixels.
                uint32_t* start = dst;
                do {
                    srcover_srgb_srgb_4(dst, dst + delta);
                    dst += 4;
                } while (dst < end
                         && _mm_testnzc_si128(pixels = load(dst + delta), alphaMask));
                src += dst - start;
            }
        }

        // Handle the 0-3 leftover pixels one at a time.
        count = count & 3;
        while (count-- > 0) {
            srcover_srgb_srgb_1(dst++, *src++);
        }
    }
}
#else
// SSE2 versions

// Note: in the next three functions a group of 4 pixels is converted to a
// group of "signed" pixels because SSE2 has no unsigned comparison.
// XORing each pixel with 0x80000000 maps the smallest unsigned values,
// 0x00xxxxxx, to the most negative signed values, 0x80xxxxxx, and the
// largest, 0xFFxxxxxx, to the most positive, 0x7Fxxxxxx, so the signed
// comparison operators order the alphas the same way an unsigned comparison
// would.
// True iff all four pixels have alpha == 0xFF.
static inline bool check_opaque_alphas(__m128i pixels) {
    __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000));
    int mask = _mm_movemask_epi8(
            _mm_cmplt_epi32(signedPixels, _mm_set1_epi32(0x7F000000)));
    return mask == 0;
}
| |
// True iff all four pixels have alpha == 0x00.
static inline bool check_transparent_alphas(__m128i pixels) {
    __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000));
    int mask = _mm_movemask_epi8(
            _mm_cmpgt_epi32(signedPixels, _mm_set1_epi32(0x80FFFFFF)));
    return mask == 0;
}
| |
// True iff every pixel's alpha is strictly between 0x00 and 0xFF. A partial
// pixel passes both comparisons below, while an opaque or transparent pixel
// passes exactly one, so the XOR is all zeros only when every pixel is
// partial.
static inline bool check_partial_alphas(__m128i pixels) {
    __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000));
    __m128i opaque      = _mm_cmplt_epi32(signedPixels, _mm_set1_epi32(0x7F000000));
    __m128i transparent = _mm_cmpgt_epi32(signedPixels, _mm_set1_epi32(0x80FFFFFF));
    int mask = _mm_movemask_epi8(_mm_xor_si128(opaque, transparent));
    return mask == 0;
}
| |
// Same run-based strategy as the SSE4.1 version above, using the signed
// comparison helpers to classify each group of four source alphas.
static void srcover_srgb_srgb(
    uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {
    while (ndst > 0) {
        int count = SkTMin(ndst, nsrc);
        ndst -= count;
        const uint32_t* src = srcStart;
        const uint32_t* end = dst + (count & ~3);
        const ptrdiff_t delta = src - dst;

        // Test dst < end before the first load: when count < 4, end == dst
        // and no 4-pixel group may be read or written.
        while (dst < end) {
            __m128i pixels = load(src);
| if (check_opaque_alphas(pixels)) { |
| uint32_t* start = dst; |
| do { |
| store(dst, pixels); |
| dst += 4; |
| } while (dst < end && check_opaque_alphas((pixels = load(dst + delta)))); |
| src += dst - start; |
| } else if (check_transparent_alphas(pixels)) { |
| const uint32_t* start = dst; |
| do { |
| dst += 4; |
| } while (dst < end && check_transparent_alphas(pixels = load(dst + delta))); |
| src += dst - start; |
| } else { |
| const uint32_t* start = dst; |
| do { |
| srcover_srgb_srgb_4(dst, dst + delta); |
| dst += 4; |
| } while (dst < end && check_partial_alphas(pixels = load(dst + delta))); |
| src += dst - start; |
| } |
}
| |
| count = count & 3; |
| while (count-- > 0) { |
| srcover_srgb_srgb_1(dst++, *src++); |
| } |
| } |
| } |
| #endif |
| #else |
| |
// Portable fallback: blend one pixel at a time, tiling src across dst.
static void srcover_srgb_srgb(
    uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
    while (ndst > 0) {
        int n = SkTMin(ndst, nsrc);

        for (int i = 0; i < n; i++) {
            srcover_srgb_srgb_1(dst++, src[i]);
        }
        ndst -= n;
    }
}
| |
| #endif |
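
// Calling-contract sketch (values are hypothetical): the nsrc source pixels
// are tiled across ndst destination pixels, so a short src acts as a
// repeating pattern:
//
//     uint32_t dst[8] = { /* 8 destination pixels */ };
//     uint32_t src[4] = { /* 4-pixel source pattern */ };
//     srcover_srgb_srgb(dst, src, 8, 4);  // blends the pattern over dst twice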
| |
| } // namespace SK_OPTS_NS |
| |
#endif  // SkBlend_opts_DEFINED