blob: 1da4c4fb04da8b4e566a5c4df1ed395f5a1c28bc [file] [log] [blame]
mtkleinc5091b52016-05-02 11:48:42 -07001/*
2 * Copyright 2016 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
herbcc49e592016-05-17 09:57:34 -07008/*
9ninja -C out/Release dm nanobench ; and ./out/Release/dm --match Blend_opts ; and ./out/Release/nanobench --samples 300 --nompd --match LinearSrcOver -q
10 */
11
mtkleinc5091b52016-05-02 11:48:42 -070012#ifndef SkBlend_opts_DEFINED
13#define SkBlend_opts_DEFINED
14
herbcc49e592016-05-17 09:57:34 -070015#include "SkNx.h"
16#include "SkPM4fPriv.h"
17
herb4d1dd662016-06-23 09:40:30 -070018#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
19 #include <immintrin.h>
20#endif
21
mtkleinc5091b52016-05-02 11:48:42 -070022namespace SK_OPTS_NS {
23
mtklein0c902472016-07-20 18:10:07 -070024static inline void srcover_srgb_srgb_1(uint32_t* dst, uint32_t src) {
25 if (src >= 0xFF000000) {
26 *dst = src;
27 return;
herbcc49e592016-05-17 09:57:34 -070028 }
mtklein0c902472016-07-20 18:10:07 -070029 auto d = Sk4f_fromS32(*dst),
30 s = Sk4f_fromS32( src);
31 *dst = Sk4f_toS32(s + d * (1.0f - s[3]));
herbcc49e592016-05-17 09:57:34 -070032}
33
herbcc49e592016-05-17 09:57:34 -070034static inline void srcover_srgb_srgb_4(uint32_t* dst, const uint32_t* src) {
mtklein0c902472016-07-20 18:10:07 -070035 srcover_srgb_srgb_1(dst++, *src++);
36 srcover_srgb_srgb_1(dst++, *src++);
37 srcover_srgb_srgb_1(dst++, *src++);
38 srcover_srgb_srgb_1(dst , *src );
herbcc49e592016-05-17 09:57:34 -070039}
40
41#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
42
43 static inline __m128i load(const uint32_t* p) {
44 return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
45 }
46
47 static inline void store(uint32_t* p, __m128i v) {
48 _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v);
49 }
50
51 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
52
    // Src-over blends the nsrc-pixel source pattern onto dst, repeating the
    // pattern until ndst pixels have been written.  Scans four pixels at a
    // time, classifying each group as all-opaque (straight copy), all-
    // transparent (skip), or mixed (full blend), and stays in the matching
    // inner loop while the classification holds.
    static void srcover_srgb_srgb(
        uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {
        // alphaMask selects the alpha byte of each of the four packed 8888 pixels.
        const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
        while (ndst > 0) {
            int count = SkTMin(ndst, nsrc);
            ndst -= count;
            const uint32_t* src = srcStart;
            // Only whole groups of 4 go through SIMD; the 0-3 leftovers are scalar.
            const uint32_t* end = dst + (count & ~3);
            // src and dst advance in lockstep below, so src == dst + delta throughout.
            ptrdiff_t delta = src - dst;

            while (dst < end) {
                __m128i pixels = load(src);
                // testc: every alpha bit set, i.e. all four pixels fully opaque.
                if (_mm_testc_si128(pixels, alphaMask)) {
                    // Opaque run: copy source straight to destination.
                    uint32_t* start = dst;
                    do {
                        store(dst, pixels);
                        dst += 4;
                    } while (dst < end
                             && _mm_testc_si128(pixels = load(dst + delta), alphaMask));
                    src += dst - start;
                // testz: every alpha bit clear, i.e. all four pixels fully transparent.
                } else if (_mm_testz_si128(pixels, alphaMask)) {
                    // Transparent run: destination is left untouched.
                    do {
                        dst += 4;
                        src += 4;
                    } while (dst < end
                             && _mm_testz_si128(pixels = load(src), alphaMask));
                } else {
                    // Mixed alphas: full per-pixel blend.
                    uint32_t* start = dst;
                    do {
                        srcover_srgb_srgb_4(dst, dst + delta);
                        dst += 4;
                    // testnzc: alphas neither all set nor all clear (still mixed).
                    } while (dst < end
                             && _mm_testnzc_si128(pixels = load(dst + delta), alphaMask));
                    src += dst - start;
                }
            }

            // Scalar blend for the 0-3 leftover pixels.
            count = count & 3;
            while (count-- > 0) {
                srcover_srgb_srgb_1(dst++, *src++);
            }
        }
    }
96 #else
97 // SSE2 versions
herb074b48e2016-05-23 13:50:12 -070098
    // Note: in the next three comparisons a group of 4 pixels is converted to a
    // group of "signed" pixels because SSE2 has no unsigned comparison.
    // XORing each pixel with 0x80000000 lets us use the signed comparison
    // operators: it biases 0x00xxxxxx (the smallest unsigned values) to
    // 0x80xxxxxx, and biases 0xFFxxxxxx (the largest) to 0x7Fxxxxxx.
herbcc49e592016-05-17 09:57:34 -0700104 static inline bool check_opaque_alphas(__m128i pixels) {
herb074b48e2016-05-23 13:50:12 -0700105 __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000));
herbcc49e592016-05-17 09:57:34 -0700106 int mask =
107 _mm_movemask_epi8(
herb074b48e2016-05-23 13:50:12 -0700108 _mm_cmplt_epi32(signedPixels, _mm_set1_epi32(0x7F000000)));
109 return mask == 0;
herbcc49e592016-05-17 09:57:34 -0700110 }
111
112 static inline bool check_transparent_alphas(__m128i pixels) {
herb074b48e2016-05-23 13:50:12 -0700113 __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000));
herbcc49e592016-05-17 09:57:34 -0700114 int mask =
115 _mm_movemask_epi8(
herb074b48e2016-05-23 13:50:12 -0700116 _mm_cmpgt_epi32(signedPixels, _mm_set1_epi32(0x80FFFFFF)));
117 return mask == 0;
herbcc49e592016-05-17 09:57:34 -0700118 }
119
120 static inline bool check_partial_alphas(__m128i pixels) {
herb074b48e2016-05-23 13:50:12 -0700121 __m128i signedPixels = _mm_xor_si128(pixels, _mm_set1_epi32(0x80000000));
122 __m128i opaque = _mm_cmplt_epi32(signedPixels, _mm_set1_epi32(0x7F000000));
123 __m128i transparent = _mm_cmpgt_epi32(signedPixels, _mm_set1_epi32(0x80FFFFFF));
124 int mask = _mm_movemask_epi8(_mm_xor_si128(opaque, transparent));
125 return mask == 0;
herbcc49e592016-05-17 09:57:34 -0700126 }
127
mtklein0358a6a2016-07-13 08:02:20 -0700128 static void srcover_srgb_srgb(
herbcc49e592016-05-17 09:57:34 -0700129 uint32_t* dst, const uint32_t* const srcStart, int ndst, const int nsrc) {
130 while (ndst > 0) {
131 int count = SkTMin(ndst, nsrc);
132 ndst -= count;
133 const uint32_t* src = srcStart;
herb074b48e2016-05-23 13:50:12 -0700134 const uint32_t* end = dst + (count & ~3);
135 const ptrdiff_t delta = src - dst;
herbcc49e592016-05-17 09:57:34 -0700136
137 __m128i pixels = load(src);
138 do {
139 if (check_opaque_alphas(pixels)) {
herb074b48e2016-05-23 13:50:12 -0700140 uint32_t* start = dst;
herbcc49e592016-05-17 09:57:34 -0700141 do {
142 store(dst, pixels);
143 dst += 4;
herb074b48e2016-05-23 13:50:12 -0700144 } while (dst < end && check_opaque_alphas((pixels = load(dst + delta))));
145 src += dst - start;
herbcc49e592016-05-17 09:57:34 -0700146 } else if (check_transparent_alphas(pixels)) {
herb074b48e2016-05-23 13:50:12 -0700147 const uint32_t* start = dst;
herbcc49e592016-05-17 09:57:34 -0700148 do {
herbcc49e592016-05-17 09:57:34 -0700149 dst += 4;
herb074b48e2016-05-23 13:50:12 -0700150 } while (dst < end && check_transparent_alphas(pixels = load(dst + delta)));
151 src += dst - start;
152 } else {
153 const uint32_t* start = dst;
154 do {
155 srcover_srgb_srgb_4(dst, dst + delta);
156 dst += 4;
157 } while (dst < end && check_partial_alphas(pixels = load(dst + delta)));
158 src += dst - start;
herbcc49e592016-05-17 09:57:34 -0700159 }
herb074b48e2016-05-23 13:50:12 -0700160 } while (dst < end);
herbcc49e592016-05-17 09:57:34 -0700161
162 count = count & 3;
163 while (count-- > 0) {
mtklein0c902472016-07-20 18:10:07 -0700164 srcover_srgb_srgb_1(dst++, *src++);
herbcc49e592016-05-17 09:57:34 -0700165 }
166 }
167 }
168 #endif
mtkleinc5091b52016-05-02 11:48:42 -0700169#else
170
mtklein0358a6a2016-07-13 08:02:20 -0700171 static void srcover_srgb_srgb(
herbcc49e592016-05-17 09:57:34 -0700172 uint32_t* dst, const uint32_t* const src, int ndst, const int nsrc) {
herb2edf0c62016-07-12 15:00:46 -0700173 while (ndst > 0) {
174 int n = SkTMin(ndst, nsrc);
175
176 for (int i = 0; i < n; i++) {
mtklein0c902472016-07-20 18:10:07 -0700177 srcover_srgb_srgb_1(dst++, src[i]);
herb2edf0c62016-07-12 15:00:46 -0700178 }
179 ndst -= n;
180 }
mtkleinc5091b52016-05-02 11:48:42 -0700181 }
182
mtkleinc5091b52016-05-02 11:48:42 -0700183#endif
184
185} // namespace SK_OPTS_NS
186
187#endif//SkBlend_opts_DEFINED