/*
 * Copyright 2015 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
7
8#ifndef SkBlitRow_opts_DEFINED
9#define SkBlitRow_opts_DEFINED
10
Mike Kleinc33e6dc2019-04-10 11:44:42 -050011#include "SkVx.h"
Cary Clarka4083c92017-09-15 11:59:23 -040012#include "SkColorData.h"
mtkleinb4a7dc92016-03-23 06:29:12 -070013#include "SkMSAN.h"
14
15#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
Herbert Derbyd8e2b132017-11-29 11:02:07 -050016 #include <immintrin.h>
Mike Kleinf3086f02018-12-04 15:14:28 -050017
18 static inline __m128i SkPMSrcOver_SSE2(const __m128i& src, const __m128i& dst) {
19 auto SkAlphaMulQ_SSE2 = [](const __m128i& c, const __m128i& scale) {
20 const __m128i mask = _mm_set1_epi32(0xFF00FF);
21 __m128i s = _mm_or_si128(_mm_slli_epi32(scale, 16), scale);
22
23 // uint32_t rb = ((c & mask) * scale) >> 8
24 __m128i rb = _mm_and_si128(mask, c);
25 rb = _mm_mullo_epi16(rb, s);
26 rb = _mm_srli_epi16(rb, 8);
27
28 // uint32_t ag = ((c >> 8) & mask) * scale
29 __m128i ag = _mm_srli_epi16(c, 8);
30 ag = _mm_mullo_epi16(ag, s);
31
32 // (rb & mask) | (ag & ~mask)
33 ag = _mm_andnot_si128(mask, ag);
34 return _mm_or_si128(rb, ag);
35 };
36 return _mm_add_epi32(src,
37 SkAlphaMulQ_SSE2(dst, _mm_sub_epi32(_mm_set1_epi32(256),
38 _mm_srli_epi32(src, 24))));
39 }
mtkleinb4a7dc92016-03-23 06:29:12 -070040#endif
mtklein4a37d082015-09-10 10:38:02 -070041
42namespace SK_OPTS_NS {
43
Mike Kleinc33e6dc2019-04-10 11:44:42 -050044// Blend constant color over count src pixels, writing into dst.
45inline void blit_row_color32(SkPMColor* dst, const SkPMColor* src, int count, SkPMColor color) {
46 constexpr int N = 8; // 4, 16 also reasonable choices
47 using U32 = skvx::Vec< N, uint32_t>;
48 using U16 = skvx::Vec<4*N, uint16_t>;
49 using U8 = skvx::Vec<4*N, uint8_t>;
50
51 auto kernel = [color](U32 src) {
52 unsigned invA = 255 - SkGetPackedA32(color);
53 invA += invA >> 7;
54 SkASSERT(0 < invA && invA < 256); // We handle alpha == 0 or alpha == 255 specially.
55
56 // (src * invA + (color << 8) + 128) >> 8
57 // Should all fit in 16 bits.
58 // TODO(mtklein): can we do src * invA with umull on ARM?
59 U16 s = skvx::cast<uint16_t>(skvx::bit_pun<U8>(src)),
60 c = skvx::cast<uint16_t>(skvx::bit_pun<U8>(U32(color))),
61 d = (s * invA + (c << 8) + 128)>>8;
62 return skvx::bit_pun<U32>(skvx::cast<uint8_t>(d));
63 };
64
65 while (count >= N) {
66 kernel(U32::Load(src)).store(dst);
67 src += N;
68 dst += N;
69 count -= N;
70 }
71 while (count --> 0) {
72 *dst++ = kernel(U32{*src++})[0];
73 }
74}
75
Matteo Franchina132c382017-05-26 18:56:51 +010076#if defined(SK_ARM_HAS_NEON)
77
// Return a uint8x8_t value, r, computed as r[i] = SkMulDiv255Round(x[i], y[i]), where r[i], x[i],
// y[i] are the i-th lanes of the corresponding NEON vectors.
static inline uint8x8_t SkMulDiv255Round_neon8(uint8x8_t x, uint8x8_t y) {
    uint16x8_t prod = vmull_u8(x, y);  // widening multiply: 16-bit x[i]*y[i]
    // Rounded divide-by-255: (prod + round(prod >> 8)) with a rounding
    // add-and-narrow, i.e. the classic (p + ((p+128)>>8) + 128) >> 8 trick.
    return vraddhn_u16(prod, vrshrq_n_u16(prod, 8));
}
84
// The implementations of SkPMSrcOver below perform alpha blending consistently with
// SkMulDiv255Round. They compute the color components (numbers in the interval [0, 255]) as:
//
//   result_i = src_i + rint(g(src_alpha, dst_i))
//
// where g(x, y) = ((255.0 - x) * y) / 255.0 and rint rounds to the nearest integer.
91
// In this variant of SkPMSrcOver the pixels are de-interleaved ("planar"):
// each NEON register dst.val[i] / src.val[i] holds one color component for 8
// consecutive pixels (val[3] is the alpha plane). The result follows the same
// convention.
static inline uint8x8x4_t SkPMSrcOver_neon8(uint8x8x4_t dst, uint8x8x4_t src) {
    // 255 - src_alpha for each of the 8 pixels.
    uint8x8_t nalphas = vmvn_u8(src.val[3]);

    // result_c = src_c + SkMulDiv255Round(255 - srcA, dst_c), applied to every
    // plane (alpha blends with the same formula as the color components).
    uint8x8x4_t result;
    for (int plane = 0; plane < 4; plane++) {
        result.val[plane] = vadd_u8(src.val[plane],
                                    SkMulDiv255Round_neon8(nalphas, dst.val[plane]));
    }
    return result;
}
104
// In this variant of SkPMSrcOver dst and src contain the color components of two consecutive
// interleaved pixels packed into one 64-bit register. The return value follows
// the same convention.
static inline uint8x8_t SkPMSrcOver_neon2(uint8x8_t dst, uint8x8_t src) {
    // Table lookup that broadcasts each pixel's alpha byte (lanes 3 and 7)
    // across that pixel's four lanes; vmvn then yields 255 - alpha per lane.
    const uint8x8_t alpha_indices = vcreate_u8(0x0707070703030303);
    uint8x8_t nalphas = vmvn_u8(vtbl1_u8(src, alpha_indices));
    return vadd_u8(src, SkMulDiv255Round_neon8(nalphas, dst));
}
112
113#endif
114
// SrcOver-blit `len` 32-bit src pixels onto dst. `alpha` is an extra blend
// alpha and must be 0xFF here (that is the "opaque" in the name — it does NOT
// mean the pixels themselves are opaque). src is expected to be premultiplied
// (see the TODO in the scalar loop below).
/*not static*/ inline
void blit_row_s32a_opaque(SkPMColor* dst, const SkPMColor* src, int len, U8CPU alpha) {
    SkASSERT(alpha == 0xFF);
    sk_msan_assert_initialized(src, src+len);

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
    while (len >= 16) {
        // Load 16 source pixels.
        auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),
             s1 = _mm_loadu_si128((const __m128i*)(src) + 1),
             s2 = _mm_loadu_si128((const __m128i*)(src) + 2),
             s3 = _mm_loadu_si128((const __m128i*)(src) + 3);

        const auto alphaMask = _mm_set1_epi32(0xFF000000);

        // OR all 16 pixels together: if no alpha bit survives, every pixel is
        // fully transparent.  _mm_testz_si128 checks (ORed & alphaMask) == 0.
        auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
        if (_mm_testz_si128(ORed, alphaMask)) {
            // All 16 source pixels are transparent.  Nothing to do.
            src += 16;
            dst += 16;
            len -= 16;
            continue;
        }

        auto d0 = (__m128i*)(dst) + 0,
             d1 = (__m128i*)(dst) + 1,
             d2 = (__m128i*)(dst) + 2,
             d3 = (__m128i*)(dst) + 3;

        // AND all 16 pixels: if every alpha byte is 0xFF, all are opaque.
        // _mm_testc_si128 checks that ANDed covers every bit of alphaMask.
        auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
        if (_mm_testc_si128(ANDed, alphaMask)) {
            // All 16 source pixels are opaque.  SrcOver becomes Src.
            _mm_storeu_si128(d0, s0);
            _mm_storeu_si128(d1, s1);
            _mm_storeu_si128(d2, s2);
            _mm_storeu_si128(d3, s3);
            src += 16;
            dst += 16;
            len -= 16;
            continue;
        }

        // TODO: This math is wrong.
        // (SkPMSrcOver_SSE2 scales dst by (256 - srcA), which can differ by 1
        //  from the scalar SkPMSrcOver used in the tail loop — verify.)
        // Do SrcOver.
        _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
        _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
        _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
        _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));
        src += 16;
        dst += 16;
        len -= 16;
    }

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
    // Same structure as the SSE4.1 path above, but the all-transparent /
    // all-opaque checks are done with cmpeq + movemask (no ptest before 4.1).
    while (len >= 16) {
        // Load 16 source pixels.
        auto s0 = _mm_loadu_si128((const __m128i*)(src) + 0),
             s1 = _mm_loadu_si128((const __m128i*)(src) + 1),
             s2 = _mm_loadu_si128((const __m128i*)(src) + 2),
             s3 = _mm_loadu_si128((const __m128i*)(src) + 3);

        const auto alphaMask = _mm_set1_epi32(0xFF000000);

        // All alpha bytes zero <=> (ORed & alphaMask) compares equal-to-zero
        // in every byte, i.e. the 16-bit movemask is 0xffff.
        auto ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
        if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask),
                                                       _mm_setzero_si128()))) {
            // All 16 source pixels are transparent.  Nothing to do.
            src += 16;
            dst += 16;
            len -= 16;
            continue;
        }

        auto d0 = (__m128i*)(dst) + 0,
             d1 = (__m128i*)(dst) + 1,
             d2 = (__m128i*)(dst) + 2,
             d3 = (__m128i*)(dst) + 3;

        // All alpha bytes 0xFF <=> (ANDed & alphaMask) == alphaMask bytewise.
        auto ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
        if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask),
                                                       alphaMask))) {
            // All 16 source pixels are opaque.  SrcOver becomes Src.
            _mm_storeu_si128(d0, s0);
            _mm_storeu_si128(d1, s1);
            _mm_storeu_si128(d2, s2);
            _mm_storeu_si128(d3, s3);
            src += 16;
            dst += 16;
            len -= 16;
            continue;
        }

        // TODO: This math is wrong.
        // Do SrcOver.
        _mm_storeu_si128(d0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(d0)));
        _mm_storeu_si128(d1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(d1)));
        _mm_storeu_si128(d2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(d2)));
        _mm_storeu_si128(d3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(d3)));

        src += 16;
        dst += 16;
        len -= 16;
    }

#elif defined(SK_ARM_HAS_NEON)
    // Do 8-pixels at a time. A 16-pixels at a time version of this code was also tested, but it
    // underperformed on some of the platforms under test for inputs with frequent transitions of
    // alpha (corresponding to changes of the conditions [~]alpha_u64 == 0 below). It may be worth
    // revisiting the situation in the future.
    while (len >= 8) {
        // Load 8 pixels in 4 NEON registers. src_col.val[i] will contain the same color component
        // for 8 consecutive pixels (e.g. src_col.val[3] will contain all alpha components of 8
        // pixels).
        uint8x8x4_t src_col = vld4_u8(reinterpret_cast<const uint8_t*>(src));
        src += 8;
        len -= 8;

        // We now detect 2 special cases: the first occurs when all alphas are zero (the 8 pixels
        // are all transparent), the second when all alphas are fully set (they are all opaque).
        uint8x8_t alphas = src_col.val[3];
        // View the 8 alpha bytes as one 64-bit scalar for the cheap checks.
        uint64_t alphas_u64 = vget_lane_u64(vreinterpret_u64_u8(alphas), 0);
        if (alphas_u64 == 0) {
            // All pixels transparent.
            dst += 8;
            continue;
        }

        if (~alphas_u64 == 0) {
            // All pixels opaque.
            vst4_u8(reinterpret_cast<uint8_t*>(dst), src_col);
            dst += 8;
            continue;
        }

        // Mixed alphas: full planar SrcOver.
        uint8x8x4_t dst_col = vld4_u8(reinterpret_cast<uint8_t*>(dst));
        vst4_u8(reinterpret_cast<uint8_t*>(dst), SkPMSrcOver_neon8(dst_col, src_col));
        dst += 8;
    }

    // Deal with leftover pixels, two interleaved pixels per iteration.
    for (; len >= 2; len -= 2, src += 2, dst += 2) {
        uint8x8_t src2 = vld1_u8(reinterpret_cast<const uint8_t*>(src));
        uint8x8_t dst2 = vld1_u8(reinterpret_cast<uint8_t*>(dst));
        vst1_u8(reinterpret_cast<uint8_t*>(dst), SkPMSrcOver_neon2(dst2, src2));
    }

    if (len != 0) {
        // One final pixel: widen it into lane 0 of a 64-bit register, blend,
        // and store only that 32-bit lane back.
        uint8x8_t result = SkPMSrcOver_neon2(vcreate_u8(*dst), vcreate_u8(*src));
        vst1_lane_u32(dst, vreinterpret_u32_u8(result), 0);
    }
    return;
#endif

    // Scalar fallback (and tail for the SSE paths above).
    while (len-- > 0) {
        // This 0xFF000000 is not semantically necessary, but for compatibility
        // with chromium:611002 we need to keep it until we figure out where
        // the non-premultiplied src values (like 0x00FFFFFF) are coming from.
        // TODO(mtklein): sort this out and assert *src is premul here.
        if (*src & 0xFF000000) {
            // Alpha == 0xFF means opaque (>= 0xFF000000): plain Src copy.
            *dst = (*src >= 0xFF000000) ? *src : SkPMSrcOver(*src, *dst);
        }
        src++;
        dst++;
    }
}
280
mtklein4a37d082015-09-10 10:38:02 -0700281} // SK_OPTS_NS
282
283#endif//SkBlitRow_opts_DEFINED