blob: de99894282b8fdd6adaa96db1be3b2966d93ce53 [file] [log] [blame]
/*
 * Copyright 2011 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
reed@google.com58af9a62011-10-12 13:43:52 +00007
reed@android.comc4cae852009-09-23 15:06:10 +00008#include "SkBlitRow.h"
reed@google.com58af9a62011-10-12 13:43:52 +00009#include "SkBlitMask.h"
reed@android.comc4cae852009-09-23 15:06:10 +000010#include "SkColorPriv.h"
11#include "SkUtils.h"
12
djsollen@google.com57f49692011-02-23 20:46:31 +000013#define UNROLL
14
reed@android.comc4cae852009-09-23 15:06:10 +000015static void S32_Opaque_BlitRow32(SkPMColor* SK_RESTRICT dst,
16 const SkPMColor* SK_RESTRICT src,
17 int count, U8CPU alpha) {
18 SkASSERT(255 == alpha);
commit-bot@chromium.orgf0ea77a2014-05-21 12:43:07 +000019 sk_memcpy32(dst, src, count);
reed@android.comc4cae852009-09-23 15:06:10 +000020}
21
22static void S32_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst,
23 const SkPMColor* SK_RESTRICT src,
24 int count, U8CPU alpha) {
25 SkASSERT(alpha <= 255);
26 if (count > 0) {
27 unsigned src_scale = SkAlpha255To256(alpha);
28 unsigned dst_scale = 256 - src_scale;
djsollen@google.com57f49692011-02-23 20:46:31 +000029
30#ifdef UNROLL
31 if (count & 1) {
32 *dst = SkAlphaMulQ(*(src++), src_scale) + SkAlphaMulQ(*dst, dst_scale);
33 dst += 1;
34 count -= 1;
35 }
36
37 const SkPMColor* SK_RESTRICT srcEnd = src + count;
38 while (src != srcEnd) {
39 *dst = SkAlphaMulQ(*(src++), src_scale) + SkAlphaMulQ(*dst, dst_scale);
40 dst += 1;
41 *dst = SkAlphaMulQ(*(src++), src_scale) + SkAlphaMulQ(*dst, dst_scale);
42 dst += 1;
43 }
44#else
reed@android.comc4cae852009-09-23 15:06:10 +000045 do {
46 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
47 src += 1;
48 dst += 1;
49 } while (--count > 0);
djsollen@google.com57f49692011-02-23 20:46:31 +000050#endif
reed@android.comc4cae852009-09-23 15:06:10 +000051 }
52}
53
reed@android.comc4cae852009-09-23 15:06:10 +000054static void S32A_Opaque_BlitRow32(SkPMColor* SK_RESTRICT dst,
55 const SkPMColor* SK_RESTRICT src,
56 int count, U8CPU alpha) {
57 SkASSERT(255 == alpha);
58 if (count > 0) {
djsollen@google.com57f49692011-02-23 20:46:31 +000059#ifdef UNROLL
60 if (count & 1) {
61 *dst = SkPMSrcOver(*(src++), *dst);
62 dst += 1;
63 count -= 1;
64 }
65
66 const SkPMColor* SK_RESTRICT srcEnd = src + count;
67 while (src != srcEnd) {
68 *dst = SkPMSrcOver(*(src++), *dst);
69 dst += 1;
70 *dst = SkPMSrcOver(*(src++), *dst);
71 dst += 1;
72 }
73#else
reed@android.comc4cae852009-09-23 15:06:10 +000074 do {
reed@android.comc4cae852009-09-23 15:06:10 +000075 *dst = SkPMSrcOver(*src, *dst);
reed@android.comc4cae852009-09-23 15:06:10 +000076 src += 1;
77 dst += 1;
78 } while (--count > 0);
djsollen@google.com57f49692011-02-23 20:46:31 +000079#endif
reed@android.comc4cae852009-09-23 15:06:10 +000080 }
81}
82
83static void S32A_Blend_BlitRow32(SkPMColor* SK_RESTRICT dst,
84 const SkPMColor* SK_RESTRICT src,
85 int count, U8CPU alpha) {
86 SkASSERT(alpha <= 255);
87 if (count > 0) {
djsollen@google.com57f49692011-02-23 20:46:31 +000088#ifdef UNROLL
89 if (count & 1) {
90 *dst = SkBlendARGB32(*(src++), *dst, alpha);
91 dst += 1;
92 count -= 1;
93 }
94
95 const SkPMColor* SK_RESTRICT srcEnd = src + count;
96 while (src != srcEnd) {
97 *dst = SkBlendARGB32(*(src++), *dst, alpha);
98 dst += 1;
99 *dst = SkBlendARGB32(*(src++), *dst, alpha);
100 dst += 1;
101 }
102#else
reed@android.comc4cae852009-09-23 15:06:10 +0000103 do {
104 *dst = SkBlendARGB32(*src, *dst, alpha);
105 src += 1;
106 dst += 1;
107 } while (--count > 0);
djsollen@google.com57f49692011-02-23 20:46:31 +0000108#endif
reed@android.comc4cae852009-09-23 15:06:10 +0000109 }
110}
111
112///////////////////////////////////////////////////////////////////////////////
113
114static const SkBlitRow::Proc32 gDefault_Procs32[] = {
115 S32_Opaque_BlitRow32,
116 S32_Blend_BlitRow32,
117 S32A_Opaque_BlitRow32,
118 S32A_Blend_BlitRow32
119};
120
121SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) {
122 SkASSERT(flags < SK_ARRAY_COUNT(gDefault_Procs32));
123 // just so we don't crash
124 flags &= kFlags32_Mask;
reed@google.com981d4792011-03-09 12:55:47 +0000125
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000126 SkBlitRow::Proc32 proc = PlatformProcs32(flags);
reed@android.comc4cae852009-09-23 15:06:10 +0000127 if (NULL == proc) {
128 proc = gDefault_Procs32[flags];
129 }
130 SkASSERT(proc);
131 return proc;
132}
133
mtklein95cc0122015-04-27 15:11:01 -0700134// Color32 uses the blend_256_round_alt algorithm from tests/BlendTest.cpp.
135// It's not quite perfect, but it's never wrong in the interesting edge cases,
136// and it's quite a bit faster than blend_perfect.
mtkleina4a0aeb2015-04-21 08:09:30 -0700137//
138// blend_256_round_alt is our currently blessed algorithm. Please use it or an analogous one.
mtklein95cc0122015-04-27 15:11:01 -0700139void SkBlitRow::Color32(SkPMColor dst[], const SkPMColor src[], int count, SkPMColor color) {
mtkleinafe2ffb2015-04-17 11:00:54 -0700140 switch (SkGetPackedA32(color)) {
141 case 0: memmove(dst, src, count * sizeof(SkPMColor)); return;
142 case 255: sk_memset32(dst, color, count); return;
143 }
144
145 unsigned invA = 255 - SkGetPackedA32(color);
mtkleina4a0aeb2015-04-21 08:09:30 -0700146 invA += invA >> 7;
mtklein95cc0122015-04-27 15:11:01 -0700147 SkASSERT(invA < 256); // We've already handled alpha == 0 above.
148
149#if defined(SK_ARM_HAS_NEON)
150 uint16x8_t colorHigh = vshll_n_u8((uint8x8_t)vdup_n_u32(color), 8);
151 uint16x8_t colorAndRound = vaddq_u16(colorHigh, vdupq_n_u16(128));
152 uint8x8_t invA8 = vdup_n_u8(invA);
153
154 // Does the core work of blending color onto 4 pixels, returning the resulting 4 pixels.
155 auto kernel = [&](const uint32x4_t& src4) -> uint32x4_t {
156 uint16x8_t lo = vmull_u8(vget_low_u8( (uint8x16_t)src4), invA8),
157 hi = vmull_u8(vget_high_u8((uint8x16_t)src4), invA8);
158 return (uint32x4_t)
159 vcombine_u8(vaddhn_u16(colorAndRound, lo), vaddhn_u16(colorAndRound, hi));
160 };
161
162 while (count >= 8) {
163 uint32x4_t dst0 = kernel(vld1q_u32(src+0)),
164 dst4 = kernel(vld1q_u32(src+4));
165 vst1q_u32(dst+0, dst0);
166 vst1q_u32(dst+4, dst4);
167 src += 8;
168 dst += 8;
169 count -= 8;
170 }
171 if (count >= 4) {
172 vst1q_u32(dst, kernel(vld1q_u32(src)));
173 src += 4;
174 dst += 4;
175 count -= 4;
176 }
177 if (count >= 2) {
178 uint32x2_t src2 = vld1_u32(src);
179 vst1_u32(dst, vget_low_u32(kernel(vcombine_u32(src2, src2))));
180 src += 2;
181 dst += 2;
182 count -= 2;
183 }
184 if (count >= 1) {
185 vst1q_lane_u32(dst, kernel(vdupq_n_u32(*src)), 0);
186 }
187
188#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
189 __m128i colorHigh = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_set1_epi32(color));
190 __m128i colorAndRound = _mm_add_epi16(colorHigh, _mm_set1_epi16(128));
191 __m128i invA16 = _mm_set1_epi16(invA);
192
193 // Does the core work of blending color onto 4 pixels, returning the resulting 4 pixels.
194 auto kernel = [&](const __m128i& src4) -> __m128i {
195 __m128i lo = _mm_mullo_epi16(invA16, _mm_unpacklo_epi8(src4, _mm_setzero_si128())),
196 hi = _mm_mullo_epi16(invA16, _mm_unpackhi_epi8(src4, _mm_setzero_si128()));
197 return _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(colorAndRound, lo), 8),
198 _mm_srli_epi16(_mm_add_epi16(colorAndRound, hi), 8));
199 };
200
201 while (count >= 8) {
202 __m128i dst0 = kernel(_mm_loadu_si128((const __m128i*)(src+0))),
203 dst4 = kernel(_mm_loadu_si128((const __m128i*)(src+4)));
204 _mm_storeu_si128((__m128i*)(dst+0), dst0);
205 _mm_storeu_si128((__m128i*)(dst+4), dst4);
206 src += 8;
207 dst += 8;
208 count -= 8;
209 }
210 if (count >= 4) {
211 _mm_storeu_si128((__m128i*)dst, kernel(_mm_loadu_si128((const __m128i*)src)));
212 src += 4;
213 dst += 4;
214 count -= 4;
215 }
216 if (count >= 2) {
217 _mm_storel_epi64((__m128i*)dst, kernel(_mm_loadl_epi64((const __m128i*)src)));
218 src += 2;
219 dst += 2;
220 count -= 2;
221 }
222 if (count >= 1) {
223 *dst = _mm_cvtsi128_si32(kernel(_mm_cvtsi32_si128(*src)));
224 }
225#else // Neither NEON nor SSE2.
mtkleina4a0aeb2015-04-21 08:09:30 -0700226 unsigned round = (128 << 16) + (128 << 0);
mtkleina4a0aeb2015-04-21 08:09:30 -0700227
mtkleinafe2ffb2015-04-17 11:00:54 -0700228 while (count --> 0) {
229 // Our math is 16-bit, so we can do a little bit of SIMD in 32-bit registers.
230 const uint32_t mask = 0x00FF00FF;
mtkleina4a0aeb2015-04-21 08:09:30 -0700231 uint32_t rb = (((*src >> 0) & mask) * invA + round) >> 8, // _r_b
232 ag = (((*src >> 8) & mask) * invA + round) >> 0; // a_g_
233 *dst = color + ((rb & mask) | (ag & ~mask));
mtkleinafe2ffb2015-04-17 11:00:54 -0700234 src++;
235 dst++;
reed@android.comc4cae852009-09-23 15:06:10 +0000236 }
mtklein95cc0122015-04-27 15:11:01 -0700237#endif
reed@android.comc4cae852009-09-23 15:06:10 +0000238}
239