blob: 391b24c8673809c9a40a0225be52129af2369237 [file] [log] [blame]
/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
7
commit-bot@chromium.org8c4953c2014-04-30 14:58:46 +00008#include <emmintrin.h>
caryclark@google.com83ecdc32012-06-06 12:10:26 +00009#include "SkBitmapProcState_opts_SSE2.h"
commit-bot@chromium.org8c4953c2014-04-30 14:58:46 +000010#include "SkBlitRow_opts_SSE2.h"
senorblanco@chromium.org92727612009-11-04 20:51:06 +000011#include "SkColorPriv.h"
commit-bot@chromium.org47591072014-02-19 03:09:52 +000012#include "SkColor_opts_SSE2.h"
commit-bot@chromium.org27580472014-03-07 03:25:32 +000013#include "SkDither.h"
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +000014#include "SkUtils.h"
senorblanco@chromium.org92727612009-11-04 20:51:06 +000015
senorblanco@chromium.org92727612009-11-04 20:51:06 +000016/* SSE2 version of S32_Blend_BlitRow32()
17 * portable version is in core/SkBlitRow_D32.cpp
18 */
senorblanco@chromium.org4e753552009-11-16 21:09:00 +000019void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
20 const SkPMColor* SK_RESTRICT src,
21 int count, U8CPU alpha) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +000022 SkASSERT(alpha <= 255);
23 if (count <= 0) {
24 return;
25 }
26
27 uint32_t src_scale = SkAlpha255To256(alpha);
28 uint32_t dst_scale = 256 - src_scale;
29
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000030 if (count >= 4) {
31 SkASSERT(((size_t)dst & 0x03) == 0);
32 while (((size_t)dst & 0x0F) != 0) {
33 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
34 src++;
35 dst++;
36 count--;
37 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +000038
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000039 const __m128i *s = reinterpret_cast<const __m128i*>(src);
40 __m128i *d = reinterpret_cast<__m128i*>(dst);
41 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
tomhudson@google.com98a5b422012-02-28 16:15:26 +000042 __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
43
44 // Move scale factors to upper byte of word
45 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
46 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000047 while (count >= 4) {
48 // Load 4 pixels each of src and dest.
49 __m128i src_pixel = _mm_loadu_si128(s);
50 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000051
tomhudson@google.com98a5b422012-02-28 16:15:26 +000052 // Interleave Atom port 0/1 operations based on the execution port
53 // constraints that multiply can only be executed on port 0 (while
54 // boolean operations can be executed on either port 0 or port 1)
55 // because GCC currently doesn't do a good job scheduling
56 // instructions based on these constraints.
57
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000058 // Get red and blue pixels into lower byte of each word.
tomhudson@google.com98a5b422012-02-28 16:15:26 +000059 // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000060 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000061
tomhudson@google.com98a5b422012-02-28 16:15:26 +000062 // Multiply by scale.
63 // (4 x (0, rs.h, 0, bs.h))
64 // where rs.h stands for the higher byte of r * scale, and
65 // bs.h the higher byte of b * scale.
66 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
67
68 // Get alpha and green pixels into higher byte of each word.
69 // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
70 __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000071
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000072 // Multiply by scale.
tomhudson@google.com98a5b422012-02-28 16:15:26 +000073 // (4 x (as.h, as.l, gs.h, gs.l))
74 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000075
tomhudson@google.com98a5b422012-02-28 16:15:26 +000076 // Clear the lower byte of the a*scale and g*scale results
77 // (4 x (as.h, 0, gs.h, 0))
78 src_ag = _mm_and_si128(src_ag, ag_mask);
79
80 // Operations the destination pixels are the same as on the
81 // source pixels. See the comments above.
82 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
83 dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
84 __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
85 dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
86 dst_ag = _mm_and_si128(dst_ag, ag_mask);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000087
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000088 // Combine back into RGBA.
tomhudson@google.com98a5b422012-02-28 16:15:26 +000089 // (4 x (as.h, rs.h, gs.h, bs.h))
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000090 src_pixel = _mm_or_si128(src_rb, src_ag);
91 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
92
93 // Add result
94 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
95 _mm_store_si128(d, result);
96 s++;
97 d++;
98 count -= 4;
99 }
100 src = reinterpret_cast<const SkPMColor*>(s);
101 dst = reinterpret_cast<SkPMColor*>(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000102 }
103
senorblanco@chromium.org4e753552009-11-16 21:09:00 +0000104 while (count > 0) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000105 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
106 src++;
107 dst++;
108 count--;
109 }
110}
111
senorblanco@chromium.org4e753552009-11-16 21:09:00 +0000112void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
113 const SkPMColor* SK_RESTRICT src,
114 int count, U8CPU alpha) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000115 SkASSERT(alpha == 255);
116 if (count <= 0) {
117 return;
118 }
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000119
120 if (count >= 4) {
121 SkASSERT(((size_t)dst & 0x03) == 0);
122 while (((size_t)dst & 0x0F) != 0) {
123 *dst = SkPMSrcOver(*src, *dst);
124 src++;
125 dst++;
126 count--;
127 }
128
129 const __m128i *s = reinterpret_cast<const __m128i*>(src);
130 __m128i *d = reinterpret_cast<__m128i*>(dst);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000131#ifdef SK_USE_ACCURATE_BLENDING
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000132 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
133 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)
134 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)
135 while (count >= 4) {
136 // Load 4 pixels
137 __m128i src_pixel = _mm_loadu_si128(s);
138 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000139
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000140 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
senorblanco@chromium.orgf3f0bd72009-12-10 22:46:31 +0000141 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000142 // Shift alphas down to lower 8 bits of each quad.
143 __m128i alpha = _mm_srli_epi32(src_pixel, 24);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000144
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000145 // Copy alpha to upper 3rd byte of each quad
146 alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000147
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000148 // Subtract alphas from 255, to get 0..255
149 alpha = _mm_sub_epi16(c_255, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000150
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000151 // Multiply by red and blue by src alpha.
152 dst_rb = _mm_mullo_epi16(dst_rb, alpha);
153 // Multiply by alpha and green by src alpha.
154 dst_ag = _mm_mullo_epi16(dst_ag, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000155
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000156 // dst_rb_low = (dst_rb >> 8)
157 __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
158 __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000159
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000160 // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
161 dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
162 dst_rb = _mm_add_epi16(dst_rb, c_128);
163 dst_rb = _mm_srli_epi16(dst_rb, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000164
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000165 // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
166 dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
167 dst_ag = _mm_add_epi16(dst_ag, c_128);
168 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000169
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000170 // Combine back into RGBA.
171 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000172
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000173 // Add result
174 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
175 _mm_store_si128(d, result);
176 s++;
177 d++;
178 count -= 4;
179 }
commit-bot@chromium.org8c4953c2014-04-30 14:58:46 +0000180#else
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000181 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
182 __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)
183 while (count >= 4) {
184 // Load 4 pixels
185 __m128i src_pixel = _mm_loadu_si128(s);
186 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000187
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000188 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
senorblanco@chromium.orgf3f0bd72009-12-10 22:46:31 +0000189 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000190
senorblanco@chromium.orgf3f0bd72009-12-10 22:46:31 +0000191 // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word)
192 __m128i alpha = _mm_srli_epi16(src_pixel, 8);
193
194 // (a0, a0, a1, a1, a2, g2, a3, g3)
195 alpha = _mm_shufflehi_epi16(alpha, 0xF5);
196
197 // (a0, a0, a1, a1, a2, a2, a3, a3)
198 alpha = _mm_shufflelo_epi16(alpha, 0xF5);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000199
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000200 // Subtract alphas from 256, to get 1..256
201 alpha = _mm_sub_epi16(c_256, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000202
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000203 // Multiply by red and blue by src alpha.
204 dst_rb = _mm_mullo_epi16(dst_rb, alpha);
205 // Multiply by alpha and green by src alpha.
206 dst_ag = _mm_mullo_epi16(dst_ag, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000207
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000208 // Divide by 256.
209 dst_rb = _mm_srli_epi16(dst_rb, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000210
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000211 // Mask out high bits (already in the right place)
212 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000213
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000214 // Combine back into RGBA.
215 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000216
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000217 // Add result
218 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
219 _mm_store_si128(d, result);
220 s++;
221 d++;
222 count -= 4;
223 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000224#endif
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000225 src = reinterpret_cast<const SkPMColor*>(s);
226 dst = reinterpret_cast<SkPMColor*>(d);
227 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000228
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000229 while (count > 0) {
230 *dst = SkPMSrcOver(*src, *dst);
231 src++;
232 dst++;
233 count--;
234 }
235}
236
senorblanco@chromium.org4e753552009-11-16 21:09:00 +0000237void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
238 const SkPMColor* SK_RESTRICT src,
239 int count, U8CPU alpha) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000240 SkASSERT(alpha <= 255);
241 if (count <= 0) {
242 return;
243 }
244
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000245 if (count >= 4) {
246 while (((size_t)dst & 0x0F) != 0) {
247 *dst = SkBlendARGB32(*src, *dst, alpha);
248 src++;
249 dst++;
250 count--;
251 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000252
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000253 uint32_t src_scale = SkAlpha255To256(alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000254
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000255 const __m128i *s = reinterpret_cast<const __m128i*>(src);
256 __m128i *d = reinterpret_cast<__m128i*>(dst);
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000257 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000258 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
259 __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit)
260 while (count >= 4) {
261 // Load 4 pixels each of src and dest.
262 __m128i src_pixel = _mm_loadu_si128(s);
263 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000264
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000265 // Get red and blue pixels into lower byte of each word.
266 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
267 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000268
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000269 // Get alpha and green into lower byte of each word.
270 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
271 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000272
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000273 // Put per-pixel alpha in low byte of each word.
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000274 // After the following two statements, the dst_alpha looks like
275 // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000276 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
277 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000278
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000279 // dst_alpha = dst_alpha * src_scale
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000280 // Because src_scales are in the higher byte of each word and
281 // we use mulhi here, the resulting alpha values are already
282 // in the right place and don't need to be divided by 256.
283 // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
284 dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000285
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000286 // Subtract alphas from 256, to get 1..256
287 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000288
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000289 // Multiply red and blue by dst pixel alpha.
290 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
291 // Multiply alpha and green by dst pixel alpha.
292 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000293
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000294 // Multiply red and blue by global alpha.
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000295 // (4 x (0, rs.h, 0, bs.h))
296 // where rs.h stands for the higher byte of r * src_scale,
297 // and bs.h the higher byte of b * src_scale.
298 // Again, because we use mulhi, the resuling red and blue
299 // values are already in the right place and don't need to
300 // be divided by 256.
301 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000302 // Multiply alpha and green by global alpha.
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000303 // (4 x (0, as.h, 0, gs.h))
304 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000305
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000306 // Divide by 256.
307 dst_rb = _mm_srli_epi16(dst_rb, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000308
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000309 // Mask out low bits (goodies already in the right place; no need to divide)
310 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000311 // Shift alpha and green to higher byte of each word.
312 // (4 x (as.h, 0, gs.h, 0))
313 src_ag = _mm_slli_epi16(src_ag, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000314
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000315 // Combine back into RGBA.
316 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
317 src_pixel = _mm_or_si128(src_rb, src_ag);
318
319 // Add two pixels into result.
320 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
321 _mm_store_si128(d, result);
322 s++;
323 d++;
324 count -= 4;
325 }
326 src = reinterpret_cast<const SkPMColor*>(s);
327 dst = reinterpret_cast<SkPMColor*>(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000328 }
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000329
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000330 while (count > 0) {
331 *dst = SkBlendARGB32(*src, *dst, alpha);
332 src++;
333 dst++;
334 count--;
335 }
336}
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000337
338/* SSE2 version of Color32()
339 * portable version is in core/SkBlitRow_D32.cpp
340 */
341void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
342 SkPMColor color) {
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000343 if (count <= 0) {
344 return;
345 }
346
347 if (0 == color) {
348 if (src != dst) {
349 memcpy(dst, src, count * sizeof(SkPMColor));
350 }
reed@google.comc909a1e2011-10-25 19:07:23 +0000351 return;
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000352 }
353
354 unsigned colorA = SkGetPackedA32(color);
355 if (255 == colorA) {
356 sk_memset32(dst, color, count);
357 } else {
358 unsigned scale = 256 - SkAlpha255To256(colorA);
359
360 if (count >= 4) {
361 SkASSERT(((size_t)dst & 0x03) == 0);
362 while (((size_t)dst & 0x0F) != 0) {
363 *dst = color + SkAlphaMulQ(*src, scale);
364 src++;
365 dst++;
366 count--;
367 }
368
369 const __m128i *s = reinterpret_cast<const __m128i*>(src);
370 __m128i *d = reinterpret_cast<__m128i*>(dst);
371 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
372 __m128i src_scale_wide = _mm_set1_epi16(scale);
373 __m128i color_wide = _mm_set1_epi32(color);
374 while (count >= 4) {
375 // Load 4 pixels each of src and dest.
376 __m128i src_pixel = _mm_loadu_si128(s);
377
378 // Get red and blue pixels into lower byte of each word.
379 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
reed@google.com981d4792011-03-09 12:55:47 +0000380
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000381 // Get alpha and green into lower byte of each word.
382 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
383
384 // Multiply by scale.
385 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
386 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
387
388 // Divide by 256.
389 src_rb = _mm_srli_epi16(src_rb, 8);
390 src_ag = _mm_andnot_si128(rb_mask, src_ag);
391
392 // Combine back into RGBA.
393 src_pixel = _mm_or_si128(src_rb, src_ag);
394
395 // Add color to result.
396 __m128i result = _mm_add_epi8(color_wide, src_pixel);
397
398 // Store result.
399 _mm_store_si128(d, result);
400 s++;
401 d++;
402 count -= 4;
403 }
404 src = reinterpret_cast<const SkPMColor*>(s);
405 dst = reinterpret_cast<SkPMColor*>(d);
commit-bot@chromium.org8c4953c2014-04-30 14:58:46 +0000406 }
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000407
408 while (count > 0) {
409 *dst = color + SkAlphaMulQ(*src, scale);
410 src += 1;
411 dst += 1;
412 count--;
reed@google.com981d4792011-03-09 12:55:47 +0000413 }
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000414 }
415}
reed@google.com981d4792011-03-09 12:55:47 +0000416
reed@google.comedb606c2011-10-18 13:56:50 +0000417void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
418 size_t maskRB, SkColor origColor,
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000419 int width, int height) {
reed@google.comee467ee2011-03-09 13:23:57 +0000420 SkPMColor color = SkPreMultiplyColor(origColor);
reed@google.com981d4792011-03-09 12:55:47 +0000421 size_t dstOffset = dstRB - (width << 2);
422 size_t maskOffset = maskRB - width;
423 SkPMColor* dst = (SkPMColor *)device;
reed@google.comedb606c2011-10-18 13:56:50 +0000424 const uint8_t* mask = (const uint8_t*)maskPtr;
reed@google.com981d4792011-03-09 12:55:47 +0000425 do {
426 int count = width;
427 if (count >= 4) {
428 while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
429 *dst = SkBlendARGB32(color, *dst, *mask);
430 mask++;
431 dst++;
432 count--;
433 }
434 __m128i *d = reinterpret_cast<__m128i*>(dst);
435 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
436 __m128i c_256 = _mm_set1_epi16(256);
437 __m128i c_1 = _mm_set1_epi16(1);
438 __m128i src_pixel = _mm_set1_epi32(color);
439 while (count >= 4) {
440 // Load 4 pixels each of src and dest.
441 __m128i dst_pixel = _mm_load_si128(d);
442
443 //set the aphla value
444 __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\
445 0, *(mask+3),0, \
446 *(mask+2),0, *(mask+2),\
447 0,*(mask+1), 0,*(mask+1),\
448 0, *mask,0,*mask);
449
450 //call SkAlpha255To256()
451 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
452
453 // Get red and blue pixels into lower byte of each word.
454 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
455 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
456
457 // Get alpha and green into lower byte of each word.
458 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
459 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
460
461 // Put per-pixel alpha in low byte of each word.
462 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
463 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
464
465 // dst_alpha = dst_alpha * src_scale
466 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
467
468 // Divide by 256.
469 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
470
471 // Subtract alphas from 256, to get 1..256
472 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
473 // Multiply red and blue by dst pixel alpha.
474 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
475 // Multiply alpha and green by dst pixel alpha.
476 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
477
478 // Multiply red and blue by global alpha.
479 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
480 // Multiply alpha and green by global alpha.
481 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
482 // Divide by 256.
483 dst_rb = _mm_srli_epi16(dst_rb, 8);
484 src_rb = _mm_srli_epi16(src_rb, 8);
485
486 // Mask out low bits (goodies already in the right place; no need to divide)
487 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
488 src_ag = _mm_andnot_si128(rb_mask, src_ag);
489
490 // Combine back into RGBA.
491 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
492 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
493
494 // Add two pixels into result.
495 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
496 _mm_store_si128(d, result);
497 // load the next 4 pixel
498 mask = mask + 4;
499 d++;
500 count -= 4;
501 }
502 dst = reinterpret_cast<SkPMColor *>(d);
503 }
commit-bot@chromium.org8c4953c2014-04-30 14:58:46 +0000504 while (count > 0) {
reed@google.com981d4792011-03-09 12:55:47 +0000505 *dst= SkBlendARGB32(color, *dst, *mask);
506 dst += 1;
507 mask++;
508 count --;
509 }
510 dst = (SkPMColor *)((char*)dst + dstOffset);
511 mask += maskOffset;
512 } while (--height != 0);
513}
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000514
// The following (left) shifts cause the top 5 bits of the mask components to
// line up with the corresponding components in an SkPMColor.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
// A negative value means the component has to be shifted right instead; the
// SkPacked*16x5ToUnmasked*32x5_SSE2 macros below pick the matching intrinsic.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

#if SK_R16x5_R32x5_SHIFT == 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#elif SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#endif

#if SK_G16x5_G32x5_SHIFT == 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#elif SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#endif

#if SK_B16x5_B32x5_SHIFT == 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#elif SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#endif
545
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000546static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
547 __m128i &mask, __m128i &srcA) {
548 // In the following comments, the components of src, dst and mask are
549 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
550 // by an R, G, B, or A suffix. Components of one of the four pixels that
551 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
552 // example is the blue channel of the second destination pixel. Memory
553 // layout is shown for an ARGB byte order in a color value.
554
555 // src and srcA store 8-bit values interleaved with zeros.
556 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
557 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
558 // srcA, 0, srcA, 0, srcA, 0, srcA, 0)
559 // mask stores 16-bit values (compressed three channels) interleaved with zeros.
560 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
561 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
562 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
563
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000564 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000565 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000566 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
567 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000568
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000569 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000570 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
571 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000572
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000573 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000574 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
575 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000576
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000577 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000578 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
579 // 8-bit position
580 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
581 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000582 mask = _mm_or_si128(_mm_or_si128(r, g), b);
583
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000584 // Interleave R,G,B into the lower byte of word.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000585 // i.e. split the sixteen 8-bit values from mask into two sets of eight
586 // 16-bit values, padded by zero.
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000587 __m128i maskLo, maskHi;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000588 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000589 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000590 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000591 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
592
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000593 // Upscale from 0..31 to 0..32
594 // (allows to replace division by left-shift further down)
595 // Left-shift each component by 4 and add the result back to that component,
596 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000597 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
598 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
599
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000600 // Multiply each component of maskLo and maskHi by srcA
601 maskLo = _mm_mullo_epi16(maskLo, srcA);
602 maskHi = _mm_mullo_epi16(maskHi, srcA);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000603
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000604 // Left shift mask components by 8 (divide by 256)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000605 maskLo = _mm_srli_epi16(maskLo, 8);
606 maskHi = _mm_srli_epi16(maskHi, 8);
607
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000608 // Interleave R,G,B into the lower byte of the word
609 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000610 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000611 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000612 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
613
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000614 // mask = (src - dst) * mask
615 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
616 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000617
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000618 // mask = (src - dst) * mask >> 5
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000619 maskLo = _mm_srai_epi16(maskLo, 5);
620 maskHi = _mm_srai_epi16(maskHi, 5);
621
622 // Add two pixels into result.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000623 // result = dst + ((src - dst) * mask >> 5)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000624 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
625 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
626
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000627 // Pack into 4 32bit dst pixels.
628 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
629 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
630 // clamping to 255 if necessary.
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000631 return _mm_packus_epi16(resultLo, resultHi);
632}
633
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000634static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000635 __m128i &mask) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000636 // In the following comments, the components of src, dst and mask are
637 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
638 // by an R, G, B, or A suffix. Components of one of the four pixels that
639 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
640 // example is the blue channel of the second destination pixel. Memory
641 // layout is shown for an ARGB byte order in a color value.
642
643 // src and srcA store 8-bit values interleaved with zeros.
644 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
645 // mask stores 16-bit values (shown as high and low bytes) interleaved with
646 // zeros
647 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
648 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
649
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000650 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000651 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000652 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
653 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000654
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000655 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000656 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
657 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000658
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000659 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000660 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
661 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000662
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000663 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000664 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
665 // 8-bit position
666 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
667 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000668 mask = _mm_or_si128(_mm_or_si128(r, g), b);
669
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000670 // Interleave R,G,B into the lower byte of word.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000671 // i.e. split the sixteen 8-bit values from mask into two sets of eight
672 // 16-bit values, padded by zero.
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000673 __m128i maskLo, maskHi;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000674 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000675 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000676 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000677 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
678
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000679 // Upscale from 0..31 to 0..32
680 // (allows to replace division by left-shift further down)
681 // Left-shift each component by 4 and add the result back to that component,
682 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000683 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
684 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
685
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000686 // Interleave R,G,B into the lower byte of the word
687 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000688 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000689 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000690 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
691
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000692 // mask = (src - dst) * mask
693 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
694 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000695
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000696 // mask = (src - dst) * mask >> 5
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000697 maskLo = _mm_srai_epi16(maskLo, 5);
698 maskHi = _mm_srai_epi16(maskHi, 5);
699
700 // Add two pixels into result.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000701 // result = dst + ((src - dst) * mask >> 5)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000702 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
703 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
704
bungeman@google.com27123cd2012-08-21 19:25:42 +0000705 // Pack into 4 32bit dst pixels and force opaque.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000706 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
707 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
708 // clamping to 255 if necessary. Set alpha components to 0xFF.
bungeman@google.com27123cd2012-08-21 19:25:42 +0000709 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
710 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000711}
712
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000713void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
714 SkColor src, int width, SkPMColor) {
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000715 if (width <= 0) {
716 return;
717 }
718
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000719 int srcA = SkColorGetA(src);
720 int srcR = SkColorGetR(src);
721 int srcG = SkColorGetG(src);
722 int srcB = SkColorGetB(src);
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000723
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000724 srcA = SkAlpha255To256(srcA);
725
726 if (width >= 4) {
727 SkASSERT(((size_t)dst & 0x03) == 0);
728 while (((size_t)dst & 0x0F) != 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000729 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
730 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000731 dst++;
732 width--;
733 }
734
735 __m128i *d = reinterpret_cast<__m128i*>(dst);
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000736 // Set alpha to 0xFF and replicate source four times in SSE register.
737 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
738 // Interleave with zeros to get two sets of four 16-bit values.
739 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
740 // Set srcA_sse to contain eight copies of srcA, padded with zero.
741 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
742 __m128i srcA_sse = _mm_set1_epi16(srcA);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000743 while (width >= 4) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000744 // Load four destination pixels into dst_sse.
745 __m128i dst_sse = _mm_load_si128(d);
746 // Load four 16-bit masks into lower half of mask_sse.
747 __m128i mask_sse = _mm_loadl_epi64(
748 reinterpret_cast<const __m128i*>(mask));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000749
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000750 // Check whether masks are equal to 0 and get the highest bit
751 // of each byte of result, if masks are all zero, we will get
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000752 // pack_cmp to 0xFFFF
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000753 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000754 _mm_setzero_si128()));
755
756 // if mask pixels are not all zero, we will blend the dst pixels
757 if (pack_cmp != 0xFFFF) {
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000758 // Unpack 4 16bit mask pixels to
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000759 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
760 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
761 mask_sse = _mm_unpacklo_epi16(mask_sse,
762 _mm_setzero_si128());
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000763
764 // Process 4 32bit dst pixels
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000765 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
766 mask_sse, srcA_sse);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000767 _mm_store_si128(d, result);
768 }
769
770 d++;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000771 mask += 4;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000772 width -= 4;
773 }
774
775 dst = reinterpret_cast<SkPMColor*>(d);
776 }
777
778 while (width > 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000779 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
780 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000781 dst++;
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000782 width--;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000783 }
784}
785
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000786void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
787 SkColor src, int width, SkPMColor opaqueDst) {
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000788 if (width <= 0) {
789 return;
790 }
791
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000792 int srcR = SkColorGetR(src);
793 int srcG = SkColorGetG(src);
794 int srcB = SkColorGetB(src);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000795
796 if (width >= 4) {
797 SkASSERT(((size_t)dst & 0x03) == 0);
798 while (((size_t)dst & 0x0F) != 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000799 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
800 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000801 dst++;
802 width--;
803 }
804
805 __m128i *d = reinterpret_cast<__m128i*>(dst);
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000806 // Set alpha to 0xFF and replicate source four times in SSE register.
807 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
808 // Set srcA_sse to contain eight copies of srcA, padded with zero.
809 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
810 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000811 while (width >= 4) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000812 // Load four destination pixels into dst_sse.
813 __m128i dst_sse = _mm_load_si128(d);
814 // Load four 16-bit masks into lower half of mask_sse.
815 __m128i mask_sse = _mm_loadl_epi64(
816 reinterpret_cast<const __m128i*>(mask));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000817
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000818 // Check whether masks are equal to 0 and get the highest bit
819 // of each byte of result, if masks are all zero, we will get
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000820 // pack_cmp to 0xFFFF
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000821 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000822 _mm_setzero_si128()));
823
824 // if mask pixels are not all zero, we will blend the dst pixels
825 if (pack_cmp != 0xFFFF) {
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000826 // Unpack 4 16bit mask pixels to
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000827 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
828 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
829 mask_sse = _mm_unpacklo_epi16(mask_sse,
830 _mm_setzero_si128());
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000831
832 // Process 4 32bit dst pixels
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000833 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
834 mask_sse);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000835 _mm_store_si128(d, result);
836 }
837
838 d++;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000839 mask += 4;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000840 width -= 4;
841 }
842
843 dst = reinterpret_cast<SkPMColor*>(d);
844 }
845
846 while (width > 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000847 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
848 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000849 dst++;
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000850 width--;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000851 }
852}
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000853
commit-bot@chromium.org39ce33a2014-02-24 04:23:39 +0000854/* SSE2 version of S32_D565_Opaque()
855 * portable version is in core/SkBlitRow_D16.cpp
856 */
857void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
858 const SkPMColor* SK_RESTRICT src, int count,
859 U8CPU alpha, int /*x*/, int /*y*/) {
860 SkASSERT(255 == alpha);
861
862 if (count <= 0) {
863 return;
864 }
865
866 if (count >= 8) {
867 while (((size_t)dst & 0x0F) != 0) {
868 SkPMColor c = *src++;
869 SkPMColorAssert(c);
870
871 *dst++ = SkPixel32ToPixel16_ToU16(c);
872 count--;
873 }
874
875 const __m128i* s = reinterpret_cast<const __m128i*>(src);
876 __m128i* d = reinterpret_cast<__m128i*>(dst);
877 __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK);
878 __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK);
879 __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK);
880
881 while (count >= 8) {
882 // Load 8 pixels of src.
883 __m128i src_pixel1 = _mm_loadu_si128(s++);
884 __m128i src_pixel2 = _mm_loadu_si128(s++);
885
886 // Calculate result r.
887 __m128i r1 = _mm_srli_epi32(src_pixel1,
888 SK_R32_SHIFT + (8 - SK_R16_BITS));
889 r1 = _mm_and_si128(r1, r16_mask);
890 __m128i r2 = _mm_srli_epi32(src_pixel2,
891 SK_R32_SHIFT + (8 - SK_R16_BITS));
892 r2 = _mm_and_si128(r2, r16_mask);
893 __m128i r = _mm_packs_epi32(r1, r2);
894
895 // Calculate result g.
896 __m128i g1 = _mm_srli_epi32(src_pixel1,
897 SK_G32_SHIFT + (8 - SK_G16_BITS));
898 g1 = _mm_and_si128(g1, g16_mask);
899 __m128i g2 = _mm_srli_epi32(src_pixel2,
900 SK_G32_SHIFT + (8 - SK_G16_BITS));
901 g2 = _mm_and_si128(g2, g16_mask);
902 __m128i g = _mm_packs_epi32(g1, g2);
903
904 // Calculate result b.
905 __m128i b1 = _mm_srli_epi32(src_pixel1,
906 SK_B32_SHIFT + (8 - SK_B16_BITS));
907 b1 = _mm_and_si128(b1, b16_mask);
908 __m128i b2 = _mm_srli_epi32(src_pixel2,
909 SK_B32_SHIFT + (8 - SK_B16_BITS));
910 b2 = _mm_and_si128(b2, b16_mask);
911 __m128i b = _mm_packs_epi32(b1, b2);
912
913 // Store 8 16-bit colors in dst.
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000914 __m128i d_pixel = SkPackRGB16_SSE2(r, g, b);
commit-bot@chromium.org39ce33a2014-02-24 04:23:39 +0000915 _mm_store_si128(d++, d_pixel);
916 count -= 8;
917 }
918 src = reinterpret_cast<const SkPMColor*>(s);
919 dst = reinterpret_cast<uint16_t*>(d);
920 }
921
922 if (count > 0) {
923 do {
924 SkPMColor c = *src++;
925 SkPMColorAssert(c);
926 *dst++ = SkPixel32ToPixel16_ToU16(c);
927 } while (--count != 0);
928 }
929}
930
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000931/* SSE2 version of S32A_D565_Opaque()
932 * portable version is in core/SkBlitRow_D16.cpp
933 */
934void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
935 const SkPMColor* SK_RESTRICT src,
936 int count, U8CPU alpha, int /*x*/, int /*y*/) {
937 SkASSERT(255 == alpha);
938
939 if (count <= 0) {
940 return;
941 }
942
943 if (count >= 8) {
944 // Make dst 16 bytes alignment
945 while (((size_t)dst & 0x0F) != 0) {
946 SkPMColor c = *src++;
947 if (c) {
948 *dst = SkSrcOver32To16(c, *dst);
949 }
950 dst += 1;
951 count--;
952 }
953
954 const __m128i* s = reinterpret_cast<const __m128i*>(src);
955 __m128i* d = reinterpret_cast<__m128i*>(dst);
956 __m128i var255 = _mm_set1_epi16(255);
957 __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
958 __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
959 __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
960
961 while (count >= 8) {
962 // Load 8 pixels of src.
963 __m128i src_pixel1 = _mm_loadu_si128(s++);
964 __m128i src_pixel2 = _mm_loadu_si128(s++);
965
966 // Check whether src pixels are equal to 0 and get the highest bit
967 // of each byte of result, if src pixels are all zero, src_cmp1 and
968 // src_cmp2 will be 0xFFFF.
969 int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
970 _mm_setzero_si128()));
971 int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
972 _mm_setzero_si128()));
973 if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
974 d++;
975 count -= 8;
976 continue;
977 }
978
979 // Load 8 pixels of dst.
980 __m128i dst_pixel = _mm_load_si128(d);
981
982 // Extract A from src.
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000983 __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000984 sa1 = _mm_srli_epi32(sa1, 24);
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000985 __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000986 sa2 = _mm_srli_epi32(sa2, 24);
987 __m128i sa = _mm_packs_epi32(sa1, sa2);
988
989 // Extract R from src.
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000990 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000991 sr1 = _mm_srli_epi32(sr1, 24);
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000992 __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000993 sr2 = _mm_srli_epi32(sr2, 24);
994 __m128i sr = _mm_packs_epi32(sr1, sr2);
995
996 // Extract G from src.
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000997 __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000998 sg1 = _mm_srli_epi32(sg1, 24);
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000999 __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
commit-bot@chromium.org47591072014-02-19 03:09:52 +00001000 sg2 = _mm_srli_epi32(sg2, 24);
1001 __m128i sg = _mm_packs_epi32(sg1, sg2);
1002
1003 // Extract B from src.
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +00001004 __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
commit-bot@chromium.org47591072014-02-19 03:09:52 +00001005 sb1 = _mm_srli_epi32(sb1, 24);
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +00001006 __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
commit-bot@chromium.org47591072014-02-19 03:09:52 +00001007 sb2 = _mm_srli_epi32(sb2, 24);
1008 __m128i sb = _mm_packs_epi32(sb1, sb2);
1009
1010 // Extract R G B from dst.
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +00001011 __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
commit-bot@chromium.org47591072014-02-19 03:09:52 +00001012 dr = _mm_and_si128(dr, r16_mask);
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +00001013 __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
commit-bot@chromium.org47591072014-02-19 03:09:52 +00001014 dg = _mm_and_si128(dg, g16_mask);
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +00001015 __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
commit-bot@chromium.org47591072014-02-19 03:09:52 +00001016 db = _mm_and_si128(db, b16_mask);
1017
1018 __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
1019
1020 // Calculate R G B of result.
1021 // Original algorithm is in SkSrcOver32To16().
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +00001022 dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
commit-bot@chromium.org47591072014-02-19 03:09:52 +00001023 dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +00001024 dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
commit-bot@chromium.org47591072014-02-19 03:09:52 +00001025 dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +00001026 db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
commit-bot@chromium.org47591072014-02-19 03:09:52 +00001027 db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
1028
1029 // Pack R G B into 16-bit color.
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +00001030 __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
commit-bot@chromium.org47591072014-02-19 03:09:52 +00001031
1032 // Store 8 16-bit colors in dst.
1033 _mm_store_si128(d++, d_pixel);
1034 count -= 8;
1035 }
1036
1037 src = reinterpret_cast<const SkPMColor*>(s);
1038 dst = reinterpret_cast<uint16_t*>(d);
1039 }
1040
1041 if (count > 0) {
1042 do {
1043 SkPMColor c = *src++;
1044 SkPMColorAssert(c);
1045 if (c) {
1046 *dst = SkSrcOver32To16(c, *dst);
1047 }
1048 dst += 1;
1049 } while (--count != 0);
1050 }
1051}
commit-bot@chromium.org27580472014-03-07 03:25:32 +00001052
1053void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
1054 const SkPMColor* SK_RESTRICT src,
1055 int count, U8CPU alpha, int x, int y) {
1056 SkASSERT(255 == alpha);
1057
1058 if (count <= 0) {
1059 return;
1060 }
1061
1062 if (count >= 8) {
1063 while (((size_t)dst & 0x0F) != 0) {
1064 DITHER_565_SCAN(y);
1065 SkPMColor c = *src++;
1066 SkPMColorAssert(c);
1067
1068 unsigned dither = DITHER_VALUE(x);
1069 *dst++ = SkDitherRGB32To565(c, dither);
1070 DITHER_INC_X(x);
1071 count--;
1072 }
1073
1074 unsigned short dither_value[8];
1075 __m128i dither;
1076#ifdef ENABLE_DITHER_MATRIX_4X4
1077 const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
1078 dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
1079 dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
1080 dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
1081 dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
1082#else
1083 const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
1084 dither_value[0] = dither_value[4] = (dither_scan
1085 >> (((x) & 3) << 2)) & 0xF;
1086 dither_value[1] = dither_value[5] = (dither_scan
1087 >> (((x + 1) & 3) << 2)) & 0xF;
1088 dither_value[2] = dither_value[6] = (dither_scan
1089 >> (((x + 2) & 3) << 2)) & 0xF;
1090 dither_value[3] = dither_value[7] = (dither_scan
1091 >> (((x + 3) & 3) << 2)) & 0xF;
1092#endif
1093 dither = _mm_loadu_si128((__m128i*) dither_value);
1094
1095 const __m128i* s = reinterpret_cast<const __m128i*>(src);
1096 __m128i* d = reinterpret_cast<__m128i*>(dst);
1097
1098 while (count >= 8) {
1099 // Load 8 pixels of src.
1100 __m128i src_pixel1 = _mm_loadu_si128(s++);
1101 __m128i src_pixel2 = _mm_loadu_si128(s++);
1102
1103 // Extract R from src.
1104 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
1105 sr1 = _mm_srli_epi32(sr1, 24);
1106 __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
1107 sr2 = _mm_srli_epi32(sr2, 24);
1108 __m128i sr = _mm_packs_epi32(sr1, sr2);
1109
1110 // SkDITHER_R32To565(sr, dither)
1111 __m128i sr_offset = _mm_srli_epi16(sr, 5);
1112 sr = _mm_add_epi16(sr, dither);
1113 sr = _mm_sub_epi16(sr, sr_offset);
1114 sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
1115
1116 // Extract G from src.
1117 __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
1118 sg1 = _mm_srli_epi32(sg1, 24);
1119 __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
1120 sg2 = _mm_srli_epi32(sg2, 24);
1121 __m128i sg = _mm_packs_epi32(sg1, sg2);
1122
1123 // SkDITHER_R32To565(sg, dither)
1124 __m128i sg_offset = _mm_srli_epi16(sg, 6);
1125 sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
1126 sg = _mm_sub_epi16(sg, sg_offset);
1127 sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
1128
1129 // Extract B from src.
1130 __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
1131 sb1 = _mm_srli_epi32(sb1, 24);
1132 __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
1133 sb2 = _mm_srli_epi32(sb2, 24);
1134 __m128i sb = _mm_packs_epi32(sb1, sb2);
1135
1136 // SkDITHER_R32To565(sb, dither)
1137 __m128i sb_offset = _mm_srli_epi16(sb, 5);
1138 sb = _mm_add_epi16(sb, dither);
1139 sb = _mm_sub_epi16(sb, sb_offset);
1140 sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
1141
1142 // Pack and store 16-bit dst pixel.
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +00001143 __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
commit-bot@chromium.org27580472014-03-07 03:25:32 +00001144 _mm_store_si128(d++, d_pixel);
1145
1146 count -= 8;
1147 x += 8;
1148 }
1149
1150 src = reinterpret_cast<const SkPMColor*>(s);
1151 dst = reinterpret_cast<uint16_t*>(d);
1152 }
1153
1154 if (count > 0) {
1155 DITHER_565_SCAN(y);
1156 do {
1157 SkPMColor c = *src++;
1158 SkPMColorAssert(c);
1159
1160 unsigned dither = DITHER_VALUE(x);
1161 *dst++ = SkDitherRGB32To565(c, dither);
1162 DITHER_INC_X(x);
1163 } while (--count != 0);
1164 }
1165}
commit-bot@chromium.orgfe089b32014-03-07 13:24:42 +00001166
1167/* SSE2 version of S32A_D565_Opaque_Dither()
1168 * portable version is in core/SkBlitRow_D16.cpp
1169 */
1170void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
1171 const SkPMColor* SK_RESTRICT src,
1172 int count, U8CPU alpha, int x, int y) {
1173 SkASSERT(255 == alpha);
1174
1175 if (count <= 0) {
1176 return;
1177 }
1178
1179 if (count >= 8) {
1180 while (((size_t)dst & 0x0F) != 0) {
1181 DITHER_565_SCAN(y);
1182 SkPMColor c = *src++;
1183 SkPMColorAssert(c);
1184 if (c) {
1185 unsigned a = SkGetPackedA32(c);
1186
1187 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
1188
1189 unsigned sr = SkGetPackedR32(c);
1190 unsigned sg = SkGetPackedG32(c);
1191 unsigned sb = SkGetPackedB32(c);
1192 sr = SkDITHER_R32_FOR_565(sr, d);
1193 sg = SkDITHER_G32_FOR_565(sg, d);
1194 sb = SkDITHER_B32_FOR_565(sb, d);
1195
1196 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1197 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1198 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1199 // now src and dst expanded are in g:11 r:10 x:1 b:10
1200 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1201 }
1202 dst += 1;
1203 DITHER_INC_X(x);
1204 count--;
1205 }
1206
1207 unsigned short dither_value[8];
1208 __m128i dither, dither_cur;
1209#ifdef ENABLE_DITHER_MATRIX_4X4
1210 const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
1211 dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
1212 dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
1213 dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
1214 dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
1215#else
1216 const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
1217 dither_value[0] = dither_value[4] = (dither_scan
1218 >> (((x) & 3) << 2)) & 0xF;
1219 dither_value[1] = dither_value[5] = (dither_scan
1220 >> (((x + 1) & 3) << 2)) & 0xF;
1221 dither_value[2] = dither_value[6] = (dither_scan
1222 >> (((x + 2) & 3) << 2)) & 0xF;
1223 dither_value[3] = dither_value[7] = (dither_scan
1224 >> (((x + 3) & 3) << 2)) & 0xF;
1225#endif
1226 dither = _mm_loadu_si128((__m128i*) dither_value);
1227
1228 const __m128i* s = reinterpret_cast<const __m128i*>(src);
1229 __m128i* d = reinterpret_cast<__m128i*>(dst);
1230 __m128i var256 = _mm_set1_epi16(256);
1231 __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
1232 __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
1233 __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
1234
1235 while (count >= 8) {
1236 // Load 8 pixels of src and dst.
1237 __m128i src_pixel1 = _mm_loadu_si128(s++);
1238 __m128i src_pixel2 = _mm_loadu_si128(s++);
1239 __m128i dst_pixel = _mm_load_si128(d);
1240
1241 // Extract A from src.
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +00001242 __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
commit-bot@chromium.orgfe089b32014-03-07 13:24:42 +00001243 sa1 = _mm_srli_epi32(sa1, 24);
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +00001244 __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
commit-bot@chromium.orgfe089b32014-03-07 13:24:42 +00001245 sa2 = _mm_srli_epi32(sa2, 24);
1246 __m128i sa = _mm_packs_epi32(sa1, sa2);
1247
1248 // Calculate current dither value.
1249 dither_cur = _mm_mullo_epi16(dither,
1250 _mm_add_epi16(sa, _mm_set1_epi16(1)));
1251 dither_cur = _mm_srli_epi16(dither_cur, 8);
1252
1253 // Extract R from src.
1254 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
1255 sr1 = _mm_srli_epi32(sr1, 24);
1256 __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
1257 sr2 = _mm_srli_epi32(sr2, 24);
1258 __m128i sr = _mm_packs_epi32(sr1, sr2);
1259
1260 // SkDITHER_R32_FOR_565(sr, d)
1261 __m128i sr_offset = _mm_srli_epi16(sr, 5);
1262 sr = _mm_add_epi16(sr, dither_cur);
1263 sr = _mm_sub_epi16(sr, sr_offset);
1264
1265 // Expand sr.
1266 sr = _mm_slli_epi16(sr, 2);
1267
1268 // Extract G from src.
1269 __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
1270 sg1 = _mm_srli_epi32(sg1, 24);
1271 __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
1272 sg2 = _mm_srli_epi32(sg2, 24);
1273 __m128i sg = _mm_packs_epi32(sg1, sg2);
1274
1275 // sg = SkDITHER_G32_FOR_565(sg, d).
1276 __m128i sg_offset = _mm_srli_epi16(sg, 6);
1277 sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
1278 sg = _mm_sub_epi16(sg, sg_offset);
1279
1280 // Expand sg.
1281 sg = _mm_slli_epi16(sg, 3);
1282
1283 // Extract B from src.
1284 __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
1285 sb1 = _mm_srli_epi32(sb1, 24);
1286 __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
1287 sb2 = _mm_srli_epi32(sb2, 24);
1288 __m128i sb = _mm_packs_epi32(sb1, sb2);
1289
1290 // sb = SkDITHER_B32_FOR_565(sb, d).
1291 __m128i sb_offset = _mm_srli_epi16(sb, 5);
1292 sb = _mm_add_epi16(sb, dither_cur);
1293 sb = _mm_sub_epi16(sb, sb_offset);
1294
1295 // Expand sb.
1296 sb = _mm_slli_epi16(sb, 2);
1297
1298 // Extract R G B from dst.
1299 __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
1300 dr = _mm_and_si128(dr, r16_mask);
1301 __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
1302 dg = _mm_and_si128(dg, g16_mask);
1303 __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
1304 db = _mm_and_si128(db, b16_mask);
1305
1306 // SkAlpha255To256(255 - a) >> 3
1307 __m128i isa = _mm_sub_epi16(var256, sa);
1308 isa = _mm_srli_epi16(isa, 3);
1309
1310 dr = _mm_mullo_epi16(dr, isa);
1311 dr = _mm_add_epi16(dr, sr);
1312 dr = _mm_srli_epi16(dr, 5);
1313
1314 dg = _mm_mullo_epi16(dg, isa);
1315 dg = _mm_add_epi16(dg, sg);
1316 dg = _mm_srli_epi16(dg, 5);
1317
1318 db = _mm_mullo_epi16(db, isa);
1319 db = _mm_add_epi16(db, sb);
1320 db = _mm_srli_epi16(db, 5);
1321
1322 // Package and store dst pixel.
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +00001323 __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
commit-bot@chromium.orgfe089b32014-03-07 13:24:42 +00001324 _mm_store_si128(d++, d_pixel);
1325
1326 count -= 8;
1327 x += 8;
1328 }
1329
1330 src = reinterpret_cast<const SkPMColor*>(s);
1331 dst = reinterpret_cast<uint16_t*>(d);
1332 }
1333
1334 if (count > 0) {
1335 DITHER_565_SCAN(y);
1336 do {
1337 SkPMColor c = *src++;
1338 SkPMColorAssert(c);
1339 if (c) {
1340 unsigned a = SkGetPackedA32(c);
1341
1342 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
1343
1344 unsigned sr = SkGetPackedR32(c);
1345 unsigned sg = SkGetPackedG32(c);
1346 unsigned sb = SkGetPackedB32(c);
1347 sr = SkDITHER_R32_FOR_565(sr, d);
1348 sg = SkDITHER_G32_FOR_565(sg, d);
1349 sb = SkDITHER_B32_FOR_565(sb, d);
1350
1351 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1352 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1353 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1354 // now src and dst expanded are in g:11 r:10 x:1 b:10
1355 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1356 }
1357 dst += 1;
1358 DITHER_INC_X(x);
1359 } while (--count != 0);
1360 }
1361}