blob: f3d010e3bc4502f1c54d4cb9b09488c6494a1a98 [file] [log] [blame]
senorblanco@chromium.org92727612009-11-04 20:51:06 +00001/*
tomhudson@google.com98a5b422012-02-28 16:15:26 +00002 * Copyright 2012 The Android Open Source Project
epoger@google.comec3ed6a2011-07-28 14:26:00 +00003 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
senorblanco@chromium.org92727612009-11-04 20:51:06 +00006 */
7
epoger@google.comec3ed6a2011-07-28 14:26:00 +00008
senorblanco@chromium.org4e753552009-11-16 21:09:00 +00009#include "SkBlitRow_opts_SSE2.h"
caryclark@google.com83ecdc32012-06-06 12:10:26 +000010#include "SkBitmapProcState_opts_SSE2.h"
senorblanco@chromium.org92727612009-11-04 20:51:06 +000011#include "SkColorPriv.h"
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +000012#include "SkUtils.h"
senorblanco@chromium.org92727612009-11-04 20:51:06 +000013
14#include <emmintrin.h>
15
senorblanco@chromium.org92727612009-11-04 20:51:06 +000016/* SSE2 version of S32_Blend_BlitRow32()
17 * portable version is in core/SkBlitRow_D32.cpp
18 */
senorblanco@chromium.org4e753552009-11-16 21:09:00 +000019void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
20 const SkPMColor* SK_RESTRICT src,
21 int count, U8CPU alpha) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +000022 SkASSERT(alpha <= 255);
23 if (count <= 0) {
24 return;
25 }
26
27 uint32_t src_scale = SkAlpha255To256(alpha);
28 uint32_t dst_scale = 256 - src_scale;
29
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000030 if (count >= 4) {
31 SkASSERT(((size_t)dst & 0x03) == 0);
32 while (((size_t)dst & 0x0F) != 0) {
33 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
34 src++;
35 dst++;
36 count--;
37 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +000038
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000039 const __m128i *s = reinterpret_cast<const __m128i*>(src);
40 __m128i *d = reinterpret_cast<__m128i*>(dst);
41 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
tomhudson@google.com98a5b422012-02-28 16:15:26 +000042 __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
43
44 // Move scale factors to upper byte of word
45 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
46 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000047 while (count >= 4) {
48 // Load 4 pixels each of src and dest.
49 __m128i src_pixel = _mm_loadu_si128(s);
50 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000051
tomhudson@google.com98a5b422012-02-28 16:15:26 +000052 // Interleave Atom port 0/1 operations based on the execution port
53 // constraints that multiply can only be executed on port 0 (while
54 // boolean operations can be executed on either port 0 or port 1)
55 // because GCC currently doesn't do a good job scheduling
56 // instructions based on these constraints.
57
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000058 // Get red and blue pixels into lower byte of each word.
tomhudson@google.com98a5b422012-02-28 16:15:26 +000059 // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000060 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000061
tomhudson@google.com98a5b422012-02-28 16:15:26 +000062 // Multiply by scale.
63 // (4 x (0, rs.h, 0, bs.h))
64 // where rs.h stands for the higher byte of r * scale, and
65 // bs.h the higher byte of b * scale.
66 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
67
68 // Get alpha and green pixels into higher byte of each word.
69 // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
70 __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000071
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000072 // Multiply by scale.
tomhudson@google.com98a5b422012-02-28 16:15:26 +000073 // (4 x (as.h, as.l, gs.h, gs.l))
74 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000075
tomhudson@google.com98a5b422012-02-28 16:15:26 +000076 // Clear the lower byte of the a*scale and g*scale results
77 // (4 x (as.h, 0, gs.h, 0))
78 src_ag = _mm_and_si128(src_ag, ag_mask);
79
80 // Operations the destination pixels are the same as on the
81 // source pixels. See the comments above.
82 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
83 dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
84 __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
85 dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
86 dst_ag = _mm_and_si128(dst_ag, ag_mask);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000087
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000088 // Combine back into RGBA.
tomhudson@google.com98a5b422012-02-28 16:15:26 +000089 // (4 x (as.h, rs.h, gs.h, bs.h))
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000090 src_pixel = _mm_or_si128(src_rb, src_ag);
91 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
92
93 // Add result
94 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
95 _mm_store_si128(d, result);
96 s++;
97 d++;
98 count -= 4;
99 }
100 src = reinterpret_cast<const SkPMColor*>(s);
101 dst = reinterpret_cast<SkPMColor*>(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000102 }
103
senorblanco@chromium.org4e753552009-11-16 21:09:00 +0000104 while (count > 0) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000105 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
106 src++;
107 dst++;
108 count--;
109 }
110}
111
senorblanco@chromium.org4e753552009-11-16 21:09:00 +0000112void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
113 const SkPMColor* SK_RESTRICT src,
114 int count, U8CPU alpha) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000115 SkASSERT(alpha == 255);
116 if (count <= 0) {
117 return;
118 }
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000119
120 if (count >= 4) {
121 SkASSERT(((size_t)dst & 0x03) == 0);
122 while (((size_t)dst & 0x0F) != 0) {
123 *dst = SkPMSrcOver(*src, *dst);
124 src++;
125 dst++;
126 count--;
127 }
128
129 const __m128i *s = reinterpret_cast<const __m128i*>(src);
130 __m128i *d = reinterpret_cast<__m128i*>(dst);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000131#ifdef SK_USE_ACCURATE_BLENDING
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000132 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
133 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)
134 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)
135 while (count >= 4) {
136 // Load 4 pixels
137 __m128i src_pixel = _mm_loadu_si128(s);
138 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000139
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000140 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
senorblanco@chromium.orgf3f0bd72009-12-10 22:46:31 +0000141 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000142 // Shift alphas down to lower 8 bits of each quad.
143 __m128i alpha = _mm_srli_epi32(src_pixel, 24);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000144
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000145 // Copy alpha to upper 3rd byte of each quad
146 alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000147
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000148 // Subtract alphas from 255, to get 0..255
149 alpha = _mm_sub_epi16(c_255, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000150
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000151 // Multiply by red and blue by src alpha.
152 dst_rb = _mm_mullo_epi16(dst_rb, alpha);
153 // Multiply by alpha and green by src alpha.
154 dst_ag = _mm_mullo_epi16(dst_ag, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000155
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000156 // dst_rb_low = (dst_rb >> 8)
157 __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
158 __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000159
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000160 // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
161 dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
162 dst_rb = _mm_add_epi16(dst_rb, c_128);
163 dst_rb = _mm_srli_epi16(dst_rb, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000164
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000165 // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
166 dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
167 dst_ag = _mm_add_epi16(dst_ag, c_128);
168 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000169
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000170 // Combine back into RGBA.
171 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000172
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000173 // Add result
174 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
175 _mm_store_si128(d, result);
176 s++;
177 d++;
178 count -= 4;
179 }
180 #else
181 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
182 __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)
183 while (count >= 4) {
184 // Load 4 pixels
185 __m128i src_pixel = _mm_loadu_si128(s);
186 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000187
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000188 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
senorblanco@chromium.orgf3f0bd72009-12-10 22:46:31 +0000189 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000190
senorblanco@chromium.orgf3f0bd72009-12-10 22:46:31 +0000191 // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word)
192 __m128i alpha = _mm_srli_epi16(src_pixel, 8);
193
194 // (a0, a0, a1, a1, a2, g2, a3, g3)
195 alpha = _mm_shufflehi_epi16(alpha, 0xF5);
196
197 // (a0, a0, a1, a1, a2, a2, a3, a3)
198 alpha = _mm_shufflelo_epi16(alpha, 0xF5);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000199
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000200 // Subtract alphas from 256, to get 1..256
201 alpha = _mm_sub_epi16(c_256, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000202
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000203 // Multiply by red and blue by src alpha.
204 dst_rb = _mm_mullo_epi16(dst_rb, alpha);
205 // Multiply by alpha and green by src alpha.
206 dst_ag = _mm_mullo_epi16(dst_ag, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000207
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000208 // Divide by 256.
209 dst_rb = _mm_srli_epi16(dst_rb, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000210
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000211 // Mask out high bits (already in the right place)
212 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000213
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000214 // Combine back into RGBA.
215 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000216
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000217 // Add result
218 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
219 _mm_store_si128(d, result);
220 s++;
221 d++;
222 count -= 4;
223 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000224#endif
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000225 src = reinterpret_cast<const SkPMColor*>(s);
226 dst = reinterpret_cast<SkPMColor*>(d);
227 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000228
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000229 while (count > 0) {
230 *dst = SkPMSrcOver(*src, *dst);
231 src++;
232 dst++;
233 count--;
234 }
235}
236
senorblanco@chromium.org4e753552009-11-16 21:09:00 +0000237void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
238 const SkPMColor* SK_RESTRICT src,
239 int count, U8CPU alpha) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000240 SkASSERT(alpha <= 255);
241 if (count <= 0) {
242 return;
243 }
244
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000245 if (count >= 4) {
246 while (((size_t)dst & 0x0F) != 0) {
247 *dst = SkBlendARGB32(*src, *dst, alpha);
248 src++;
249 dst++;
250 count--;
251 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000252
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000253 uint32_t src_scale = SkAlpha255To256(alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000254
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000255 const __m128i *s = reinterpret_cast<const __m128i*>(src);
256 __m128i *d = reinterpret_cast<__m128i*>(dst);
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000257 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000258 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
259 __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit)
260 while (count >= 4) {
261 // Load 4 pixels each of src and dest.
262 __m128i src_pixel = _mm_loadu_si128(s);
263 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000264
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000265 // Get red and blue pixels into lower byte of each word.
266 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
267 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000268
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000269 // Get alpha and green into lower byte of each word.
270 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
271 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000272
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000273 // Put per-pixel alpha in low byte of each word.
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000274 // After the following two statements, the dst_alpha looks like
275 // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000276 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
277 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000278
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000279 // dst_alpha = dst_alpha * src_scale
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000280 // Because src_scales are in the higher byte of each word and
281 // we use mulhi here, the resulting alpha values are already
282 // in the right place and don't need to be divided by 256.
283 // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
284 dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000285
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000286 // Subtract alphas from 256, to get 1..256
287 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000288
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000289 // Multiply red and blue by dst pixel alpha.
290 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
291 // Multiply alpha and green by dst pixel alpha.
292 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000293
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000294 // Multiply red and blue by global alpha.
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000295 // (4 x (0, rs.h, 0, bs.h))
296 // where rs.h stands for the higher byte of r * src_scale,
297 // and bs.h the higher byte of b * src_scale.
298 // Again, because we use mulhi, the resuling red and blue
299 // values are already in the right place and don't need to
300 // be divided by 256.
301 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000302 // Multiply alpha and green by global alpha.
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000303 // (4 x (0, as.h, 0, gs.h))
304 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000305
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000306 // Divide by 256.
307 dst_rb = _mm_srli_epi16(dst_rb, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000308
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000309 // Mask out low bits (goodies already in the right place; no need to divide)
310 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000311 // Shift alpha and green to higher byte of each word.
312 // (4 x (as.h, 0, gs.h, 0))
313 src_ag = _mm_slli_epi16(src_ag, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000314
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000315 // Combine back into RGBA.
316 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
317 src_pixel = _mm_or_si128(src_rb, src_ag);
318
319 // Add two pixels into result.
320 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
321 _mm_store_si128(d, result);
322 s++;
323 d++;
324 count -= 4;
325 }
326 src = reinterpret_cast<const SkPMColor*>(s);
327 dst = reinterpret_cast<SkPMColor*>(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000328 }
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000329
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000330 while (count > 0) {
331 *dst = SkBlendARGB32(*src, *dst, alpha);
332 src++;
333 dst++;
334 count--;
335 }
336}
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000337
338/* SSE2 version of Color32()
339 * portable version is in core/SkBlitRow_D32.cpp
340 */
341void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
342 SkPMColor color) {
343
344 if (count <= 0) {
345 return;
346 }
347
348 if (0 == color) {
349 if (src != dst) {
350 memcpy(dst, src, count * sizeof(SkPMColor));
351 }
reed@google.comc909a1e2011-10-25 19:07:23 +0000352 return;
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000353 }
354
355 unsigned colorA = SkGetPackedA32(color);
356 if (255 == colorA) {
357 sk_memset32(dst, color, count);
358 } else {
359 unsigned scale = 256 - SkAlpha255To256(colorA);
360
361 if (count >= 4) {
362 SkASSERT(((size_t)dst & 0x03) == 0);
363 while (((size_t)dst & 0x0F) != 0) {
364 *dst = color + SkAlphaMulQ(*src, scale);
365 src++;
366 dst++;
367 count--;
368 }
369
370 const __m128i *s = reinterpret_cast<const __m128i*>(src);
371 __m128i *d = reinterpret_cast<__m128i*>(dst);
372 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
373 __m128i src_scale_wide = _mm_set1_epi16(scale);
374 __m128i color_wide = _mm_set1_epi32(color);
375 while (count >= 4) {
376 // Load 4 pixels each of src and dest.
377 __m128i src_pixel = _mm_loadu_si128(s);
378
379 // Get red and blue pixels into lower byte of each word.
380 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
reed@google.com981d4792011-03-09 12:55:47 +0000381
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000382 // Get alpha and green into lower byte of each word.
383 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
384
385 // Multiply by scale.
386 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
387 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
388
389 // Divide by 256.
390 src_rb = _mm_srli_epi16(src_rb, 8);
391 src_ag = _mm_andnot_si128(rb_mask, src_ag);
392
393 // Combine back into RGBA.
394 src_pixel = _mm_or_si128(src_rb, src_ag);
395
396 // Add color to result.
397 __m128i result = _mm_add_epi8(color_wide, src_pixel);
398
399 // Store result.
400 _mm_store_si128(d, result);
401 s++;
402 d++;
403 count -= 4;
404 }
405 src = reinterpret_cast<const SkPMColor*>(s);
406 dst = reinterpret_cast<SkPMColor*>(d);
407 }
408
409 while (count > 0) {
410 *dst = color + SkAlphaMulQ(*src, scale);
411 src += 1;
412 dst += 1;
413 count--;
reed@google.com981d4792011-03-09 12:55:47 +0000414 }
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000415 }
416}
reed@google.com981d4792011-03-09 12:55:47 +0000417
reed@google.comedb606c2011-10-18 13:56:50 +0000418void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
419 size_t maskRB, SkColor origColor,
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000420 int width, int height) {
reed@google.comee467ee2011-03-09 13:23:57 +0000421 SkPMColor color = SkPreMultiplyColor(origColor);
reed@google.com981d4792011-03-09 12:55:47 +0000422 size_t dstOffset = dstRB - (width << 2);
423 size_t maskOffset = maskRB - width;
424 SkPMColor* dst = (SkPMColor *)device;
reed@google.comedb606c2011-10-18 13:56:50 +0000425 const uint8_t* mask = (const uint8_t*)maskPtr;
reed@google.com981d4792011-03-09 12:55:47 +0000426 do {
427 int count = width;
428 if (count >= 4) {
429 while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
430 *dst = SkBlendARGB32(color, *dst, *mask);
431 mask++;
432 dst++;
433 count--;
434 }
435 __m128i *d = reinterpret_cast<__m128i*>(dst);
436 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
437 __m128i c_256 = _mm_set1_epi16(256);
438 __m128i c_1 = _mm_set1_epi16(1);
439 __m128i src_pixel = _mm_set1_epi32(color);
440 while (count >= 4) {
441 // Load 4 pixels each of src and dest.
442 __m128i dst_pixel = _mm_load_si128(d);
443
444 //set the aphla value
445 __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\
446 0, *(mask+3),0, \
447 *(mask+2),0, *(mask+2),\
448 0,*(mask+1), 0,*(mask+1),\
449 0, *mask,0,*mask);
450
451 //call SkAlpha255To256()
452 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
453
454 // Get red and blue pixels into lower byte of each word.
455 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
456 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
457
458 // Get alpha and green into lower byte of each word.
459 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
460 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
461
462 // Put per-pixel alpha in low byte of each word.
463 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
464 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
465
466 // dst_alpha = dst_alpha * src_scale
467 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
468
469 // Divide by 256.
470 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
471
472 // Subtract alphas from 256, to get 1..256
473 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
474 // Multiply red and blue by dst pixel alpha.
475 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
476 // Multiply alpha and green by dst pixel alpha.
477 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
478
479 // Multiply red and blue by global alpha.
480 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
481 // Multiply alpha and green by global alpha.
482 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
483 // Divide by 256.
484 dst_rb = _mm_srli_epi16(dst_rb, 8);
485 src_rb = _mm_srli_epi16(src_rb, 8);
486
487 // Mask out low bits (goodies already in the right place; no need to divide)
488 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
489 src_ag = _mm_andnot_si128(rb_mask, src_ag);
490
491 // Combine back into RGBA.
492 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
493 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
494
495 // Add two pixels into result.
496 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
497 _mm_store_si128(d, result);
498 // load the next 4 pixel
499 mask = mask + 4;
500 d++;
501 count -= 4;
502 }
503 dst = reinterpret_cast<SkPMColor *>(d);
504 }
505 while(count > 0) {
506 *dst= SkBlendARGB32(color, *dst, *mask);
507 dst += 1;
508 mask++;
509 count --;
510 }
511 dst = (SkPMColor *)((char*)dst + dstOffset);
512 mask += maskOffset;
513 } while (--height != 0);
514}
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000515
// Per-channel shift that moves the top 5 bits of a component of a 16-bit
// mask value onto the bit position of the matching SkPMColor component.
// A positive value means shift left, a negative value means shift right.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Each macro below applies the corresponding shift to all four 32-bit lanes
// of x. The result is not masked: neighbouring bits travel along and must be
// cleared by the caller.

// Red
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

// Green
#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

// Blue
#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
546
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000547static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
548 __m128i &mask, __m128i &srcA) {
549 // In the following comments, the components of src, dst and mask are
550 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
551 // by an R, G, B, or A suffix. Components of one of the four pixels that
552 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
553 // example is the blue channel of the second destination pixel. Memory
554 // layout is shown for an ARGB byte order in a color value.
555
556 // src and srcA store 8-bit values interleaved with zeros.
557 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
558 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
559 // srcA, 0, srcA, 0, srcA, 0, srcA, 0)
560 // mask stores 16-bit values (compressed three channels) interleaved with zeros.
561 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
562 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
563 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
564
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000565 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000566 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000567 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
568 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000569
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000570 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000571 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
572 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000573
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000574 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000575 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
576 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000577
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000578 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000579 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
580 // 8-bit position
581 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
582 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000583 mask = _mm_or_si128(_mm_or_si128(r, g), b);
584
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000585 // Interleave R,G,B into the lower byte of word.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000586 // i.e. split the sixteen 8-bit values from mask into two sets of eight
587 // 16-bit values, padded by zero.
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000588 __m128i maskLo, maskHi;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000589 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000590 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000591 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000592 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
593
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000594 // Upscale from 0..31 to 0..32
595 // (allows to replace division by left-shift further down)
596 // Left-shift each component by 4 and add the result back to that component,
597 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000598 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
599 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
600
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000601 // Multiply each component of maskLo and maskHi by srcA
602 maskLo = _mm_mullo_epi16(maskLo, srcA);
603 maskHi = _mm_mullo_epi16(maskHi, srcA);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000604
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000605 // Left shift mask components by 8 (divide by 256)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000606 maskLo = _mm_srli_epi16(maskLo, 8);
607 maskHi = _mm_srli_epi16(maskHi, 8);
608
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000609 // Interleave R,G,B into the lower byte of the word
610 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000611 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000612 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000613 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
614
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000615 // mask = (src - dst) * mask
616 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
617 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000618
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000619 // mask = (src - dst) * mask >> 5
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000620 maskLo = _mm_srai_epi16(maskLo, 5);
621 maskHi = _mm_srai_epi16(maskHi, 5);
622
623 // Add two pixels into result.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000624 // result = dst + ((src - dst) * mask >> 5)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000625 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
626 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
627
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000628 // Pack into 4 32bit dst pixels.
629 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
630 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
631 // clamping to 255 if necessary.
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000632 return _mm_packus_epi16(resultLo, resultHi);
633}
634
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000635static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000636 __m128i &mask) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000637 // In the following comments, the components of src, dst and mask are
638 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
639 // by an R, G, B, or A suffix. Components of one of the four pixels that
640 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
641 // example is the blue channel of the second destination pixel. Memory
642 // layout is shown for an ARGB byte order in a color value.
643
644 // src and srcA store 8-bit values interleaved with zeros.
645 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
646 // mask stores 16-bit values (shown as high and low bytes) interleaved with
647 // zeros
648 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
649 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
650
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000651 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000652 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000653 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
654 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000655
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000656 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000657 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
658 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000659
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000660 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000661 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
662 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000663
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000664 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000665 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
666 // 8-bit position
667 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
668 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000669 mask = _mm_or_si128(_mm_or_si128(r, g), b);
670
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000671 // Interleave R,G,B into the lower byte of word.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000672 // i.e. split the sixteen 8-bit values from mask into two sets of eight
673 // 16-bit values, padded by zero.
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000674 __m128i maskLo, maskHi;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000675 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000676 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000677 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000678 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
679
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000680 // Upscale from 0..31 to 0..32
681 // (allows to replace division by left-shift further down)
682 // Left-shift each component by 4 and add the result back to that component,
683 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000684 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
685 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
686
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000687 // Interleave R,G,B into the lower byte of the word
688 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000689 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000690 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000691 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
692
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000693 // mask = (src - dst) * mask
694 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
695 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000696
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000697 // mask = (src - dst) * mask >> 5
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000698 maskLo = _mm_srai_epi16(maskLo, 5);
699 maskHi = _mm_srai_epi16(maskHi, 5);
700
701 // Add two pixels into result.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000702 // result = dst + ((src - dst) * mask >> 5)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000703 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
704 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
705
bungeman@google.com27123cd2012-08-21 19:25:42 +0000706 // Pack into 4 32bit dst pixels and force opaque.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000707 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
708 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
709 // clamping to 255 if necessary. Set alpha components to 0xFF.
bungeman@google.com27123cd2012-08-21 19:25:42 +0000710 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
711 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000712}
713
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000714void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
715 SkColor src, int width, SkPMColor) {
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000716 if (width <= 0) {
717 return;
718 }
719
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000720 int srcA = SkColorGetA(src);
721 int srcR = SkColorGetR(src);
722 int srcG = SkColorGetG(src);
723 int srcB = SkColorGetB(src);
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000724
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000725 srcA = SkAlpha255To256(srcA);
726
727 if (width >= 4) {
728 SkASSERT(((size_t)dst & 0x03) == 0);
729 while (((size_t)dst & 0x0F) != 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000730 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
731 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000732 dst++;
733 width--;
734 }
735
736 __m128i *d = reinterpret_cast<__m128i*>(dst);
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000737 // Set alpha to 0xFF and replicate source four times in SSE register.
738 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
739 // Interleave with zeros to get two sets of four 16-bit values.
740 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
741 // Set srcA_sse to contain eight copies of srcA, padded with zero.
742 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
743 __m128i srcA_sse = _mm_set1_epi16(srcA);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000744 while (width >= 4) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000745 // Load four destination pixels into dst_sse.
746 __m128i dst_sse = _mm_load_si128(d);
747 // Load four 16-bit masks into lower half of mask_sse.
748 __m128i mask_sse = _mm_loadl_epi64(
749 reinterpret_cast<const __m128i*>(mask));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000750
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000751 // Check whether masks are equal to 0 and get the highest bit
752 // of each byte of result, if masks are all zero, we will get
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000753 // pack_cmp to 0xFFFF
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000754 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000755 _mm_setzero_si128()));
756
757 // if mask pixels are not all zero, we will blend the dst pixels
758 if (pack_cmp != 0xFFFF) {
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000759 // Unpack 4 16bit mask pixels to
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000760 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
761 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
762 mask_sse = _mm_unpacklo_epi16(mask_sse,
763 _mm_setzero_si128());
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000764
765 // Process 4 32bit dst pixels
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000766 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
767 mask_sse, srcA_sse);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000768 _mm_store_si128(d, result);
769 }
770
771 d++;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000772 mask += 4;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000773 width -= 4;
774 }
775
776 dst = reinterpret_cast<SkPMColor*>(d);
777 }
778
779 while (width > 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000780 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
781 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000782 dst++;
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000783 width--;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000784 }
785}
786
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000787void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
788 SkColor src, int width, SkPMColor opaqueDst) {
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000789 if (width <= 0) {
790 return;
791 }
792
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000793 int srcR = SkColorGetR(src);
794 int srcG = SkColorGetG(src);
795 int srcB = SkColorGetB(src);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000796
797 if (width >= 4) {
798 SkASSERT(((size_t)dst & 0x03) == 0);
799 while (((size_t)dst & 0x0F) != 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000800 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
801 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000802 dst++;
803 width--;
804 }
805
806 __m128i *d = reinterpret_cast<__m128i*>(dst);
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000807 // Set alpha to 0xFF and replicate source four times in SSE register.
808 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
809 // Set srcA_sse to contain eight copies of srcA, padded with zero.
810 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
811 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000812 while (width >= 4) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000813 // Load four destination pixels into dst_sse.
814 __m128i dst_sse = _mm_load_si128(d);
815 // Load four 16-bit masks into lower half of mask_sse.
816 __m128i mask_sse = _mm_loadl_epi64(
817 reinterpret_cast<const __m128i*>(mask));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000818
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000819 // Check whether masks are equal to 0 and get the highest bit
820 // of each byte of result, if masks are all zero, we will get
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000821 // pack_cmp to 0xFFFF
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000822 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000823 _mm_setzero_si128()));
824
825 // if mask pixels are not all zero, we will blend the dst pixels
826 if (pack_cmp != 0xFFFF) {
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000827 // Unpack 4 16bit mask pixels to
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000828 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
829 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
830 mask_sse = _mm_unpacklo_epi16(mask_sse,
831 _mm_setzero_si128());
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000832
833 // Process 4 32bit dst pixels
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000834 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
835 mask_sse);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000836 _mm_store_si128(d, result);
837 }
838
839 d++;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000840 mask += 4;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000841 width -= 4;
842 }
843
844 dst = reinterpret_cast<SkPMColor*>(d);
845 }
846
847 while (width > 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000848 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
849 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000850 dst++;
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000851 width--;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000852 }
853}