blob: 3ea1a9bfae243f58133bbf997220afb4695fce45 [file] [log] [blame]
epoger@google.comec3ed6a2011-07-28 14:26:00 +00001
/*
 * Copyright 2009 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
8
epoger@google.comec3ed6a2011-07-28 14:26:00 +00009
senorblanco@chromium.org4e753552009-11-16 21:09:00 +000010#include "SkBlitRow_opts_SSE2.h"
senorblanco@chromium.org92727612009-11-04 20:51:06 +000011#include "SkColorPriv.h"
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +000012#include "SkUtils.h"
senorblanco@chromium.org92727612009-11-04 20:51:06 +000013
14#include <emmintrin.h>
15
senorblanco@chromium.org92727612009-11-04 20:51:06 +000016/* SSE2 version of S32_Blend_BlitRow32()
17 * portable version is in core/SkBlitRow_D32.cpp
18 */
senorblanco@chromium.org4e753552009-11-16 21:09:00 +000019void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
20 const SkPMColor* SK_RESTRICT src,
21 int count, U8CPU alpha) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +000022 SkASSERT(alpha <= 255);
23 if (count <= 0) {
24 return;
25 }
26
27 uint32_t src_scale = SkAlpha255To256(alpha);
28 uint32_t dst_scale = 256 - src_scale;
29
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000030 if (count >= 4) {
31 SkASSERT(((size_t)dst & 0x03) == 0);
32 while (((size_t)dst & 0x0F) != 0) {
33 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
34 src++;
35 dst++;
36 count--;
37 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +000038
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000039 const __m128i *s = reinterpret_cast<const __m128i*>(src);
40 __m128i *d = reinterpret_cast<__m128i*>(dst);
41 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
42 __m128i src_scale_wide = _mm_set1_epi16(src_scale);
43 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale);
44 while (count >= 4) {
45 // Load 4 pixels each of src and dest.
46 __m128i src_pixel = _mm_loadu_si128(s);
47 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000048
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000049 // Get red and blue pixels into lower byte of each word.
50 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
51 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000052
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000053 // Get alpha and green into lower byte of each word.
54 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
55 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000056
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000057 // Multiply by scale.
58 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
59 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
60 dst_rb = _mm_mullo_epi16(dst_rb, dst_scale_wide);
61 dst_ag = _mm_mullo_epi16(dst_ag, dst_scale_wide);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000062
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000063 // Divide by 256.
64 src_rb = _mm_srli_epi16(src_rb, 8);
65 dst_rb = _mm_srli_epi16(dst_rb, 8);
66 src_ag = _mm_andnot_si128(rb_mask, src_ag);
67 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000068
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000069 // Combine back into RGBA.
70 src_pixel = _mm_or_si128(src_rb, src_ag);
71 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
72
73 // Add result
74 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
75 _mm_store_si128(d, result);
76 s++;
77 d++;
78 count -= 4;
79 }
80 src = reinterpret_cast<const SkPMColor*>(s);
81 dst = reinterpret_cast<SkPMColor*>(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000082 }
83
senorblanco@chromium.org4e753552009-11-16 21:09:00 +000084 while (count > 0) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +000085 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
86 src++;
87 dst++;
88 count--;
89 }
90}
91
senorblanco@chromium.org4e753552009-11-16 21:09:00 +000092void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
93 const SkPMColor* SK_RESTRICT src,
94 int count, U8CPU alpha) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +000095 SkASSERT(alpha == 255);
96 if (count <= 0) {
97 return;
98 }
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000099
100 if (count >= 4) {
101 SkASSERT(((size_t)dst & 0x03) == 0);
102 while (((size_t)dst & 0x0F) != 0) {
103 *dst = SkPMSrcOver(*src, *dst);
104 src++;
105 dst++;
106 count--;
107 }
108
109 const __m128i *s = reinterpret_cast<const __m128i*>(src);
110 __m128i *d = reinterpret_cast<__m128i*>(dst);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000111#ifdef SK_USE_ACCURATE_BLENDING
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000112 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
113 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)
114 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)
115 while (count >= 4) {
116 // Load 4 pixels
117 __m128i src_pixel = _mm_loadu_si128(s);
118 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000119
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000120 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
senorblanco@chromium.orgf3f0bd72009-12-10 22:46:31 +0000121 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000122 // Shift alphas down to lower 8 bits of each quad.
123 __m128i alpha = _mm_srli_epi32(src_pixel, 24);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000124
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000125 // Copy alpha to upper 3rd byte of each quad
126 alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000127
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000128 // Subtract alphas from 255, to get 0..255
129 alpha = _mm_sub_epi16(c_255, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000130
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000131 // Multiply by red and blue by src alpha.
132 dst_rb = _mm_mullo_epi16(dst_rb, alpha);
133 // Multiply by alpha and green by src alpha.
134 dst_ag = _mm_mullo_epi16(dst_ag, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000135
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000136 // dst_rb_low = (dst_rb >> 8)
137 __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
138 __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000139
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000140 // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
141 dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
142 dst_rb = _mm_add_epi16(dst_rb, c_128);
143 dst_rb = _mm_srli_epi16(dst_rb, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000144
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000145 // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
146 dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
147 dst_ag = _mm_add_epi16(dst_ag, c_128);
148 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000149
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000150 // Combine back into RGBA.
151 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000152
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000153 // Add result
154 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
155 _mm_store_si128(d, result);
156 s++;
157 d++;
158 count -= 4;
159 }
160 #else
161 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
162 __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)
163 while (count >= 4) {
164 // Load 4 pixels
165 __m128i src_pixel = _mm_loadu_si128(s);
166 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000167
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000168 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
senorblanco@chromium.orgf3f0bd72009-12-10 22:46:31 +0000169 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000170
senorblanco@chromium.orgf3f0bd72009-12-10 22:46:31 +0000171 // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word)
172 __m128i alpha = _mm_srli_epi16(src_pixel, 8);
173
174 // (a0, a0, a1, a1, a2, g2, a3, g3)
175 alpha = _mm_shufflehi_epi16(alpha, 0xF5);
176
177 // (a0, a0, a1, a1, a2, a2, a3, a3)
178 alpha = _mm_shufflelo_epi16(alpha, 0xF5);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000179
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000180 // Subtract alphas from 256, to get 1..256
181 alpha = _mm_sub_epi16(c_256, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000182
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000183 // Multiply by red and blue by src alpha.
184 dst_rb = _mm_mullo_epi16(dst_rb, alpha);
185 // Multiply by alpha and green by src alpha.
186 dst_ag = _mm_mullo_epi16(dst_ag, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000187
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000188 // Divide by 256.
189 dst_rb = _mm_srli_epi16(dst_rb, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000190
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000191 // Mask out high bits (already in the right place)
192 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000193
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000194 // Combine back into RGBA.
195 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000196
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000197 // Add result
198 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
199 _mm_store_si128(d, result);
200 s++;
201 d++;
202 count -= 4;
203 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000204#endif
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000205 src = reinterpret_cast<const SkPMColor*>(s);
206 dst = reinterpret_cast<SkPMColor*>(d);
207 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000208
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000209 while (count > 0) {
210 *dst = SkPMSrcOver(*src, *dst);
211 src++;
212 dst++;
213 count--;
214 }
215}
216
senorblanco@chromium.org4e753552009-11-16 21:09:00 +0000217void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
218 const SkPMColor* SK_RESTRICT src,
219 int count, U8CPU alpha) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000220 SkASSERT(alpha <= 255);
221 if (count <= 0) {
222 return;
223 }
224
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000225 if (count >= 4) {
226 while (((size_t)dst & 0x0F) != 0) {
227 *dst = SkBlendARGB32(*src, *dst, alpha);
228 src++;
229 dst++;
230 count--;
231 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000232
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000233 uint32_t src_scale = SkAlpha255To256(alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000234
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000235 const __m128i *s = reinterpret_cast<const __m128i*>(src);
236 __m128i *d = reinterpret_cast<__m128i*>(dst);
237 __m128i src_scale_wide = _mm_set1_epi16(src_scale);
238 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
239 __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit)
240 while (count >= 4) {
241 // Load 4 pixels each of src and dest.
242 __m128i src_pixel = _mm_loadu_si128(s);
243 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000244
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000245 // Get red and blue pixels into lower byte of each word.
246 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
247 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000248
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000249 // Get alpha and green into lower byte of each word.
250 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
251 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000252
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000253 // Put per-pixel alpha in low byte of each word.
254 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
255 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000256
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000257 // dst_alpha = dst_alpha * src_scale
258 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000259
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000260 // Divide by 256.
261 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000262
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000263 // Subtract alphas from 256, to get 1..256
264 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000265
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000266 // Multiply red and blue by dst pixel alpha.
267 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
268 // Multiply alpha and green by dst pixel alpha.
269 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000270
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000271 // Multiply red and blue by global alpha.
272 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
273 // Multiply alpha and green by global alpha.
274 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000275
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000276 // Divide by 256.
277 dst_rb = _mm_srli_epi16(dst_rb, 8);
278 src_rb = _mm_srli_epi16(src_rb, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000279
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000280 // Mask out low bits (goodies already in the right place; no need to divide)
281 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
282 src_ag = _mm_andnot_si128(rb_mask, src_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000283
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000284 // Combine back into RGBA.
285 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
286 src_pixel = _mm_or_si128(src_rb, src_ag);
287
288 // Add two pixels into result.
289 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
290 _mm_store_si128(d, result);
291 s++;
292 d++;
293 count -= 4;
294 }
295 src = reinterpret_cast<const SkPMColor*>(s);
296 dst = reinterpret_cast<SkPMColor*>(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000297 }
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000298
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000299 while (count > 0) {
300 *dst = SkBlendARGB32(*src, *dst, alpha);
301 src++;
302 dst++;
303 count--;
304 }
305}
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000306
307/* SSE2 version of Color32()
308 * portable version is in core/SkBlitRow_D32.cpp
309 */
310void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
311 SkPMColor color) {
312
313 if (count <= 0) {
314 return;
315 }
316
317 if (0 == color) {
318 if (src != dst) {
319 memcpy(dst, src, count * sizeof(SkPMColor));
320 }
321 }
322
323 unsigned colorA = SkGetPackedA32(color);
324 if (255 == colorA) {
325 sk_memset32(dst, color, count);
326 } else {
327 unsigned scale = 256 - SkAlpha255To256(colorA);
328
329 if (count >= 4) {
330 SkASSERT(((size_t)dst & 0x03) == 0);
331 while (((size_t)dst & 0x0F) != 0) {
332 *dst = color + SkAlphaMulQ(*src, scale);
333 src++;
334 dst++;
335 count--;
336 }
337
338 const __m128i *s = reinterpret_cast<const __m128i*>(src);
339 __m128i *d = reinterpret_cast<__m128i*>(dst);
340 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
341 __m128i src_scale_wide = _mm_set1_epi16(scale);
342 __m128i color_wide = _mm_set1_epi32(color);
343 while (count >= 4) {
344 // Load 4 pixels each of src and dest.
345 __m128i src_pixel = _mm_loadu_si128(s);
346
347 // Get red and blue pixels into lower byte of each word.
348 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
reed@google.com981d4792011-03-09 12:55:47 +0000349
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000350 // Get alpha and green into lower byte of each word.
351 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
352
353 // Multiply by scale.
354 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
355 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
356
357 // Divide by 256.
358 src_rb = _mm_srli_epi16(src_rb, 8);
359 src_ag = _mm_andnot_si128(rb_mask, src_ag);
360
361 // Combine back into RGBA.
362 src_pixel = _mm_or_si128(src_rb, src_ag);
363
364 // Add color to result.
365 __m128i result = _mm_add_epi8(color_wide, src_pixel);
366
367 // Store result.
368 _mm_store_si128(d, result);
369 s++;
370 d++;
371 count -= 4;
372 }
373 src = reinterpret_cast<const SkPMColor*>(s);
374 dst = reinterpret_cast<SkPMColor*>(d);
375 }
376
377 while (count > 0) {
378 *dst = color + SkAlphaMulQ(*src, scale);
379 src += 1;
380 dst += 1;
381 count--;
reed@google.com981d4792011-03-09 12:55:47 +0000382 }
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000383 }
384}
reed@google.com981d4792011-03-09 12:55:47 +0000385
386void SkARGB32_BlitMask_SSE2(void* device, size_t dstRB,
387 SkBitmap::Config dstConfig, const uint8_t* mask,
reed@google.comee467ee2011-03-09 13:23:57 +0000388 size_t maskRB, SkColor origColor,
reed@google.com981d4792011-03-09 12:55:47 +0000389 int width, int height)
390{
reed@google.comee467ee2011-03-09 13:23:57 +0000391 SkPMColor color = SkPreMultiplyColor(origColor);
reed@google.com981d4792011-03-09 12:55:47 +0000392 size_t dstOffset = dstRB - (width << 2);
393 size_t maskOffset = maskRB - width;
394 SkPMColor* dst = (SkPMColor *)device;
395 do {
396 int count = width;
397 if (count >= 4) {
398 while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
399 *dst = SkBlendARGB32(color, *dst, *mask);
400 mask++;
401 dst++;
402 count--;
403 }
404 __m128i *d = reinterpret_cast<__m128i*>(dst);
405 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
406 __m128i c_256 = _mm_set1_epi16(256);
407 __m128i c_1 = _mm_set1_epi16(1);
408 __m128i src_pixel = _mm_set1_epi32(color);
409 while (count >= 4) {
410 // Load 4 pixels each of src and dest.
411 __m128i dst_pixel = _mm_load_si128(d);
412
413 //set the aphla value
414 __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\
415 0, *(mask+3),0, \
416 *(mask+2),0, *(mask+2),\
417 0,*(mask+1), 0,*(mask+1),\
418 0, *mask,0,*mask);
419
420 //call SkAlpha255To256()
421 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
422
423 // Get red and blue pixels into lower byte of each word.
424 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
425 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
426
427 // Get alpha and green into lower byte of each word.
428 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
429 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
430
431 // Put per-pixel alpha in low byte of each word.
432 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
433 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
434
435 // dst_alpha = dst_alpha * src_scale
436 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
437
438 // Divide by 256.
439 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
440
441 // Subtract alphas from 256, to get 1..256
442 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
443 // Multiply red and blue by dst pixel alpha.
444 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
445 // Multiply alpha and green by dst pixel alpha.
446 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
447
448 // Multiply red and blue by global alpha.
449 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
450 // Multiply alpha and green by global alpha.
451 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
452 // Divide by 256.
453 dst_rb = _mm_srli_epi16(dst_rb, 8);
454 src_rb = _mm_srli_epi16(src_rb, 8);
455
456 // Mask out low bits (goodies already in the right place; no need to divide)
457 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
458 src_ag = _mm_andnot_si128(rb_mask, src_ag);
459
460 // Combine back into RGBA.
461 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
462 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
463
464 // Add two pixels into result.
465 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
466 _mm_store_si128(d, result);
467 // load the next 4 pixel
468 mask = mask + 4;
469 d++;
470 count -= 4;
471 }
472 dst = reinterpret_cast<SkPMColor *>(d);
473 }
474 while(count > 0) {
475 *dst= SkBlendARGB32(color, *dst, *mask);
476 dst += 1;
477 mask++;
478 count --;
479 }
480 dst = (SkPMColor *)((char*)dst + dstOffset);
481 mask += maskOffset;
482 } while (--height != 0);
483}