blob: 4f69fdd60d5574c992d6580fd47c6f399df1910f [file] [log] [blame]
/*
 **
 ** Copyright 2009, The Android Open Source Project
 **
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 **
 **     http://www.apache.org/licenses/LICENSE-2.0
 **
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 */
17
senorblanco@chromium.org4e753552009-11-16 21:09:00 +000018#include "SkBlitRow_opts_SSE2.h"
senorblanco@chromium.org92727612009-11-04 20:51:06 +000019#include "SkColorPriv.h"
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +000020#include "SkUtils.h"
senorblanco@chromium.org92727612009-11-04 20:51:06 +000021
22#include <emmintrin.h>
23
senorblanco@chromium.org92727612009-11-04 20:51:06 +000024/* SSE2 version of S32_Blend_BlitRow32()
25 * portable version is in core/SkBlitRow_D32.cpp
26 */
senorblanco@chromium.org4e753552009-11-16 21:09:00 +000027void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
28 const SkPMColor* SK_RESTRICT src,
29 int count, U8CPU alpha) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +000030 SkASSERT(alpha <= 255);
31 if (count <= 0) {
32 return;
33 }
34
35 uint32_t src_scale = SkAlpha255To256(alpha);
36 uint32_t dst_scale = 256 - src_scale;
37
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000038 if (count >= 4) {
39 SkASSERT(((size_t)dst & 0x03) == 0);
40 while (((size_t)dst & 0x0F) != 0) {
41 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
42 src++;
43 dst++;
44 count--;
45 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +000046
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000047 const __m128i *s = reinterpret_cast<const __m128i*>(src);
48 __m128i *d = reinterpret_cast<__m128i*>(dst);
49 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
50 __m128i src_scale_wide = _mm_set1_epi16(src_scale);
51 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale);
52 while (count >= 4) {
53 // Load 4 pixels each of src and dest.
54 __m128i src_pixel = _mm_loadu_si128(s);
55 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000056
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000057 // Get red and blue pixels into lower byte of each word.
58 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
59 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000060
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000061 // Get alpha and green into lower byte of each word.
62 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
63 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000064
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000065 // Multiply by scale.
66 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
67 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
68 dst_rb = _mm_mullo_epi16(dst_rb, dst_scale_wide);
69 dst_ag = _mm_mullo_epi16(dst_ag, dst_scale_wide);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000070
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000071 // Divide by 256.
72 src_rb = _mm_srli_epi16(src_rb, 8);
73 dst_rb = _mm_srli_epi16(dst_rb, 8);
74 src_ag = _mm_andnot_si128(rb_mask, src_ag);
75 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000076
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000077 // Combine back into RGBA.
78 src_pixel = _mm_or_si128(src_rb, src_ag);
79 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
80
81 // Add result
82 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
83 _mm_store_si128(d, result);
84 s++;
85 d++;
86 count -= 4;
87 }
88 src = reinterpret_cast<const SkPMColor*>(s);
89 dst = reinterpret_cast<SkPMColor*>(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000090 }
91
senorblanco@chromium.org4e753552009-11-16 21:09:00 +000092 while (count > 0) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +000093 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
94 src++;
95 dst++;
96 count--;
97 }
98}
99
senorblanco@chromium.org4e753552009-11-16 21:09:00 +0000100void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
101 const SkPMColor* SK_RESTRICT src,
102 int count, U8CPU alpha) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000103 SkASSERT(alpha == 255);
104 if (count <= 0) {
105 return;
106 }
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000107
108 if (count >= 4) {
109 SkASSERT(((size_t)dst & 0x03) == 0);
110 while (((size_t)dst & 0x0F) != 0) {
111 *dst = SkPMSrcOver(*src, *dst);
112 src++;
113 dst++;
114 count--;
115 }
116
117 const __m128i *s = reinterpret_cast<const __m128i*>(src);
118 __m128i *d = reinterpret_cast<__m128i*>(dst);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000119#ifdef SK_USE_ACCURATE_BLENDING
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000120 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
121 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)
122 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)
123 while (count >= 4) {
124 // Load 4 pixels
125 __m128i src_pixel = _mm_loadu_si128(s);
126 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000127
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000128 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
senorblanco@chromium.orgf3f0bd72009-12-10 22:46:31 +0000129 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000130 // Shift alphas down to lower 8 bits of each quad.
131 __m128i alpha = _mm_srli_epi32(src_pixel, 24);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000132
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000133 // Copy alpha to upper 3rd byte of each quad
134 alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000135
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000136 // Subtract alphas from 255, to get 0..255
137 alpha = _mm_sub_epi16(c_255, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000138
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000139 // Multiply by red and blue by src alpha.
140 dst_rb = _mm_mullo_epi16(dst_rb, alpha);
141 // Multiply by alpha and green by src alpha.
142 dst_ag = _mm_mullo_epi16(dst_ag, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000143
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000144 // dst_rb_low = (dst_rb >> 8)
145 __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
146 __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000147
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000148 // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
149 dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
150 dst_rb = _mm_add_epi16(dst_rb, c_128);
151 dst_rb = _mm_srli_epi16(dst_rb, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000152
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000153 // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
154 dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
155 dst_ag = _mm_add_epi16(dst_ag, c_128);
156 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000157
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000158 // Combine back into RGBA.
159 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000160
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000161 // Add result
162 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
163 _mm_store_si128(d, result);
164 s++;
165 d++;
166 count -= 4;
167 }
168 #else
169 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
170 __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)
171 while (count >= 4) {
172 // Load 4 pixels
173 __m128i src_pixel = _mm_loadu_si128(s);
174 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000175
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000176 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
senorblanco@chromium.orgf3f0bd72009-12-10 22:46:31 +0000177 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000178
senorblanco@chromium.orgf3f0bd72009-12-10 22:46:31 +0000179 // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word)
180 __m128i alpha = _mm_srli_epi16(src_pixel, 8);
181
182 // (a0, a0, a1, a1, a2, g2, a3, g3)
183 alpha = _mm_shufflehi_epi16(alpha, 0xF5);
184
185 // (a0, a0, a1, a1, a2, a2, a3, a3)
186 alpha = _mm_shufflelo_epi16(alpha, 0xF5);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000187
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000188 // Subtract alphas from 256, to get 1..256
189 alpha = _mm_sub_epi16(c_256, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000190
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000191 // Multiply by red and blue by src alpha.
192 dst_rb = _mm_mullo_epi16(dst_rb, alpha);
193 // Multiply by alpha and green by src alpha.
194 dst_ag = _mm_mullo_epi16(dst_ag, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000195
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000196 // Divide by 256.
197 dst_rb = _mm_srli_epi16(dst_rb, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000198
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000199 // Mask out high bits (already in the right place)
200 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000201
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000202 // Combine back into RGBA.
203 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000204
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000205 // Add result
206 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
207 _mm_store_si128(d, result);
208 s++;
209 d++;
210 count -= 4;
211 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000212#endif
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000213 src = reinterpret_cast<const SkPMColor*>(s);
214 dst = reinterpret_cast<SkPMColor*>(d);
215 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000216
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000217 while (count > 0) {
218 *dst = SkPMSrcOver(*src, *dst);
219 src++;
220 dst++;
221 count--;
222 }
223}
224
senorblanco@chromium.org4e753552009-11-16 21:09:00 +0000225void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
226 const SkPMColor* SK_RESTRICT src,
227 int count, U8CPU alpha) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000228 SkASSERT(alpha <= 255);
229 if (count <= 0) {
230 return;
231 }
232
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000233 if (count >= 4) {
234 while (((size_t)dst & 0x0F) != 0) {
235 *dst = SkBlendARGB32(*src, *dst, alpha);
236 src++;
237 dst++;
238 count--;
239 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000240
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000241 uint32_t src_scale = SkAlpha255To256(alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000242
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000243 const __m128i *s = reinterpret_cast<const __m128i*>(src);
244 __m128i *d = reinterpret_cast<__m128i*>(dst);
245 __m128i src_scale_wide = _mm_set1_epi16(src_scale);
246 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
247 __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit)
248 while (count >= 4) {
249 // Load 4 pixels each of src and dest.
250 __m128i src_pixel = _mm_loadu_si128(s);
251 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000252
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000253 // Get red and blue pixels into lower byte of each word.
254 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
255 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000256
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000257 // Get alpha and green into lower byte of each word.
258 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
259 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000260
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000261 // Put per-pixel alpha in low byte of each word.
262 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
263 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000264
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000265 // dst_alpha = dst_alpha * src_scale
266 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000267
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000268 // Divide by 256.
269 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000270
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000271 // Subtract alphas from 256, to get 1..256
272 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000273
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000274 // Multiply red and blue by dst pixel alpha.
275 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
276 // Multiply alpha and green by dst pixel alpha.
277 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000278
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000279 // Multiply red and blue by global alpha.
280 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
281 // Multiply alpha and green by global alpha.
282 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000283
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000284 // Divide by 256.
285 dst_rb = _mm_srli_epi16(dst_rb, 8);
286 src_rb = _mm_srli_epi16(src_rb, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000287
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000288 // Mask out low bits (goodies already in the right place; no need to divide)
289 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
290 src_ag = _mm_andnot_si128(rb_mask, src_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000291
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000292 // Combine back into RGBA.
293 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
294 src_pixel = _mm_or_si128(src_rb, src_ag);
295
296 // Add two pixels into result.
297 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
298 _mm_store_si128(d, result);
299 s++;
300 d++;
301 count -= 4;
302 }
303 src = reinterpret_cast<const SkPMColor*>(s);
304 dst = reinterpret_cast<SkPMColor*>(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000305 }
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000306
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000307 while (count > 0) {
308 *dst = SkBlendARGB32(*src, *dst, alpha);
309 src++;
310 dst++;
311 count--;
312 }
313}
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000314
315/* SSE2 version of Color32()
316 * portable version is in core/SkBlitRow_D32.cpp
317 */
318void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
319 SkPMColor color) {
320
321 if (count <= 0) {
322 return;
323 }
324
325 if (0 == color) {
326 if (src != dst) {
327 memcpy(dst, src, count * sizeof(SkPMColor));
328 }
329 }
330
331 unsigned colorA = SkGetPackedA32(color);
332 if (255 == colorA) {
333 sk_memset32(dst, color, count);
334 } else {
335 unsigned scale = 256 - SkAlpha255To256(colorA);
336
337 if (count >= 4) {
338 SkASSERT(((size_t)dst & 0x03) == 0);
339 while (((size_t)dst & 0x0F) != 0) {
340 *dst = color + SkAlphaMulQ(*src, scale);
341 src++;
342 dst++;
343 count--;
344 }
345
346 const __m128i *s = reinterpret_cast<const __m128i*>(src);
347 __m128i *d = reinterpret_cast<__m128i*>(dst);
348 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
349 __m128i src_scale_wide = _mm_set1_epi16(scale);
350 __m128i color_wide = _mm_set1_epi32(color);
351 while (count >= 4) {
352 // Load 4 pixels each of src and dest.
353 __m128i src_pixel = _mm_loadu_si128(s);
354
355 // Get red and blue pixels into lower byte of each word.
356 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
reed@google.com981d4792011-03-09 12:55:47 +0000357
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000358 // Get alpha and green into lower byte of each word.
359 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
360
361 // Multiply by scale.
362 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
363 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
364
365 // Divide by 256.
366 src_rb = _mm_srli_epi16(src_rb, 8);
367 src_ag = _mm_andnot_si128(rb_mask, src_ag);
368
369 // Combine back into RGBA.
370 src_pixel = _mm_or_si128(src_rb, src_ag);
371
372 // Add color to result.
373 __m128i result = _mm_add_epi8(color_wide, src_pixel);
374
375 // Store result.
376 _mm_store_si128(d, result);
377 s++;
378 d++;
379 count -= 4;
380 }
381 src = reinterpret_cast<const SkPMColor*>(s);
382 dst = reinterpret_cast<SkPMColor*>(d);
383 }
384
385 while (count > 0) {
386 *dst = color + SkAlphaMulQ(*src, scale);
387 src += 1;
388 dst += 1;
389 count--;
reed@google.com981d4792011-03-09 12:55:47 +0000390 }
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000391 }
392}
reed@google.com981d4792011-03-09 12:55:47 +0000393
394void SkARGB32_BlitMask_SSE2(void* device, size_t dstRB,
395 SkBitmap::Config dstConfig, const uint8_t* mask,
reed@google.comee467ee2011-03-09 13:23:57 +0000396 size_t maskRB, SkColor origColor,
reed@google.com981d4792011-03-09 12:55:47 +0000397 int width, int height)
398{
reed@google.comee467ee2011-03-09 13:23:57 +0000399 SkPMColor color = SkPreMultiplyColor(origColor);
reed@google.com981d4792011-03-09 12:55:47 +0000400 size_t dstOffset = dstRB - (width << 2);
401 size_t maskOffset = maskRB - width;
402 SkPMColor* dst = (SkPMColor *)device;
403 do {
404 int count = width;
405 if (count >= 4) {
406 while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
407 *dst = SkBlendARGB32(color, *dst, *mask);
408 mask++;
409 dst++;
410 count--;
411 }
412 __m128i *d = reinterpret_cast<__m128i*>(dst);
413 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
414 __m128i c_256 = _mm_set1_epi16(256);
415 __m128i c_1 = _mm_set1_epi16(1);
416 __m128i src_pixel = _mm_set1_epi32(color);
417 while (count >= 4) {
418 // Load 4 pixels each of src and dest.
419 __m128i dst_pixel = _mm_load_si128(d);
420
421 //set the aphla value
422 __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\
423 0, *(mask+3),0, \
424 *(mask+2),0, *(mask+2),\
425 0,*(mask+1), 0,*(mask+1),\
426 0, *mask,0,*mask);
427
428 //call SkAlpha255To256()
429 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
430
431 // Get red and blue pixels into lower byte of each word.
432 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
433 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
434
435 // Get alpha and green into lower byte of each word.
436 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
437 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
438
439 // Put per-pixel alpha in low byte of each word.
440 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
441 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
442
443 // dst_alpha = dst_alpha * src_scale
444 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
445
446 // Divide by 256.
447 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
448
449 // Subtract alphas from 256, to get 1..256
450 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
451 // Multiply red and blue by dst pixel alpha.
452 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
453 // Multiply alpha and green by dst pixel alpha.
454 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
455
456 // Multiply red and blue by global alpha.
457 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
458 // Multiply alpha and green by global alpha.
459 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
460 // Divide by 256.
461 dst_rb = _mm_srli_epi16(dst_rb, 8);
462 src_rb = _mm_srli_epi16(src_rb, 8);
463
464 // Mask out low bits (goodies already in the right place; no need to divide)
465 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
466 src_ag = _mm_andnot_si128(rb_mask, src_ag);
467
468 // Combine back into RGBA.
469 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
470 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
471
472 // Add two pixels into result.
473 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
474 _mm_store_si128(d, result);
475 // load the next 4 pixel
476 mask = mask + 4;
477 d++;
478 count -= 4;
479 }
480 dst = reinterpret_cast<SkPMColor *>(d);
481 }
482 while(count > 0) {
483 *dst= SkBlendARGB32(color, *dst, *mask);
484 dst += 1;
485 mask++;
486 count --;
487 }
488 dst = (SkPMColor *)((char*)dst + dstOffset);
489 mask += maskOffset;
490 } while (--height != 0);
491}