blob: 9e99b4bc4654da338c8fa0d92ddb5b728131497d [file] [log] [blame]
/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
7
epoger@google.comec3ed6a2011-07-28 14:26:00 +00008
senorblanco@chromium.org4e753552009-11-16 21:09:00 +00009#include "SkBlitRow_opts_SSE2.h"
caryclark@google.com83ecdc32012-06-06 12:10:26 +000010#include "SkBitmapProcState_opts_SSE2.h"
senorblanco@chromium.org92727612009-11-04 20:51:06 +000011#include "SkColorPriv.h"
commit-bot@chromium.org47591072014-02-19 03:09:52 +000012#include "SkColor_opts_SSE2.h"
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +000013#include "SkUtils.h"
senorblanco@chromium.org92727612009-11-04 20:51:06 +000014
15#include <emmintrin.h>
16
senorblanco@chromium.org92727612009-11-04 20:51:06 +000017/* SSE2 version of S32_Blend_BlitRow32()
18 * portable version is in core/SkBlitRow_D32.cpp
19 */
senorblanco@chromium.org4e753552009-11-16 21:09:00 +000020void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
21 const SkPMColor* SK_RESTRICT src,
22 int count, U8CPU alpha) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +000023 SkASSERT(alpha <= 255);
24 if (count <= 0) {
25 return;
26 }
27
28 uint32_t src_scale = SkAlpha255To256(alpha);
29 uint32_t dst_scale = 256 - src_scale;
30
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000031 if (count >= 4) {
32 SkASSERT(((size_t)dst & 0x03) == 0);
33 while (((size_t)dst & 0x0F) != 0) {
34 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
35 src++;
36 dst++;
37 count--;
38 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +000039
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000040 const __m128i *s = reinterpret_cast<const __m128i*>(src);
41 __m128i *d = reinterpret_cast<__m128i*>(dst);
42 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
tomhudson@google.com98a5b422012-02-28 16:15:26 +000043 __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
44
45 // Move scale factors to upper byte of word
46 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
47 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000048 while (count >= 4) {
49 // Load 4 pixels each of src and dest.
50 __m128i src_pixel = _mm_loadu_si128(s);
51 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000052
tomhudson@google.com98a5b422012-02-28 16:15:26 +000053 // Interleave Atom port 0/1 operations based on the execution port
54 // constraints that multiply can only be executed on port 0 (while
55 // boolean operations can be executed on either port 0 or port 1)
56 // because GCC currently doesn't do a good job scheduling
57 // instructions based on these constraints.
58
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000059 // Get red and blue pixels into lower byte of each word.
tomhudson@google.com98a5b422012-02-28 16:15:26 +000060 // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000061 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000062
tomhudson@google.com98a5b422012-02-28 16:15:26 +000063 // Multiply by scale.
64 // (4 x (0, rs.h, 0, bs.h))
65 // where rs.h stands for the higher byte of r * scale, and
66 // bs.h the higher byte of b * scale.
67 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
68
69 // Get alpha and green pixels into higher byte of each word.
70 // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
71 __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000072
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000073 // Multiply by scale.
tomhudson@google.com98a5b422012-02-28 16:15:26 +000074 // (4 x (as.h, as.l, gs.h, gs.l))
75 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000076
tomhudson@google.com98a5b422012-02-28 16:15:26 +000077 // Clear the lower byte of the a*scale and g*scale results
78 // (4 x (as.h, 0, gs.h, 0))
79 src_ag = _mm_and_si128(src_ag, ag_mask);
80
81 // Operations the destination pixels are the same as on the
82 // source pixels. See the comments above.
83 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
84 dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
85 __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
86 dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
87 dst_ag = _mm_and_si128(dst_ag, ag_mask);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000088
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000089 // Combine back into RGBA.
tomhudson@google.com98a5b422012-02-28 16:15:26 +000090 // (4 x (as.h, rs.h, gs.h, bs.h))
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000091 src_pixel = _mm_or_si128(src_rb, src_ag);
92 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
93
94 // Add result
95 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
96 _mm_store_si128(d, result);
97 s++;
98 d++;
99 count -= 4;
100 }
101 src = reinterpret_cast<const SkPMColor*>(s);
102 dst = reinterpret_cast<SkPMColor*>(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000103 }
104
senorblanco@chromium.org4e753552009-11-16 21:09:00 +0000105 while (count > 0) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000106 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
107 src++;
108 dst++;
109 count--;
110 }
111}
112
senorblanco@chromium.org4e753552009-11-16 21:09:00 +0000113void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
114 const SkPMColor* SK_RESTRICT src,
115 int count, U8CPU alpha) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000116 SkASSERT(alpha == 255);
117 if (count <= 0) {
118 return;
119 }
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000120
121 if (count >= 4) {
122 SkASSERT(((size_t)dst & 0x03) == 0);
123 while (((size_t)dst & 0x0F) != 0) {
124 *dst = SkPMSrcOver(*src, *dst);
125 src++;
126 dst++;
127 count--;
128 }
129
130 const __m128i *s = reinterpret_cast<const __m128i*>(src);
131 __m128i *d = reinterpret_cast<__m128i*>(dst);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000132#ifdef SK_USE_ACCURATE_BLENDING
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000133 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
134 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)
135 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)
136 while (count >= 4) {
137 // Load 4 pixels
138 __m128i src_pixel = _mm_loadu_si128(s);
139 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000140
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000141 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
senorblanco@chromium.orgf3f0bd72009-12-10 22:46:31 +0000142 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000143 // Shift alphas down to lower 8 bits of each quad.
144 __m128i alpha = _mm_srli_epi32(src_pixel, 24);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000145
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000146 // Copy alpha to upper 3rd byte of each quad
147 alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000148
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000149 // Subtract alphas from 255, to get 0..255
150 alpha = _mm_sub_epi16(c_255, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000151
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000152 // Multiply by red and blue by src alpha.
153 dst_rb = _mm_mullo_epi16(dst_rb, alpha);
154 // Multiply by alpha and green by src alpha.
155 dst_ag = _mm_mullo_epi16(dst_ag, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000156
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000157 // dst_rb_low = (dst_rb >> 8)
158 __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
159 __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000160
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000161 // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
162 dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
163 dst_rb = _mm_add_epi16(dst_rb, c_128);
164 dst_rb = _mm_srli_epi16(dst_rb, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000165
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000166 // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
167 dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
168 dst_ag = _mm_add_epi16(dst_ag, c_128);
169 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000170
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000171 // Combine back into RGBA.
172 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000173
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000174 // Add result
175 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
176 _mm_store_si128(d, result);
177 s++;
178 d++;
179 count -= 4;
180 }
181 #else
182 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
183 __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)
184 while (count >= 4) {
185 // Load 4 pixels
186 __m128i src_pixel = _mm_loadu_si128(s);
187 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000188
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000189 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
senorblanco@chromium.orgf3f0bd72009-12-10 22:46:31 +0000190 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000191
senorblanco@chromium.orgf3f0bd72009-12-10 22:46:31 +0000192 // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word)
193 __m128i alpha = _mm_srli_epi16(src_pixel, 8);
194
195 // (a0, a0, a1, a1, a2, g2, a3, g3)
196 alpha = _mm_shufflehi_epi16(alpha, 0xF5);
197
198 // (a0, a0, a1, a1, a2, a2, a3, a3)
199 alpha = _mm_shufflelo_epi16(alpha, 0xF5);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000200
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000201 // Subtract alphas from 256, to get 1..256
202 alpha = _mm_sub_epi16(c_256, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000203
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000204 // Multiply by red and blue by src alpha.
205 dst_rb = _mm_mullo_epi16(dst_rb, alpha);
206 // Multiply by alpha and green by src alpha.
207 dst_ag = _mm_mullo_epi16(dst_ag, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000208
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000209 // Divide by 256.
210 dst_rb = _mm_srli_epi16(dst_rb, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000211
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000212 // Mask out high bits (already in the right place)
213 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000214
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000215 // Combine back into RGBA.
216 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000217
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000218 // Add result
219 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
220 _mm_store_si128(d, result);
221 s++;
222 d++;
223 count -= 4;
224 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000225#endif
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000226 src = reinterpret_cast<const SkPMColor*>(s);
227 dst = reinterpret_cast<SkPMColor*>(d);
228 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000229
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000230 while (count > 0) {
231 *dst = SkPMSrcOver(*src, *dst);
232 src++;
233 dst++;
234 count--;
235 }
236}
237
senorblanco@chromium.org4e753552009-11-16 21:09:00 +0000238void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
239 const SkPMColor* SK_RESTRICT src,
240 int count, U8CPU alpha) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000241 SkASSERT(alpha <= 255);
242 if (count <= 0) {
243 return;
244 }
245
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000246 if (count >= 4) {
247 while (((size_t)dst & 0x0F) != 0) {
248 *dst = SkBlendARGB32(*src, *dst, alpha);
249 src++;
250 dst++;
251 count--;
252 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000253
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000254 uint32_t src_scale = SkAlpha255To256(alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000255
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000256 const __m128i *s = reinterpret_cast<const __m128i*>(src);
257 __m128i *d = reinterpret_cast<__m128i*>(dst);
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000258 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000259 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
260 __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit)
261 while (count >= 4) {
262 // Load 4 pixels each of src and dest.
263 __m128i src_pixel = _mm_loadu_si128(s);
264 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000265
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000266 // Get red and blue pixels into lower byte of each word.
267 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
268 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000269
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000270 // Get alpha and green into lower byte of each word.
271 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
272 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000273
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000274 // Put per-pixel alpha in low byte of each word.
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000275 // After the following two statements, the dst_alpha looks like
276 // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000277 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
278 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000279
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000280 // dst_alpha = dst_alpha * src_scale
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000281 // Because src_scales are in the higher byte of each word and
282 // we use mulhi here, the resulting alpha values are already
283 // in the right place and don't need to be divided by 256.
284 // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
285 dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000286
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000287 // Subtract alphas from 256, to get 1..256
288 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000289
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000290 // Multiply red and blue by dst pixel alpha.
291 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
292 // Multiply alpha and green by dst pixel alpha.
293 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000294
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000295 // Multiply red and blue by global alpha.
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000296 // (4 x (0, rs.h, 0, bs.h))
297 // where rs.h stands for the higher byte of r * src_scale,
298 // and bs.h the higher byte of b * src_scale.
299 // Again, because we use mulhi, the resuling red and blue
300 // values are already in the right place and don't need to
301 // be divided by 256.
302 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000303 // Multiply alpha and green by global alpha.
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000304 // (4 x (0, as.h, 0, gs.h))
305 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000306
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000307 // Divide by 256.
308 dst_rb = _mm_srli_epi16(dst_rb, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000309
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000310 // Mask out low bits (goodies already in the right place; no need to divide)
311 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000312 // Shift alpha and green to higher byte of each word.
313 // (4 x (as.h, 0, gs.h, 0))
314 src_ag = _mm_slli_epi16(src_ag, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000315
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000316 // Combine back into RGBA.
317 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
318 src_pixel = _mm_or_si128(src_rb, src_ag);
319
320 // Add two pixels into result.
321 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
322 _mm_store_si128(d, result);
323 s++;
324 d++;
325 count -= 4;
326 }
327 src = reinterpret_cast<const SkPMColor*>(s);
328 dst = reinterpret_cast<SkPMColor*>(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000329 }
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000330
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000331 while (count > 0) {
332 *dst = SkBlendARGB32(*src, *dst, alpha);
333 src++;
334 dst++;
335 count--;
336 }
337}
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000338
339/* SSE2 version of Color32()
340 * portable version is in core/SkBlitRow_D32.cpp
341 */
342void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
343 SkPMColor color) {
344
345 if (count <= 0) {
346 return;
347 }
348
349 if (0 == color) {
350 if (src != dst) {
351 memcpy(dst, src, count * sizeof(SkPMColor));
352 }
reed@google.comc909a1e2011-10-25 19:07:23 +0000353 return;
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000354 }
355
356 unsigned colorA = SkGetPackedA32(color);
357 if (255 == colorA) {
358 sk_memset32(dst, color, count);
359 } else {
360 unsigned scale = 256 - SkAlpha255To256(colorA);
361
362 if (count >= 4) {
363 SkASSERT(((size_t)dst & 0x03) == 0);
364 while (((size_t)dst & 0x0F) != 0) {
365 *dst = color + SkAlphaMulQ(*src, scale);
366 src++;
367 dst++;
368 count--;
369 }
370
371 const __m128i *s = reinterpret_cast<const __m128i*>(src);
372 __m128i *d = reinterpret_cast<__m128i*>(dst);
373 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
374 __m128i src_scale_wide = _mm_set1_epi16(scale);
375 __m128i color_wide = _mm_set1_epi32(color);
376 while (count >= 4) {
377 // Load 4 pixels each of src and dest.
378 __m128i src_pixel = _mm_loadu_si128(s);
379
380 // Get red and blue pixels into lower byte of each word.
381 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
reed@google.com981d4792011-03-09 12:55:47 +0000382
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000383 // Get alpha and green into lower byte of each word.
384 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
385
386 // Multiply by scale.
387 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
388 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
389
390 // Divide by 256.
391 src_rb = _mm_srli_epi16(src_rb, 8);
392 src_ag = _mm_andnot_si128(rb_mask, src_ag);
393
394 // Combine back into RGBA.
395 src_pixel = _mm_or_si128(src_rb, src_ag);
396
397 // Add color to result.
398 __m128i result = _mm_add_epi8(color_wide, src_pixel);
399
400 // Store result.
401 _mm_store_si128(d, result);
402 s++;
403 d++;
404 count -= 4;
405 }
406 src = reinterpret_cast<const SkPMColor*>(s);
407 dst = reinterpret_cast<SkPMColor*>(d);
408 }
409
410 while (count > 0) {
411 *dst = color + SkAlphaMulQ(*src, scale);
412 src += 1;
413 dst += 1;
414 count--;
reed@google.com981d4792011-03-09 12:55:47 +0000415 }
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000416 }
417}
reed@google.com981d4792011-03-09 12:55:47 +0000418
reed@google.comedb606c2011-10-18 13:56:50 +0000419void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
420 size_t maskRB, SkColor origColor,
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000421 int width, int height) {
reed@google.comee467ee2011-03-09 13:23:57 +0000422 SkPMColor color = SkPreMultiplyColor(origColor);
reed@google.com981d4792011-03-09 12:55:47 +0000423 size_t dstOffset = dstRB - (width << 2);
424 size_t maskOffset = maskRB - width;
425 SkPMColor* dst = (SkPMColor *)device;
reed@google.comedb606c2011-10-18 13:56:50 +0000426 const uint8_t* mask = (const uint8_t*)maskPtr;
reed@google.com981d4792011-03-09 12:55:47 +0000427 do {
428 int count = width;
429 if (count >= 4) {
430 while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
431 *dst = SkBlendARGB32(color, *dst, *mask);
432 mask++;
433 dst++;
434 count--;
435 }
436 __m128i *d = reinterpret_cast<__m128i*>(dst);
437 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
438 __m128i c_256 = _mm_set1_epi16(256);
439 __m128i c_1 = _mm_set1_epi16(1);
440 __m128i src_pixel = _mm_set1_epi32(color);
441 while (count >= 4) {
442 // Load 4 pixels each of src and dest.
443 __m128i dst_pixel = _mm_load_si128(d);
444
445 //set the aphla value
446 __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\
447 0, *(mask+3),0, \
448 *(mask+2),0, *(mask+2),\
449 0,*(mask+1), 0,*(mask+1),\
450 0, *mask,0,*mask);
451
452 //call SkAlpha255To256()
453 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
454
455 // Get red and blue pixels into lower byte of each word.
456 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
457 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
458
459 // Get alpha and green into lower byte of each word.
460 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
461 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
462
463 // Put per-pixel alpha in low byte of each word.
464 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
465 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
466
467 // dst_alpha = dst_alpha * src_scale
468 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
469
470 // Divide by 256.
471 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
472
473 // Subtract alphas from 256, to get 1..256
474 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
475 // Multiply red and blue by dst pixel alpha.
476 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
477 // Multiply alpha and green by dst pixel alpha.
478 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
479
480 // Multiply red and blue by global alpha.
481 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
482 // Multiply alpha and green by global alpha.
483 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
484 // Divide by 256.
485 dst_rb = _mm_srli_epi16(dst_rb, 8);
486 src_rb = _mm_srli_epi16(src_rb, 8);
487
488 // Mask out low bits (goodies already in the right place; no need to divide)
489 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
490 src_ag = _mm_andnot_si128(rb_mask, src_ag);
491
492 // Combine back into RGBA.
493 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
494 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
495
496 // Add two pixels into result.
497 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
498 _mm_store_si128(d, result);
499 // load the next 4 pixel
500 mask = mask + 4;
501 d++;
502 count -= 4;
503 }
504 dst = reinterpret_cast<SkPMColor *>(d);
505 }
506 while(count > 0) {
507 *dst= SkBlendARGB32(color, *dst, *mask);
508 dst += 1;
509 mask++;
510 count --;
511 }
512 dst = (SkPMColor *)((char*)dst + dstOffset);
513 mask += maskOffset;
514 } while (--height != 0);
515}
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000516
// Shift amounts that move the top 5 bits of each RGB16 mask component onto
// the top 5 bits of the matching SkPMColor component. A positive value means
// a left shift, a negative value a right shift, zero no shift at all.
// The RGB16 component order of the mask may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Red: apply the shift to all four 32-bit lanes. The result is not masked.
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

// Green: same scheme as red.
#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

// Blue: same scheme as red.
#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
547
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000548static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
549 __m128i &mask, __m128i &srcA) {
550 // In the following comments, the components of src, dst and mask are
551 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
552 // by an R, G, B, or A suffix. Components of one of the four pixels that
553 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
554 // example is the blue channel of the second destination pixel. Memory
555 // layout is shown for an ARGB byte order in a color value.
556
557 // src and srcA store 8-bit values interleaved with zeros.
558 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
559 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
560 // srcA, 0, srcA, 0, srcA, 0, srcA, 0)
561 // mask stores 16-bit values (compressed three channels) interleaved with zeros.
562 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
563 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
564 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
565
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000566 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000567 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000568 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
569 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000570
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000571 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000572 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
573 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000574
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000575 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000576 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
577 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000578
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000579 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000580 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
581 // 8-bit position
582 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
583 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000584 mask = _mm_or_si128(_mm_or_si128(r, g), b);
585
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000586 // Interleave R,G,B into the lower byte of word.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000587 // i.e. split the sixteen 8-bit values from mask into two sets of eight
588 // 16-bit values, padded by zero.
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000589 __m128i maskLo, maskHi;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000590 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000591 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000592 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000593 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
594
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000595 // Upscale from 0..31 to 0..32
596 // (allows to replace division by left-shift further down)
597 // Left-shift each component by 4 and add the result back to that component,
598 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000599 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
600 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
601
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000602 // Multiply each component of maskLo and maskHi by srcA
603 maskLo = _mm_mullo_epi16(maskLo, srcA);
604 maskHi = _mm_mullo_epi16(maskHi, srcA);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000605
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000606 // Left shift mask components by 8 (divide by 256)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000607 maskLo = _mm_srli_epi16(maskLo, 8);
608 maskHi = _mm_srli_epi16(maskHi, 8);
609
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000610 // Interleave R,G,B into the lower byte of the word
611 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000612 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000613 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000614 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
615
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000616 // mask = (src - dst) * mask
617 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
618 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000619
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000620 // mask = (src - dst) * mask >> 5
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000621 maskLo = _mm_srai_epi16(maskLo, 5);
622 maskHi = _mm_srai_epi16(maskHi, 5);
623
624 // Add two pixels into result.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000625 // result = dst + ((src - dst) * mask >> 5)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000626 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
627 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
628
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000629 // Pack into 4 32bit dst pixels.
630 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
631 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
632 // clamping to 255 if necessary.
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000633 return _mm_packus_epi16(resultLo, resultHi);
634}
635
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000636static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000637 __m128i &mask) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000638 // In the following comments, the components of src, dst and mask are
639 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
640 // by an R, G, B, or A suffix. Components of one of the four pixels that
641 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
642 // example is the blue channel of the second destination pixel. Memory
643 // layout is shown for an ARGB byte order in a color value.
644
645 // src and srcA store 8-bit values interleaved with zeros.
646 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
647 // mask stores 16-bit values (shown as high and low bytes) interleaved with
648 // zeros
649 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
650 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
651
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000652 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000653 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000654 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
655 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000656
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000657 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000658 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
659 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000660
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000661 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000662 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
663 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000664
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000665 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000666 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
667 // 8-bit position
668 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
669 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000670 mask = _mm_or_si128(_mm_or_si128(r, g), b);
671
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000672 // Interleave R,G,B into the lower byte of word.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000673 // i.e. split the sixteen 8-bit values from mask into two sets of eight
674 // 16-bit values, padded by zero.
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000675 __m128i maskLo, maskHi;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000676 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000677 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000678 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000679 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
680
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000681 // Upscale from 0..31 to 0..32
682 // (allows to replace division by left-shift further down)
683 // Left-shift each component by 4 and add the result back to that component,
684 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000685 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
686 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
687
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000688 // Interleave R,G,B into the lower byte of the word
689 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000690 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000691 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000692 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
693
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000694 // mask = (src - dst) * mask
695 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
696 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000697
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000698 // mask = (src - dst) * mask >> 5
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000699 maskLo = _mm_srai_epi16(maskLo, 5);
700 maskHi = _mm_srai_epi16(maskHi, 5);
701
702 // Add two pixels into result.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000703 // result = dst + ((src - dst) * mask >> 5)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000704 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
705 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
706
bungeman@google.com27123cd2012-08-21 19:25:42 +0000707 // Pack into 4 32bit dst pixels and force opaque.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000708 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
709 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
710 // clamping to 255 if necessary. Set alpha components to 0xFF.
bungeman@google.com27123cd2012-08-21 19:25:42 +0000711 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
712 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000713}
714
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000715void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
716 SkColor src, int width, SkPMColor) {
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000717 if (width <= 0) {
718 return;
719 }
720
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000721 int srcA = SkColorGetA(src);
722 int srcR = SkColorGetR(src);
723 int srcG = SkColorGetG(src);
724 int srcB = SkColorGetB(src);
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000725
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000726 srcA = SkAlpha255To256(srcA);
727
728 if (width >= 4) {
729 SkASSERT(((size_t)dst & 0x03) == 0);
730 while (((size_t)dst & 0x0F) != 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000731 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
732 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000733 dst++;
734 width--;
735 }
736
737 __m128i *d = reinterpret_cast<__m128i*>(dst);
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000738 // Set alpha to 0xFF and replicate source four times in SSE register.
739 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
740 // Interleave with zeros to get two sets of four 16-bit values.
741 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
742 // Set srcA_sse to contain eight copies of srcA, padded with zero.
743 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
744 __m128i srcA_sse = _mm_set1_epi16(srcA);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000745 while (width >= 4) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000746 // Load four destination pixels into dst_sse.
747 __m128i dst_sse = _mm_load_si128(d);
748 // Load four 16-bit masks into lower half of mask_sse.
749 __m128i mask_sse = _mm_loadl_epi64(
750 reinterpret_cast<const __m128i*>(mask));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000751
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000752 // Check whether masks are equal to 0 and get the highest bit
753 // of each byte of result, if masks are all zero, we will get
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000754 // pack_cmp to 0xFFFF
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000755 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000756 _mm_setzero_si128()));
757
758 // if mask pixels are not all zero, we will blend the dst pixels
759 if (pack_cmp != 0xFFFF) {
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000760 // Unpack 4 16bit mask pixels to
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000761 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
762 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
763 mask_sse = _mm_unpacklo_epi16(mask_sse,
764 _mm_setzero_si128());
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000765
766 // Process 4 32bit dst pixels
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000767 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
768 mask_sse, srcA_sse);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000769 _mm_store_si128(d, result);
770 }
771
772 d++;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000773 mask += 4;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000774 width -= 4;
775 }
776
777 dst = reinterpret_cast<SkPMColor*>(d);
778 }
779
780 while (width > 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000781 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
782 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000783 dst++;
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000784 width--;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000785 }
786}
787
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000788void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
789 SkColor src, int width, SkPMColor opaqueDst) {
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000790 if (width <= 0) {
791 return;
792 }
793
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000794 int srcR = SkColorGetR(src);
795 int srcG = SkColorGetG(src);
796 int srcB = SkColorGetB(src);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000797
798 if (width >= 4) {
799 SkASSERT(((size_t)dst & 0x03) == 0);
800 while (((size_t)dst & 0x0F) != 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000801 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
802 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000803 dst++;
804 width--;
805 }
806
807 __m128i *d = reinterpret_cast<__m128i*>(dst);
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000808 // Set alpha to 0xFF and replicate source four times in SSE register.
809 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
810 // Set srcA_sse to contain eight copies of srcA, padded with zero.
811 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
812 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000813 while (width >= 4) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000814 // Load four destination pixels into dst_sse.
815 __m128i dst_sse = _mm_load_si128(d);
816 // Load four 16-bit masks into lower half of mask_sse.
817 __m128i mask_sse = _mm_loadl_epi64(
818 reinterpret_cast<const __m128i*>(mask));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000819
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000820 // Check whether masks are equal to 0 and get the highest bit
821 // of each byte of result, if masks are all zero, we will get
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000822 // pack_cmp to 0xFFFF
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000823 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000824 _mm_setzero_si128()));
825
826 // if mask pixels are not all zero, we will blend the dst pixels
827 if (pack_cmp != 0xFFFF) {
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000828 // Unpack 4 16bit mask pixels to
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000829 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
830 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
831 mask_sse = _mm_unpacklo_epi16(mask_sse,
832 _mm_setzero_si128());
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000833
834 // Process 4 32bit dst pixels
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000835 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
836 mask_sse);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000837 _mm_store_si128(d, result);
838 }
839
840 d++;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000841 mask += 4;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000842 width -= 4;
843 }
844
845 dst = reinterpret_cast<SkPMColor*>(d);
846 }
847
848 while (width > 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000849 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
850 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000851 dst++;
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000852 width--;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000853 }
854}
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000855
commit-bot@chromium.org39ce33a2014-02-24 04:23:39 +0000856/* SSE2 version of S32_D565_Opaque()
857 * portable version is in core/SkBlitRow_D16.cpp
858 */
859void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
860 const SkPMColor* SK_RESTRICT src, int count,
861 U8CPU alpha, int /*x*/, int /*y*/) {
862 SkASSERT(255 == alpha);
863
864 if (count <= 0) {
865 return;
866 }
867
868 if (count >= 8) {
869 while (((size_t)dst & 0x0F) != 0) {
870 SkPMColor c = *src++;
871 SkPMColorAssert(c);
872
873 *dst++ = SkPixel32ToPixel16_ToU16(c);
874 count--;
875 }
876
877 const __m128i* s = reinterpret_cast<const __m128i*>(src);
878 __m128i* d = reinterpret_cast<__m128i*>(dst);
879 __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK);
880 __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK);
881 __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK);
882
883 while (count >= 8) {
884 // Load 8 pixels of src.
885 __m128i src_pixel1 = _mm_loadu_si128(s++);
886 __m128i src_pixel2 = _mm_loadu_si128(s++);
887
888 // Calculate result r.
889 __m128i r1 = _mm_srli_epi32(src_pixel1,
890 SK_R32_SHIFT + (8 - SK_R16_BITS));
891 r1 = _mm_and_si128(r1, r16_mask);
892 __m128i r2 = _mm_srli_epi32(src_pixel2,
893 SK_R32_SHIFT + (8 - SK_R16_BITS));
894 r2 = _mm_and_si128(r2, r16_mask);
895 __m128i r = _mm_packs_epi32(r1, r2);
896
897 // Calculate result g.
898 __m128i g1 = _mm_srli_epi32(src_pixel1,
899 SK_G32_SHIFT + (8 - SK_G16_BITS));
900 g1 = _mm_and_si128(g1, g16_mask);
901 __m128i g2 = _mm_srli_epi32(src_pixel2,
902 SK_G32_SHIFT + (8 - SK_G16_BITS));
903 g2 = _mm_and_si128(g2, g16_mask);
904 __m128i g = _mm_packs_epi32(g1, g2);
905
906 // Calculate result b.
907 __m128i b1 = _mm_srli_epi32(src_pixel1,
908 SK_B32_SHIFT + (8 - SK_B16_BITS));
909 b1 = _mm_and_si128(b1, b16_mask);
910 __m128i b2 = _mm_srli_epi32(src_pixel2,
911 SK_B32_SHIFT + (8 - SK_B16_BITS));
912 b2 = _mm_and_si128(b2, b16_mask);
913 __m128i b = _mm_packs_epi32(b1, b2);
914
915 // Store 8 16-bit colors in dst.
916 __m128i d_pixel = SkPackRGB16_SSE(r, g, b);
917 _mm_store_si128(d++, d_pixel);
918 count -= 8;
919 }
920 src = reinterpret_cast<const SkPMColor*>(s);
921 dst = reinterpret_cast<uint16_t*>(d);
922 }
923
924 if (count > 0) {
925 do {
926 SkPMColor c = *src++;
927 SkPMColorAssert(c);
928 *dst++ = SkPixel32ToPixel16_ToU16(c);
929 } while (--count != 0);
930 }
931}
932
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000933/* SSE2 version of S32A_D565_Opaque()
934 * portable version is in core/SkBlitRow_D16.cpp
935 */
936void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
937 const SkPMColor* SK_RESTRICT src,
938 int count, U8CPU alpha, int /*x*/, int /*y*/) {
939 SkASSERT(255 == alpha);
940
941 if (count <= 0) {
942 return;
943 }
944
945 if (count >= 8) {
946 // Make dst 16 bytes alignment
947 while (((size_t)dst & 0x0F) != 0) {
948 SkPMColor c = *src++;
949 if (c) {
950 *dst = SkSrcOver32To16(c, *dst);
951 }
952 dst += 1;
953 count--;
954 }
955
956 const __m128i* s = reinterpret_cast<const __m128i*>(src);
957 __m128i* d = reinterpret_cast<__m128i*>(dst);
958 __m128i var255 = _mm_set1_epi16(255);
959 __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
960 __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
961 __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
962
963 while (count >= 8) {
964 // Load 8 pixels of src.
965 __m128i src_pixel1 = _mm_loadu_si128(s++);
966 __m128i src_pixel2 = _mm_loadu_si128(s++);
967
968 // Check whether src pixels are equal to 0 and get the highest bit
969 // of each byte of result, if src pixels are all zero, src_cmp1 and
970 // src_cmp2 will be 0xFFFF.
971 int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
972 _mm_setzero_si128()));
973 int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
974 _mm_setzero_si128()));
975 if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
976 d++;
977 count -= 8;
978 continue;
979 }
980
981 // Load 8 pixels of dst.
982 __m128i dst_pixel = _mm_load_si128(d);
983
984 // Extract A from src.
985 __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT));
986 sa1 = _mm_srli_epi32(sa1, 24);
987 __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT));
988 sa2 = _mm_srli_epi32(sa2, 24);
989 __m128i sa = _mm_packs_epi32(sa1, sa2);
990
991 // Extract R from src.
992 __m128i sr1 = _mm_slli_epi32(src_pixel1,(24 - SK_R32_SHIFT));
993 sr1 = _mm_srli_epi32(sr1, 24);
994 __m128i sr2 = _mm_slli_epi32(src_pixel2,(24 - SK_R32_SHIFT));
995 sr2 = _mm_srli_epi32(sr2, 24);
996 __m128i sr = _mm_packs_epi32(sr1, sr2);
997
998 // Extract G from src.
999 __m128i sg1 = _mm_slli_epi32(src_pixel1,(24 - SK_G32_SHIFT));
1000 sg1 = _mm_srli_epi32(sg1, 24);
1001 __m128i sg2 = _mm_slli_epi32(src_pixel2,(24 - SK_G32_SHIFT));
1002 sg2 = _mm_srli_epi32(sg2, 24);
1003 __m128i sg = _mm_packs_epi32(sg1, sg2);
1004
1005 // Extract B from src.
1006 __m128i sb1 = _mm_slli_epi32(src_pixel1,(24 - SK_B32_SHIFT));
1007 sb1 = _mm_srli_epi32(sb1, 24);
1008 __m128i sb2 = _mm_slli_epi32(src_pixel2,(24 - SK_B32_SHIFT));
1009 sb2 = _mm_srli_epi32(sb2, 24);
1010 __m128i sb = _mm_packs_epi32(sb1, sb2);
1011
1012 // Extract R G B from dst.
1013 __m128i dr = _mm_srli_epi16(dst_pixel,SK_R16_SHIFT);
1014 dr = _mm_and_si128(dr, r16_mask);
1015 __m128i dg = _mm_srli_epi16(dst_pixel,SK_G16_SHIFT);
1016 dg = _mm_and_si128(dg, g16_mask);
1017 __m128i db = _mm_srli_epi16(dst_pixel,SK_B16_SHIFT);
1018 db = _mm_and_si128(db, b16_mask);
1019
1020 __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
1021
1022 // Calculate R G B of result.
1023 // Original algorithm is in SkSrcOver32To16().
1024 dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE(dr, isa, SK_R16_BITS));
1025 dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
1026 dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE(dg, isa, SK_G16_BITS));
1027 dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
1028 db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE(db, isa, SK_B16_BITS));
1029 db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
1030
1031 // Pack R G B into 16-bit color.
1032 __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db);
1033
1034 // Store 8 16-bit colors in dst.
1035 _mm_store_si128(d++, d_pixel);
1036 count -= 8;
1037 }
1038
1039 src = reinterpret_cast<const SkPMColor*>(s);
1040 dst = reinterpret_cast<uint16_t*>(d);
1041 }
1042
1043 if (count > 0) {
1044 do {
1045 SkPMColor c = *src++;
1046 SkPMColorAssert(c);
1047 if (c) {
1048 *dst = SkSrcOver32To16(c, *dst);
1049 }
1050 dst += 1;
1051 } while (--count != 0);
1052 }
1053}