blob: 4aa08e81657a63388535b8d1a454bf129e260b0f [file] [log] [blame]
senorblanco@chromium.org92727612009-11-04 20:51:06 +00001/*
tomhudson@google.com98a5b422012-02-28 16:15:26 +00002 * Copyright 2012 The Android Open Source Project
epoger@google.comec3ed6a2011-07-28 14:26:00 +00003 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
senorblanco@chromium.org92727612009-11-04 20:51:06 +00006 */
7
epoger@google.comec3ed6a2011-07-28 14:26:00 +00008
senorblanco@chromium.org4e753552009-11-16 21:09:00 +00009#include "SkBlitRow_opts_SSE2.h"
caryclark@google.com83ecdc32012-06-06 12:10:26 +000010#include "SkBitmapProcState_opts_SSE2.h"
senorblanco@chromium.org92727612009-11-04 20:51:06 +000011#include "SkColorPriv.h"
commit-bot@chromium.org47591072014-02-19 03:09:52 +000012#include "SkColor_opts_SSE2.h"
commit-bot@chromium.org27580472014-03-07 03:25:32 +000013#include "SkDither.h"
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +000014#include "SkUtils.h"
senorblanco@chromium.org92727612009-11-04 20:51:06 +000015
16#include <emmintrin.h>
17
senorblanco@chromium.org92727612009-11-04 20:51:06 +000018/* SSE2 version of S32_Blend_BlitRow32()
19 * portable version is in core/SkBlitRow_D32.cpp
20 */
senorblanco@chromium.org4e753552009-11-16 21:09:00 +000021void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
22 const SkPMColor* SK_RESTRICT src,
23 int count, U8CPU alpha) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +000024 SkASSERT(alpha <= 255);
25 if (count <= 0) {
26 return;
27 }
28
29 uint32_t src_scale = SkAlpha255To256(alpha);
30 uint32_t dst_scale = 256 - src_scale;
31
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000032 if (count >= 4) {
33 SkASSERT(((size_t)dst & 0x03) == 0);
34 while (((size_t)dst & 0x0F) != 0) {
35 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
36 src++;
37 dst++;
38 count--;
39 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +000040
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000041 const __m128i *s = reinterpret_cast<const __m128i*>(src);
42 __m128i *d = reinterpret_cast<__m128i*>(dst);
43 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
tomhudson@google.com98a5b422012-02-28 16:15:26 +000044 __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);
45
46 // Move scale factors to upper byte of word
47 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
48 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000049 while (count >= 4) {
50 // Load 4 pixels each of src and dest.
51 __m128i src_pixel = _mm_loadu_si128(s);
52 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000053
tomhudson@google.com98a5b422012-02-28 16:15:26 +000054 // Interleave Atom port 0/1 operations based on the execution port
55 // constraints that multiply can only be executed on port 0 (while
56 // boolean operations can be executed on either port 0 or port 1)
57 // because GCC currently doesn't do a good job scheduling
58 // instructions based on these constraints.
59
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000060 // Get red and blue pixels into lower byte of each word.
tomhudson@google.com98a5b422012-02-28 16:15:26 +000061 // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000062 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000063
tomhudson@google.com98a5b422012-02-28 16:15:26 +000064 // Multiply by scale.
65 // (4 x (0, rs.h, 0, bs.h))
66 // where rs.h stands for the higher byte of r * scale, and
67 // bs.h the higher byte of b * scale.
68 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
69
70 // Get alpha and green pixels into higher byte of each word.
71 // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
72 __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000073
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000074 // Multiply by scale.
tomhudson@google.com98a5b422012-02-28 16:15:26 +000075 // (4 x (as.h, as.l, gs.h, gs.l))
76 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000077
tomhudson@google.com98a5b422012-02-28 16:15:26 +000078 // Clear the lower byte of the a*scale and g*scale results
79 // (4 x (as.h, 0, gs.h, 0))
80 src_ag = _mm_and_si128(src_ag, ag_mask);
81
82 // Operations the destination pixels are the same as on the
83 // source pixels. See the comments above.
84 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
85 dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
86 __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
87 dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
88 dst_ag = _mm_and_si128(dst_ag, ag_mask);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000089
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000090 // Combine back into RGBA.
tomhudson@google.com98a5b422012-02-28 16:15:26 +000091 // (4 x (as.h, rs.h, gs.h, bs.h))
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000092 src_pixel = _mm_or_si128(src_rb, src_ag);
93 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
94
95 // Add result
96 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
97 _mm_store_si128(d, result);
98 s++;
99 d++;
100 count -= 4;
101 }
102 src = reinterpret_cast<const SkPMColor*>(s);
103 dst = reinterpret_cast<SkPMColor*>(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000104 }
105
senorblanco@chromium.org4e753552009-11-16 21:09:00 +0000106 while (count > 0) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000107 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
108 src++;
109 dst++;
110 count--;
111 }
112}
113
senorblanco@chromium.org4e753552009-11-16 21:09:00 +0000114void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
115 const SkPMColor* SK_RESTRICT src,
116 int count, U8CPU alpha) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000117 SkASSERT(alpha == 255);
118 if (count <= 0) {
119 return;
120 }
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000121
122 if (count >= 4) {
123 SkASSERT(((size_t)dst & 0x03) == 0);
124 while (((size_t)dst & 0x0F) != 0) {
125 *dst = SkPMSrcOver(*src, *dst);
126 src++;
127 dst++;
128 count--;
129 }
130
131 const __m128i *s = reinterpret_cast<const __m128i*>(src);
132 __m128i *d = reinterpret_cast<__m128i*>(dst);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000133#ifdef SK_USE_ACCURATE_BLENDING
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000134 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
135 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)
136 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)
137 while (count >= 4) {
138 // Load 4 pixels
139 __m128i src_pixel = _mm_loadu_si128(s);
140 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000141
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000142 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
senorblanco@chromium.orgf3f0bd72009-12-10 22:46:31 +0000143 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000144 // Shift alphas down to lower 8 bits of each quad.
145 __m128i alpha = _mm_srli_epi32(src_pixel, 24);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000146
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000147 // Copy alpha to upper 3rd byte of each quad
148 alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000149
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000150 // Subtract alphas from 255, to get 0..255
151 alpha = _mm_sub_epi16(c_255, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000152
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000153 // Multiply by red and blue by src alpha.
154 dst_rb = _mm_mullo_epi16(dst_rb, alpha);
155 // Multiply by alpha and green by src alpha.
156 dst_ag = _mm_mullo_epi16(dst_ag, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000157
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000158 // dst_rb_low = (dst_rb >> 8)
159 __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
160 __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000161
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000162 // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
163 dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
164 dst_rb = _mm_add_epi16(dst_rb, c_128);
165 dst_rb = _mm_srli_epi16(dst_rb, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000166
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000167 // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
168 dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
169 dst_ag = _mm_add_epi16(dst_ag, c_128);
170 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000171
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000172 // Combine back into RGBA.
173 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000174
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000175 // Add result
176 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
177 _mm_store_si128(d, result);
178 s++;
179 d++;
180 count -= 4;
181 }
182 #else
183 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
184 __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)
185 while (count >= 4) {
186 // Load 4 pixels
187 __m128i src_pixel = _mm_loadu_si128(s);
188 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000189
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000190 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
senorblanco@chromium.orgf3f0bd72009-12-10 22:46:31 +0000191 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000192
senorblanco@chromium.orgf3f0bd72009-12-10 22:46:31 +0000193 // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word)
194 __m128i alpha = _mm_srli_epi16(src_pixel, 8);
195
196 // (a0, a0, a1, a1, a2, g2, a3, g3)
197 alpha = _mm_shufflehi_epi16(alpha, 0xF5);
198
199 // (a0, a0, a1, a1, a2, a2, a3, a3)
200 alpha = _mm_shufflelo_epi16(alpha, 0xF5);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000201
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000202 // Subtract alphas from 256, to get 1..256
203 alpha = _mm_sub_epi16(c_256, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000204
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000205 // Multiply by red and blue by src alpha.
206 dst_rb = _mm_mullo_epi16(dst_rb, alpha);
207 // Multiply by alpha and green by src alpha.
208 dst_ag = _mm_mullo_epi16(dst_ag, alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000209
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000210 // Divide by 256.
211 dst_rb = _mm_srli_epi16(dst_rb, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000212
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000213 // Mask out high bits (already in the right place)
214 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000215
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000216 // Combine back into RGBA.
217 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000218
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000219 // Add result
220 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
221 _mm_store_si128(d, result);
222 s++;
223 d++;
224 count -= 4;
225 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000226#endif
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000227 src = reinterpret_cast<const SkPMColor*>(s);
228 dst = reinterpret_cast<SkPMColor*>(d);
229 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000230
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000231 while (count > 0) {
232 *dst = SkPMSrcOver(*src, *dst);
233 src++;
234 dst++;
235 count--;
236 }
237}
238
senorblanco@chromium.org4e753552009-11-16 21:09:00 +0000239void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
240 const SkPMColor* SK_RESTRICT src,
241 int count, U8CPU alpha) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000242 SkASSERT(alpha <= 255);
243 if (count <= 0) {
244 return;
245 }
246
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000247 if (count >= 4) {
248 while (((size_t)dst & 0x0F) != 0) {
249 *dst = SkBlendARGB32(*src, *dst, alpha);
250 src++;
251 dst++;
252 count--;
253 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000254
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000255 uint32_t src_scale = SkAlpha255To256(alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000256
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000257 const __m128i *s = reinterpret_cast<const __m128i*>(src);
258 __m128i *d = reinterpret_cast<__m128i*>(dst);
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000259 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000260 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
261 __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit)
262 while (count >= 4) {
263 // Load 4 pixels each of src and dest.
264 __m128i src_pixel = _mm_loadu_si128(s);
265 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000266
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000267 // Get red and blue pixels into lower byte of each word.
268 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
269 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000270
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000271 // Get alpha and green into lower byte of each word.
272 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
273 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000274
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000275 // Put per-pixel alpha in low byte of each word.
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000276 // After the following two statements, the dst_alpha looks like
277 // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000278 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
279 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000280
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000281 // dst_alpha = dst_alpha * src_scale
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000282 // Because src_scales are in the higher byte of each word and
283 // we use mulhi here, the resulting alpha values are already
284 // in the right place and don't need to be divided by 256.
285 // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
286 dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000287
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000288 // Subtract alphas from 256, to get 1..256
289 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000290
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000291 // Multiply red and blue by dst pixel alpha.
292 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
293 // Multiply alpha and green by dst pixel alpha.
294 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000295
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000296 // Multiply red and blue by global alpha.
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000297 // (4 x (0, rs.h, 0, bs.h))
298 // where rs.h stands for the higher byte of r * src_scale,
299 // and bs.h the higher byte of b * src_scale.
300 // Again, because we use mulhi, the resuling red and blue
301 // values are already in the right place and don't need to
302 // be divided by 256.
303 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000304 // Multiply alpha and green by global alpha.
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000305 // (4 x (0, as.h, 0, gs.h))
306 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000307
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000308 // Divide by 256.
309 dst_rb = _mm_srli_epi16(dst_rb, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000310
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000311 // Mask out low bits (goodies already in the right place; no need to divide)
312 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
tomhudson@google.com98a5b422012-02-28 16:15:26 +0000313 // Shift alpha and green to higher byte of each word.
314 // (4 x (as.h, 0, gs.h, 0))
315 src_ag = _mm_slli_epi16(src_ag, 8);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000316
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000317 // Combine back into RGBA.
318 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
319 src_pixel = _mm_or_si128(src_rb, src_ag);
320
321 // Add two pixels into result.
322 __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
323 _mm_store_si128(d, result);
324 s++;
325 d++;
326 count -= 4;
327 }
328 src = reinterpret_cast<const SkPMColor*>(s);
329 dst = reinterpret_cast<SkPMColor*>(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000330 }
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +0000331
senorblanco@chromium.org92727612009-11-04 20:51:06 +0000332 while (count > 0) {
333 *dst = SkBlendARGB32(*src, *dst, alpha);
334 src++;
335 dst++;
336 count--;
337 }
338}
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000339
340/* SSE2 version of Color32()
341 * portable version is in core/SkBlitRow_D32.cpp
342 */
343void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
344 SkPMColor color) {
345
346 if (count <= 0) {
347 return;
348 }
349
350 if (0 == color) {
351 if (src != dst) {
352 memcpy(dst, src, count * sizeof(SkPMColor));
353 }
reed@google.comc909a1e2011-10-25 19:07:23 +0000354 return;
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000355 }
356
357 unsigned colorA = SkGetPackedA32(color);
358 if (255 == colorA) {
359 sk_memset32(dst, color, count);
360 } else {
361 unsigned scale = 256 - SkAlpha255To256(colorA);
362
363 if (count >= 4) {
364 SkASSERT(((size_t)dst & 0x03) == 0);
365 while (((size_t)dst & 0x0F) != 0) {
366 *dst = color + SkAlphaMulQ(*src, scale);
367 src++;
368 dst++;
369 count--;
370 }
371
372 const __m128i *s = reinterpret_cast<const __m128i*>(src);
373 __m128i *d = reinterpret_cast<__m128i*>(dst);
374 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
375 __m128i src_scale_wide = _mm_set1_epi16(scale);
376 __m128i color_wide = _mm_set1_epi32(color);
377 while (count >= 4) {
378 // Load 4 pixels each of src and dest.
379 __m128i src_pixel = _mm_loadu_si128(s);
380
381 // Get red and blue pixels into lower byte of each word.
382 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
reed@google.com981d4792011-03-09 12:55:47 +0000383
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000384 // Get alpha and green into lower byte of each word.
385 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
386
387 // Multiply by scale.
388 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
389 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
390
391 // Divide by 256.
392 src_rb = _mm_srli_epi16(src_rb, 8);
393 src_ag = _mm_andnot_si128(rb_mask, src_ag);
394
395 // Combine back into RGBA.
396 src_pixel = _mm_or_si128(src_rb, src_ag);
397
398 // Add color to result.
399 __m128i result = _mm_add_epi8(color_wide, src_pixel);
400
401 // Store result.
402 _mm_store_si128(d, result);
403 s++;
404 d++;
405 count -= 4;
406 }
407 src = reinterpret_cast<const SkPMColor*>(s);
408 dst = reinterpret_cast<SkPMColor*>(d);
409 }
410
411 while (count > 0) {
412 *dst = color + SkAlphaMulQ(*src, scale);
413 src += 1;
414 dst += 1;
415 count--;
reed@google.com981d4792011-03-09 12:55:47 +0000416 }
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000417 }
418}
reed@google.com981d4792011-03-09 12:55:47 +0000419
reed@google.comedb606c2011-10-18 13:56:50 +0000420void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
421 size_t maskRB, SkColor origColor,
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000422 int width, int height) {
reed@google.comee467ee2011-03-09 13:23:57 +0000423 SkPMColor color = SkPreMultiplyColor(origColor);
reed@google.com981d4792011-03-09 12:55:47 +0000424 size_t dstOffset = dstRB - (width << 2);
425 size_t maskOffset = maskRB - width;
426 SkPMColor* dst = (SkPMColor *)device;
reed@google.comedb606c2011-10-18 13:56:50 +0000427 const uint8_t* mask = (const uint8_t*)maskPtr;
reed@google.com981d4792011-03-09 12:55:47 +0000428 do {
429 int count = width;
430 if (count >= 4) {
431 while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
432 *dst = SkBlendARGB32(color, *dst, *mask);
433 mask++;
434 dst++;
435 count--;
436 }
437 __m128i *d = reinterpret_cast<__m128i*>(dst);
438 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
439 __m128i c_256 = _mm_set1_epi16(256);
440 __m128i c_1 = _mm_set1_epi16(1);
441 __m128i src_pixel = _mm_set1_epi32(color);
442 while (count >= 4) {
443 // Load 4 pixels each of src and dest.
444 __m128i dst_pixel = _mm_load_si128(d);
445
446 //set the aphla value
447 __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\
448 0, *(mask+3),0, \
449 *(mask+2),0, *(mask+2),\
450 0,*(mask+1), 0,*(mask+1),\
451 0, *mask,0,*mask);
452
453 //call SkAlpha255To256()
454 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);
455
456 // Get red and blue pixels into lower byte of each word.
457 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
458 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);
459
460 // Get alpha and green into lower byte of each word.
461 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
462 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);
463
464 // Put per-pixel alpha in low byte of each word.
465 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
466 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);
467
468 // dst_alpha = dst_alpha * src_scale
469 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);
470
471 // Divide by 256.
472 dst_alpha = _mm_srli_epi16(dst_alpha, 8);
473
474 // Subtract alphas from 256, to get 1..256
475 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
476 // Multiply red and blue by dst pixel alpha.
477 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
478 // Multiply alpha and green by dst pixel alpha.
479 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);
480
481 // Multiply red and blue by global alpha.
482 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
483 // Multiply alpha and green by global alpha.
484 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
485 // Divide by 256.
486 dst_rb = _mm_srli_epi16(dst_rb, 8);
487 src_rb = _mm_srli_epi16(src_rb, 8);
488
489 // Mask out low bits (goodies already in the right place; no need to divide)
490 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
491 src_ag = _mm_andnot_si128(rb_mask, src_ag);
492
493 // Combine back into RGBA.
494 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
495 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);
496
497 // Add two pixels into result.
498 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
499 _mm_store_si128(d, result);
500 // load the next 4 pixel
501 mask = mask + 4;
502 d++;
503 count -= 4;
504 }
505 dst = reinterpret_cast<SkPMColor *>(d);
506 }
507 while(count > 0) {
508 *dst= SkBlendARGB32(color, *dst, *mask);
509 dst += 1;
510 mask++;
511 count --;
512 }
513 dst = (SkPMColor *)((char*)dst + dstOffset);
514 mask += maskOffset;
515 } while (--height != 0);
516}
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000517
// Shift amounts that move the top 5 bits of each component of a 16-bit mask
// pixel onto the low 5 bits of the matching 8-bit component slot of an
// SkPMColor. A positive value means shift left, a negative one shift right.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Each helper below applies the corresponding shift to all four 32-bit lanes
// of x. The result is "unmasked": bits of the other components are still
// present and must be cleared by the caller.

#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
548
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000549static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
550 __m128i &mask, __m128i &srcA) {
551 // In the following comments, the components of src, dst and mask are
552 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
553 // by an R, G, B, or A suffix. Components of one of the four pixels that
554 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
555 // example is the blue channel of the second destination pixel. Memory
556 // layout is shown for an ARGB byte order in a color value.
557
558 // src and srcA store 8-bit values interleaved with zeros.
559 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
560 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
561 // srcA, 0, srcA, 0, srcA, 0, srcA, 0)
562 // mask stores 16-bit values (compressed three channels) interleaved with zeros.
563 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
564 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
565 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
566
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000567 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000568 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000569 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
570 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000571
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000572 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000573 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
574 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000575
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000576 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000577 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
578 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000579
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000580 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000581 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
582 // 8-bit position
583 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
584 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000585 mask = _mm_or_si128(_mm_or_si128(r, g), b);
586
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000587 // Interleave R,G,B into the lower byte of word.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000588 // i.e. split the sixteen 8-bit values from mask into two sets of eight
589 // 16-bit values, padded by zero.
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000590 __m128i maskLo, maskHi;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000591 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000592 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000593 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000594 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
595
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000596 // Upscale from 0..31 to 0..32
597 // (allows to replace division by left-shift further down)
598 // Left-shift each component by 4 and add the result back to that component,
599 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000600 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
601 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
602
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000603 // Multiply each component of maskLo and maskHi by srcA
604 maskLo = _mm_mullo_epi16(maskLo, srcA);
605 maskHi = _mm_mullo_epi16(maskHi, srcA);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000606
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000607 // Left shift mask components by 8 (divide by 256)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000608 maskLo = _mm_srli_epi16(maskLo, 8);
609 maskHi = _mm_srli_epi16(maskHi, 8);
610
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000611 // Interleave R,G,B into the lower byte of the word
612 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000613 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000614 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000615 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
616
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000617 // mask = (src - dst) * mask
618 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
619 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000620
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000621 // mask = (src - dst) * mask >> 5
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000622 maskLo = _mm_srai_epi16(maskLo, 5);
623 maskHi = _mm_srai_epi16(maskHi, 5);
624
625 // Add two pixels into result.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000626 // result = dst + ((src - dst) * mask >> 5)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000627 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
628 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
629
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000630 // Pack into 4 32bit dst pixels.
631 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
632 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
633 // clamping to 255 if necessary.
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000634 return _mm_packus_epi16(resultLo, resultHi);
635}
636
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000637static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000638 __m128i &mask) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000639 // In the following comments, the components of src, dst and mask are
640 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
641 // by an R, G, B, or A suffix. Components of one of the four pixels that
642 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
643 // example is the blue channel of the second destination pixel. Memory
644 // layout is shown for an ARGB byte order in a color value.
645
646 // src and srcA store 8-bit values interleaved with zeros.
647 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
648 // mask stores 16-bit values (shown as high and low bytes) interleaved with
649 // zeros
650 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
651 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
652
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000653 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000654 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000655 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
656 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000657
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000658 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000659 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
660 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000661
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000662 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000663 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
664 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000665
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000666 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000667 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
668 // 8-bit position
669 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
670 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000671 mask = _mm_or_si128(_mm_or_si128(r, g), b);
672
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000673 // Interleave R,G,B into the lower byte of word.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000674 // i.e. split the sixteen 8-bit values from mask into two sets of eight
675 // 16-bit values, padded by zero.
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000676 __m128i maskLo, maskHi;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000677 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000678 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000679 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000680 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
681
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000682 // Upscale from 0..31 to 0..32
683 // (allows to replace division by left-shift further down)
684 // Left-shift each component by 4 and add the result back to that component,
685 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000686 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
687 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
688
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000689 // Interleave R,G,B into the lower byte of the word
690 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000691 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000692 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000693 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
694
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000695 // mask = (src - dst) * mask
696 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
697 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000698
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000699 // mask = (src - dst) * mask >> 5
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000700 maskLo = _mm_srai_epi16(maskLo, 5);
701 maskHi = _mm_srai_epi16(maskHi, 5);
702
703 // Add two pixels into result.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000704 // result = dst + ((src - dst) * mask >> 5)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000705 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
706 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
707
bungeman@google.com27123cd2012-08-21 19:25:42 +0000708 // Pack into 4 32bit dst pixels and force opaque.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000709 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
710 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
711 // clamping to 255 if necessary. Set alpha components to 0xFF.
bungeman@google.com27123cd2012-08-21 19:25:42 +0000712 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
713 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000714}
715
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000716void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
717 SkColor src, int width, SkPMColor) {
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000718 if (width <= 0) {
719 return;
720 }
721
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000722 int srcA = SkColorGetA(src);
723 int srcR = SkColorGetR(src);
724 int srcG = SkColorGetG(src);
725 int srcB = SkColorGetB(src);
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000726
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000727 srcA = SkAlpha255To256(srcA);
728
729 if (width >= 4) {
730 SkASSERT(((size_t)dst & 0x03) == 0);
731 while (((size_t)dst & 0x0F) != 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000732 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
733 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000734 dst++;
735 width--;
736 }
737
738 __m128i *d = reinterpret_cast<__m128i*>(dst);
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000739 // Set alpha to 0xFF and replicate source four times in SSE register.
740 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
741 // Interleave with zeros to get two sets of four 16-bit values.
742 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
743 // Set srcA_sse to contain eight copies of srcA, padded with zero.
744 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
745 __m128i srcA_sse = _mm_set1_epi16(srcA);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000746 while (width >= 4) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000747 // Load four destination pixels into dst_sse.
748 __m128i dst_sse = _mm_load_si128(d);
749 // Load four 16-bit masks into lower half of mask_sse.
750 __m128i mask_sse = _mm_loadl_epi64(
751 reinterpret_cast<const __m128i*>(mask));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000752
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000753 // Check whether masks are equal to 0 and get the highest bit
754 // of each byte of result, if masks are all zero, we will get
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000755 // pack_cmp to 0xFFFF
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000756 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000757 _mm_setzero_si128()));
758
759 // if mask pixels are not all zero, we will blend the dst pixels
760 if (pack_cmp != 0xFFFF) {
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000761 // Unpack 4 16bit mask pixels to
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000762 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
763 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
764 mask_sse = _mm_unpacklo_epi16(mask_sse,
765 _mm_setzero_si128());
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000766
767 // Process 4 32bit dst pixels
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000768 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
769 mask_sse, srcA_sse);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000770 _mm_store_si128(d, result);
771 }
772
773 d++;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000774 mask += 4;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000775 width -= 4;
776 }
777
778 dst = reinterpret_cast<SkPMColor*>(d);
779 }
780
781 while (width > 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000782 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
783 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000784 dst++;
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000785 width--;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000786 }
787}
788
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000789void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
790 SkColor src, int width, SkPMColor opaqueDst) {
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000791 if (width <= 0) {
792 return;
793 }
794
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000795 int srcR = SkColorGetR(src);
796 int srcG = SkColorGetG(src);
797 int srcB = SkColorGetB(src);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000798
799 if (width >= 4) {
800 SkASSERT(((size_t)dst & 0x03) == 0);
801 while (((size_t)dst & 0x0F) != 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000802 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
803 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000804 dst++;
805 width--;
806 }
807
808 __m128i *d = reinterpret_cast<__m128i*>(dst);
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000809 // Set alpha to 0xFF and replicate source four times in SSE register.
810 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
811 // Set srcA_sse to contain eight copies of srcA, padded with zero.
812 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
813 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000814 while (width >= 4) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000815 // Load four destination pixels into dst_sse.
816 __m128i dst_sse = _mm_load_si128(d);
817 // Load four 16-bit masks into lower half of mask_sse.
818 __m128i mask_sse = _mm_loadl_epi64(
819 reinterpret_cast<const __m128i*>(mask));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000820
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000821 // Check whether masks are equal to 0 and get the highest bit
822 // of each byte of result, if masks are all zero, we will get
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000823 // pack_cmp to 0xFFFF
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000824 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000825 _mm_setzero_si128()));
826
827 // if mask pixels are not all zero, we will blend the dst pixels
828 if (pack_cmp != 0xFFFF) {
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000829 // Unpack 4 16bit mask pixels to
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000830 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
831 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
832 mask_sse = _mm_unpacklo_epi16(mask_sse,
833 _mm_setzero_si128());
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000834
835 // Process 4 32bit dst pixels
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000836 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
837 mask_sse);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000838 _mm_store_si128(d, result);
839 }
840
841 d++;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000842 mask += 4;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000843 width -= 4;
844 }
845
846 dst = reinterpret_cast<SkPMColor*>(d);
847 }
848
849 while (width > 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000850 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
851 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000852 dst++;
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000853 width--;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000854 }
855}
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000856
commit-bot@chromium.org39ce33a2014-02-24 04:23:39 +0000857/* SSE2 version of S32_D565_Opaque()
858 * portable version is in core/SkBlitRow_D16.cpp
859 */
860void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
861 const SkPMColor* SK_RESTRICT src, int count,
862 U8CPU alpha, int /*x*/, int /*y*/) {
863 SkASSERT(255 == alpha);
864
865 if (count <= 0) {
866 return;
867 }
868
869 if (count >= 8) {
870 while (((size_t)dst & 0x0F) != 0) {
871 SkPMColor c = *src++;
872 SkPMColorAssert(c);
873
874 *dst++ = SkPixel32ToPixel16_ToU16(c);
875 count--;
876 }
877
878 const __m128i* s = reinterpret_cast<const __m128i*>(src);
879 __m128i* d = reinterpret_cast<__m128i*>(dst);
880 __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK);
881 __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK);
882 __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK);
883
884 while (count >= 8) {
885 // Load 8 pixels of src.
886 __m128i src_pixel1 = _mm_loadu_si128(s++);
887 __m128i src_pixel2 = _mm_loadu_si128(s++);
888
889 // Calculate result r.
890 __m128i r1 = _mm_srli_epi32(src_pixel1,
891 SK_R32_SHIFT + (8 - SK_R16_BITS));
892 r1 = _mm_and_si128(r1, r16_mask);
893 __m128i r2 = _mm_srli_epi32(src_pixel2,
894 SK_R32_SHIFT + (8 - SK_R16_BITS));
895 r2 = _mm_and_si128(r2, r16_mask);
896 __m128i r = _mm_packs_epi32(r1, r2);
897
898 // Calculate result g.
899 __m128i g1 = _mm_srli_epi32(src_pixel1,
900 SK_G32_SHIFT + (8 - SK_G16_BITS));
901 g1 = _mm_and_si128(g1, g16_mask);
902 __m128i g2 = _mm_srli_epi32(src_pixel2,
903 SK_G32_SHIFT + (8 - SK_G16_BITS));
904 g2 = _mm_and_si128(g2, g16_mask);
905 __m128i g = _mm_packs_epi32(g1, g2);
906
907 // Calculate result b.
908 __m128i b1 = _mm_srli_epi32(src_pixel1,
909 SK_B32_SHIFT + (8 - SK_B16_BITS));
910 b1 = _mm_and_si128(b1, b16_mask);
911 __m128i b2 = _mm_srli_epi32(src_pixel2,
912 SK_B32_SHIFT + (8 - SK_B16_BITS));
913 b2 = _mm_and_si128(b2, b16_mask);
914 __m128i b = _mm_packs_epi32(b1, b2);
915
916 // Store 8 16-bit colors in dst.
917 __m128i d_pixel = SkPackRGB16_SSE(r, g, b);
918 _mm_store_si128(d++, d_pixel);
919 count -= 8;
920 }
921 src = reinterpret_cast<const SkPMColor*>(s);
922 dst = reinterpret_cast<uint16_t*>(d);
923 }
924
925 if (count > 0) {
926 do {
927 SkPMColor c = *src++;
928 SkPMColorAssert(c);
929 *dst++ = SkPixel32ToPixel16_ToU16(c);
930 } while (--count != 0);
931 }
932}
933
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000934/* SSE2 version of S32A_D565_Opaque()
935 * portable version is in core/SkBlitRow_D16.cpp
936 */
937void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
938 const SkPMColor* SK_RESTRICT src,
939 int count, U8CPU alpha, int /*x*/, int /*y*/) {
940 SkASSERT(255 == alpha);
941
942 if (count <= 0) {
943 return;
944 }
945
946 if (count >= 8) {
947 // Make dst 16 bytes alignment
948 while (((size_t)dst & 0x0F) != 0) {
949 SkPMColor c = *src++;
950 if (c) {
951 *dst = SkSrcOver32To16(c, *dst);
952 }
953 dst += 1;
954 count--;
955 }
956
957 const __m128i* s = reinterpret_cast<const __m128i*>(src);
958 __m128i* d = reinterpret_cast<__m128i*>(dst);
959 __m128i var255 = _mm_set1_epi16(255);
960 __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
961 __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
962 __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
963
964 while (count >= 8) {
965 // Load 8 pixels of src.
966 __m128i src_pixel1 = _mm_loadu_si128(s++);
967 __m128i src_pixel2 = _mm_loadu_si128(s++);
968
969 // Check whether src pixels are equal to 0 and get the highest bit
970 // of each byte of result, if src pixels are all zero, src_cmp1 and
971 // src_cmp2 will be 0xFFFF.
972 int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
973 _mm_setzero_si128()));
974 int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
975 _mm_setzero_si128()));
976 if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
977 d++;
978 count -= 8;
979 continue;
980 }
981
982 // Load 8 pixels of dst.
983 __m128i dst_pixel = _mm_load_si128(d);
984
985 // Extract A from src.
986 __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT));
987 sa1 = _mm_srli_epi32(sa1, 24);
988 __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT));
989 sa2 = _mm_srli_epi32(sa2, 24);
990 __m128i sa = _mm_packs_epi32(sa1, sa2);
991
992 // Extract R from src.
993 __m128i sr1 = _mm_slli_epi32(src_pixel1,(24 - SK_R32_SHIFT));
994 sr1 = _mm_srli_epi32(sr1, 24);
995 __m128i sr2 = _mm_slli_epi32(src_pixel2,(24 - SK_R32_SHIFT));
996 sr2 = _mm_srli_epi32(sr2, 24);
997 __m128i sr = _mm_packs_epi32(sr1, sr2);
998
999 // Extract G from src.
1000 __m128i sg1 = _mm_slli_epi32(src_pixel1,(24 - SK_G32_SHIFT));
1001 sg1 = _mm_srli_epi32(sg1, 24);
1002 __m128i sg2 = _mm_slli_epi32(src_pixel2,(24 - SK_G32_SHIFT));
1003 sg2 = _mm_srli_epi32(sg2, 24);
1004 __m128i sg = _mm_packs_epi32(sg1, sg2);
1005
1006 // Extract B from src.
1007 __m128i sb1 = _mm_slli_epi32(src_pixel1,(24 - SK_B32_SHIFT));
1008 sb1 = _mm_srli_epi32(sb1, 24);
1009 __m128i sb2 = _mm_slli_epi32(src_pixel2,(24 - SK_B32_SHIFT));
1010 sb2 = _mm_srli_epi32(sb2, 24);
1011 __m128i sb = _mm_packs_epi32(sb1, sb2);
1012
1013 // Extract R G B from dst.
1014 __m128i dr = _mm_srli_epi16(dst_pixel,SK_R16_SHIFT);
1015 dr = _mm_and_si128(dr, r16_mask);
1016 __m128i dg = _mm_srli_epi16(dst_pixel,SK_G16_SHIFT);
1017 dg = _mm_and_si128(dg, g16_mask);
1018 __m128i db = _mm_srli_epi16(dst_pixel,SK_B16_SHIFT);
1019 db = _mm_and_si128(db, b16_mask);
1020
1021 __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
1022
1023 // Calculate R G B of result.
1024 // Original algorithm is in SkSrcOver32To16().
1025 dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE(dr, isa, SK_R16_BITS));
1026 dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
1027 dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE(dg, isa, SK_G16_BITS));
1028 dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
1029 db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE(db, isa, SK_B16_BITS));
1030 db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
1031
1032 // Pack R G B into 16-bit color.
1033 __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db);
1034
1035 // Store 8 16-bit colors in dst.
1036 _mm_store_si128(d++, d_pixel);
1037 count -= 8;
1038 }
1039
1040 src = reinterpret_cast<const SkPMColor*>(s);
1041 dst = reinterpret_cast<uint16_t*>(d);
1042 }
1043
1044 if (count > 0) {
1045 do {
1046 SkPMColor c = *src++;
1047 SkPMColorAssert(c);
1048 if (c) {
1049 *dst = SkSrcOver32To16(c, *dst);
1050 }
1051 dst += 1;
1052 } while (--count != 0);
1053 }
1054}
commit-bot@chromium.org27580472014-03-07 03:25:32 +00001055
1056void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
1057 const SkPMColor* SK_RESTRICT src,
1058 int count, U8CPU alpha, int x, int y) {
1059 SkASSERT(255 == alpha);
1060
1061 if (count <= 0) {
1062 return;
1063 }
1064
1065 if (count >= 8) {
1066 while (((size_t)dst & 0x0F) != 0) {
1067 DITHER_565_SCAN(y);
1068 SkPMColor c = *src++;
1069 SkPMColorAssert(c);
1070
1071 unsigned dither = DITHER_VALUE(x);
1072 *dst++ = SkDitherRGB32To565(c, dither);
1073 DITHER_INC_X(x);
1074 count--;
1075 }
1076
1077 unsigned short dither_value[8];
1078 __m128i dither;
1079#ifdef ENABLE_DITHER_MATRIX_4X4
1080 const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
1081 dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
1082 dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
1083 dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
1084 dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
1085#else
1086 const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
1087 dither_value[0] = dither_value[4] = (dither_scan
1088 >> (((x) & 3) << 2)) & 0xF;
1089 dither_value[1] = dither_value[5] = (dither_scan
1090 >> (((x + 1) & 3) << 2)) & 0xF;
1091 dither_value[2] = dither_value[6] = (dither_scan
1092 >> (((x + 2) & 3) << 2)) & 0xF;
1093 dither_value[3] = dither_value[7] = (dither_scan
1094 >> (((x + 3) & 3) << 2)) & 0xF;
1095#endif
1096 dither = _mm_loadu_si128((__m128i*) dither_value);
1097
1098 const __m128i* s = reinterpret_cast<const __m128i*>(src);
1099 __m128i* d = reinterpret_cast<__m128i*>(dst);
1100
1101 while (count >= 8) {
1102 // Load 8 pixels of src.
1103 __m128i src_pixel1 = _mm_loadu_si128(s++);
1104 __m128i src_pixel2 = _mm_loadu_si128(s++);
1105
1106 // Extract R from src.
1107 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
1108 sr1 = _mm_srli_epi32(sr1, 24);
1109 __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
1110 sr2 = _mm_srli_epi32(sr2, 24);
1111 __m128i sr = _mm_packs_epi32(sr1, sr2);
1112
1113 // SkDITHER_R32To565(sr, dither)
1114 __m128i sr_offset = _mm_srli_epi16(sr, 5);
1115 sr = _mm_add_epi16(sr, dither);
1116 sr = _mm_sub_epi16(sr, sr_offset);
1117 sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
1118
1119 // Extract G from src.
1120 __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
1121 sg1 = _mm_srli_epi32(sg1, 24);
1122 __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
1123 sg2 = _mm_srli_epi32(sg2, 24);
1124 __m128i sg = _mm_packs_epi32(sg1, sg2);
1125
1126 // SkDITHER_R32To565(sg, dither)
1127 __m128i sg_offset = _mm_srli_epi16(sg, 6);
1128 sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
1129 sg = _mm_sub_epi16(sg, sg_offset);
1130 sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
1131
1132 // Extract B from src.
1133 __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
1134 sb1 = _mm_srli_epi32(sb1, 24);
1135 __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
1136 sb2 = _mm_srli_epi32(sb2, 24);
1137 __m128i sb = _mm_packs_epi32(sb1, sb2);
1138
1139 // SkDITHER_R32To565(sb, dither)
1140 __m128i sb_offset = _mm_srli_epi16(sb, 5);
1141 sb = _mm_add_epi16(sb, dither);
1142 sb = _mm_sub_epi16(sb, sb_offset);
1143 sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
1144
1145 // Pack and store 16-bit dst pixel.
1146 __m128i d_pixel = SkPackRGB16_SSE(sr, sg, sb);
1147 _mm_store_si128(d++, d_pixel);
1148
1149 count -= 8;
1150 x += 8;
1151 }
1152
1153 src = reinterpret_cast<const SkPMColor*>(s);
1154 dst = reinterpret_cast<uint16_t*>(d);
1155 }
1156
1157 if (count > 0) {
1158 DITHER_565_SCAN(y);
1159 do {
1160 SkPMColor c = *src++;
1161 SkPMColorAssert(c);
1162
1163 unsigned dither = DITHER_VALUE(x);
1164 *dst++ = SkDitherRGB32To565(c, dither);
1165 DITHER_INC_X(x);
1166 } while (--count != 0);
1167 }
1168}
commit-bot@chromium.orgfe089b32014-03-07 13:24:42 +00001169
1170/* SSE2 version of S32A_D565_Opaque_Dither()
1171 * portable version is in core/SkBlitRow_D16.cpp
1172 */
1173void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
1174 const SkPMColor* SK_RESTRICT src,
1175 int count, U8CPU alpha, int x, int y) {
1176 SkASSERT(255 == alpha);
1177
1178 if (count <= 0) {
1179 return;
1180 }
1181
1182 if (count >= 8) {
1183 while (((size_t)dst & 0x0F) != 0) {
1184 DITHER_565_SCAN(y);
1185 SkPMColor c = *src++;
1186 SkPMColorAssert(c);
1187 if (c) {
1188 unsigned a = SkGetPackedA32(c);
1189
1190 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
1191
1192 unsigned sr = SkGetPackedR32(c);
1193 unsigned sg = SkGetPackedG32(c);
1194 unsigned sb = SkGetPackedB32(c);
1195 sr = SkDITHER_R32_FOR_565(sr, d);
1196 sg = SkDITHER_G32_FOR_565(sg, d);
1197 sb = SkDITHER_B32_FOR_565(sb, d);
1198
1199 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1200 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1201 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1202 // now src and dst expanded are in g:11 r:10 x:1 b:10
1203 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1204 }
1205 dst += 1;
1206 DITHER_INC_X(x);
1207 count--;
1208 }
1209
1210 unsigned short dither_value[8];
1211 __m128i dither, dither_cur;
1212#ifdef ENABLE_DITHER_MATRIX_4X4
1213 const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
1214 dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
1215 dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
1216 dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
1217 dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
1218#else
1219 const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
1220 dither_value[0] = dither_value[4] = (dither_scan
1221 >> (((x) & 3) << 2)) & 0xF;
1222 dither_value[1] = dither_value[5] = (dither_scan
1223 >> (((x + 1) & 3) << 2)) & 0xF;
1224 dither_value[2] = dither_value[6] = (dither_scan
1225 >> (((x + 2) & 3) << 2)) & 0xF;
1226 dither_value[3] = dither_value[7] = (dither_scan
1227 >> (((x + 3) & 3) << 2)) & 0xF;
1228#endif
1229 dither = _mm_loadu_si128((__m128i*) dither_value);
1230
1231 const __m128i* s = reinterpret_cast<const __m128i*>(src);
1232 __m128i* d = reinterpret_cast<__m128i*>(dst);
1233 __m128i var256 = _mm_set1_epi16(256);
1234 __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
1235 __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
1236 __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
1237
1238 while (count >= 8) {
1239 // Load 8 pixels of src and dst.
1240 __m128i src_pixel1 = _mm_loadu_si128(s++);
1241 __m128i src_pixel2 = _mm_loadu_si128(s++);
1242 __m128i dst_pixel = _mm_load_si128(d);
1243
1244 // Extract A from src.
1245 __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT));
1246 sa1 = _mm_srli_epi32(sa1, 24);
1247 __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT));
1248 sa2 = _mm_srli_epi32(sa2, 24);
1249 __m128i sa = _mm_packs_epi32(sa1, sa2);
1250
1251 // Calculate current dither value.
1252 dither_cur = _mm_mullo_epi16(dither,
1253 _mm_add_epi16(sa, _mm_set1_epi16(1)));
1254 dither_cur = _mm_srli_epi16(dither_cur, 8);
1255
1256 // Extract R from src.
1257 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
1258 sr1 = _mm_srli_epi32(sr1, 24);
1259 __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
1260 sr2 = _mm_srli_epi32(sr2, 24);
1261 __m128i sr = _mm_packs_epi32(sr1, sr2);
1262
1263 // SkDITHER_R32_FOR_565(sr, d)
1264 __m128i sr_offset = _mm_srli_epi16(sr, 5);
1265 sr = _mm_add_epi16(sr, dither_cur);
1266 sr = _mm_sub_epi16(sr, sr_offset);
1267
1268 // Expand sr.
1269 sr = _mm_slli_epi16(sr, 2);
1270
1271 // Extract G from src.
1272 __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
1273 sg1 = _mm_srli_epi32(sg1, 24);
1274 __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
1275 sg2 = _mm_srli_epi32(sg2, 24);
1276 __m128i sg = _mm_packs_epi32(sg1, sg2);
1277
1278 // sg = SkDITHER_G32_FOR_565(sg, d).
1279 __m128i sg_offset = _mm_srli_epi16(sg, 6);
1280 sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
1281 sg = _mm_sub_epi16(sg, sg_offset);
1282
1283 // Expand sg.
1284 sg = _mm_slli_epi16(sg, 3);
1285
1286 // Extract B from src.
1287 __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
1288 sb1 = _mm_srli_epi32(sb1, 24);
1289 __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
1290 sb2 = _mm_srli_epi32(sb2, 24);
1291 __m128i sb = _mm_packs_epi32(sb1, sb2);
1292
1293 // sb = SkDITHER_B32_FOR_565(sb, d).
1294 __m128i sb_offset = _mm_srli_epi16(sb, 5);
1295 sb = _mm_add_epi16(sb, dither_cur);
1296 sb = _mm_sub_epi16(sb, sb_offset);
1297
1298 // Expand sb.
1299 sb = _mm_slli_epi16(sb, 2);
1300
1301 // Extract R G B from dst.
1302 __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
1303 dr = _mm_and_si128(dr, r16_mask);
1304 __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
1305 dg = _mm_and_si128(dg, g16_mask);
1306 __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
1307 db = _mm_and_si128(db, b16_mask);
1308
1309 // SkAlpha255To256(255 - a) >> 3
1310 __m128i isa = _mm_sub_epi16(var256, sa);
1311 isa = _mm_srli_epi16(isa, 3);
1312
1313 dr = _mm_mullo_epi16(dr, isa);
1314 dr = _mm_add_epi16(dr, sr);
1315 dr = _mm_srli_epi16(dr, 5);
1316
1317 dg = _mm_mullo_epi16(dg, isa);
1318 dg = _mm_add_epi16(dg, sg);
1319 dg = _mm_srli_epi16(dg, 5);
1320
1321 db = _mm_mullo_epi16(db, isa);
1322 db = _mm_add_epi16(db, sb);
1323 db = _mm_srli_epi16(db, 5);
1324
1325 // Package and store dst pixel.
1326 __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db);
1327 _mm_store_si128(d++, d_pixel);
1328
1329 count -= 8;
1330 x += 8;
1331 }
1332
1333 src = reinterpret_cast<const SkPMColor*>(s);
1334 dst = reinterpret_cast<uint16_t*>(d);
1335 }
1336
1337 if (count > 0) {
1338 DITHER_565_SCAN(y);
1339 do {
1340 SkPMColor c = *src++;
1341 SkPMColorAssert(c);
1342 if (c) {
1343 unsigned a = SkGetPackedA32(c);
1344
1345 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
1346
1347 unsigned sr = SkGetPackedR32(c);
1348 unsigned sg = SkGetPackedG32(c);
1349 unsigned sb = SkGetPackedB32(c);
1350 sr = SkDITHER_R32_FOR_565(sr, d);
1351 sg = SkDITHER_G32_FOR_565(sg, d);
1352 sb = SkDITHER_B32_FOR_565(sb, d);
1353
1354 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1355 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1356 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1357 // now src and dst expanded are in g:11 r:10 x:1 b:10
1358 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1359 }
1360 dst += 1;
1361 DITHER_INC_X(x);
1362 } while (--count != 0);
1363 }
1364}