/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

commit-bot@chromium.org8c4953c2014-04-30 14:58:46 +00008#include <emmintrin.h>
caryclark@google.com83ecdc32012-06-06 12:10:26 +00009#include "SkBitmapProcState_opts_SSE2.h"
commit-bot@chromium.org8c4953c2014-04-30 14:58:46 +000010#include "SkBlitRow_opts_SSE2.h"
senorblanco@chromium.org92727612009-11-04 20:51:06 +000011#include "SkColorPriv.h"
commit-bot@chromium.org47591072014-02-19 03:09:52 +000012#include "SkColor_opts_SSE2.h"
commit-bot@chromium.org27580472014-03-07 03:25:32 +000013#include "SkDither.h"
mtklein1059b1f2016-02-03 07:25:02 -080014#include "SkMSAN.h"
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +000015#include "SkUtils.h"
senorblanco@chromium.org92727612009-11-04 20:51:06 +000016
senorblanco@chromium.org92727612009-11-04 20:51:06 +000017/* SSE2 version of S32_Blend_BlitRow32()
18 * portable version is in core/SkBlitRow_D32.cpp
19 */
senorblanco@chromium.org4e753552009-11-16 21:09:00 +000020void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
21 const SkPMColor* SK_RESTRICT src,
22 int count, U8CPU alpha) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +000023 SkASSERT(alpha <= 255);
24 if (count <= 0) {
25 return;
26 }
27
28 uint32_t src_scale = SkAlpha255To256(alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000029
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000030 if (count >= 4) {
31 SkASSERT(((size_t)dst & 0x03) == 0);
32 while (((size_t)dst & 0x0F) != 0) {
lsalzman40254c22016-08-05 11:48:45 -070033 *dst = SkPMLerp(*src, *dst, src_scale);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000034 src++;
35 dst++;
36 count--;
37 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +000038
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000039 const __m128i *s = reinterpret_cast<const __m128i*>(src);
40 __m128i *d = reinterpret_cast<__m128i*>(dst);
tomhudson@google.com98a5b422012-02-28 16:15:26 +000041
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000042 while (count >= 4) {
43 // Load 4 pixels each of src and dest.
44 __m128i src_pixel = _mm_loadu_si128(s);
45 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000046
lsalzman40254c22016-08-05 11:48:45 -070047 __m128i result = SkPMLerp_SSE2(src_pixel, dst_pixel, src_scale);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000048 _mm_store_si128(d, result);
49 s++;
50 d++;
51 count -= 4;
52 }
53 src = reinterpret_cast<const SkPMColor*>(s);
54 dst = reinterpret_cast<SkPMColor*>(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000055 }
56
senorblanco@chromium.org4e753552009-11-16 21:09:00 +000057 while (count > 0) {
lsalzman40254c22016-08-05 11:48:45 -070058 *dst = SkPMLerp(*src, *dst, src_scale);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000059 src++;
60 dst++;
61 count--;
62 }
63}
64
senorblanco@chromium.org4e753552009-11-16 21:09:00 +000065void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
66 const SkPMColor* SK_RESTRICT src,
67 int count, U8CPU alpha) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +000068 SkASSERT(alpha <= 255);
69 if (count <= 0) {
70 return;
71 }
72
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000073 if (count >= 4) {
74 while (((size_t)dst & 0x0F) != 0) {
75 *dst = SkBlendARGB32(*src, *dst, alpha);
76 src++;
77 dst++;
78 count--;
79 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +000080
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000081 const __m128i *s = reinterpret_cast<const __m128i*>(src);
82 __m128i *d = reinterpret_cast<__m128i*>(dst);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000083 while (count >= 4) {
84 // Load 4 pixels each of src and dest.
85 __m128i src_pixel = _mm_loadu_si128(s);
86 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000087
qiankun.miao2253aa92014-11-25 06:35:02 -080088 __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000089 _mm_store_si128(d, result);
90 s++;
91 d++;
92 count -= 4;
93 }
94 src = reinterpret_cast<const SkPMColor*>(s);
95 dst = reinterpret_cast<SkPMColor*>(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000096 }
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000097
senorblanco@chromium.org92727612009-11-04 20:51:06 +000098 while (count > 0) {
99 *dst = SkBlendARGB32(*src, *dst, alpha);
100 src++;
101 dst++;
102 count--;
103 }
104}
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000105
// The following (left) shifts cause the top 5 bits of the mask components to
// line up with the corresponding components in an SkPMColor.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// SkPacked{R,G,B}16x5ToUnmasked{R,G,B}32x5_SSE2(x) applies the shift above to
// every 32-bit lane of x. The shift amount is a compile-time constant that
// may be zero, positive or negative, so each macro is selected by the
// preprocessor: identity, logical left shift, or logical right shift by the
// negated amount. The result is "unmasked": bits of the other components are
// still present and must be ANDed away by the caller.
#if SK_R16x5_R32x5_SHIFT == 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#elif SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#endif

#if SK_G16x5_G32x5_SHIFT == 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#elif SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#endif

#if SK_B16x5_B32x5_SHIFT == 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#elif SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#endif

commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000137static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
138 __m128i &mask, __m128i &srcA) {
139 // In the following comments, the components of src, dst and mask are
140 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
141 // by an R, G, B, or A suffix. Components of one of the four pixels that
142 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
143 // example is the blue channel of the second destination pixel. Memory
144 // layout is shown for an ARGB byte order in a color value.
145
146 // src and srcA store 8-bit values interleaved with zeros.
147 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
148 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
149 // srcA, 0, srcA, 0, srcA, 0, srcA, 0)
150 // mask stores 16-bit values (compressed three channels) interleaved with zeros.
151 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
152 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
153 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
154
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000155 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000156 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000157 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
158 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000159
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000160 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000161 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
162 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000163
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000164 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000165 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
166 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000167
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000168 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000169 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
170 // 8-bit position
171 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
172 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000173 mask = _mm_or_si128(_mm_or_si128(r, g), b);
174
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000175 // Interleave R,G,B into the lower byte of word.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000176 // i.e. split the sixteen 8-bit values from mask into two sets of eight
177 // 16-bit values, padded by zero.
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000178 __m128i maskLo, maskHi;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000179 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000180 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000181 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000182 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
183
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000184 // Upscale from 0..31 to 0..32
185 // (allows to replace division by left-shift further down)
186 // Left-shift each component by 4 and add the result back to that component,
187 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000188 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
189 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
190
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000191 // Multiply each component of maskLo and maskHi by srcA
192 maskLo = _mm_mullo_epi16(maskLo, srcA);
193 maskHi = _mm_mullo_epi16(maskHi, srcA);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000194
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000195 // Left shift mask components by 8 (divide by 256)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000196 maskLo = _mm_srli_epi16(maskLo, 8);
197 maskHi = _mm_srli_epi16(maskHi, 8);
198
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000199 // Interleave R,G,B into the lower byte of the word
200 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000201 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000202 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000203 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
204
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000205 // mask = (src - dst) * mask
206 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
207 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000208
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000209 // mask = (src - dst) * mask >> 5
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000210 maskLo = _mm_srai_epi16(maskLo, 5);
211 maskHi = _mm_srai_epi16(maskHi, 5);
212
213 // Add two pixels into result.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000214 // result = dst + ((src - dst) * mask >> 5)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000215 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
216 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
217
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000218 // Pack into 4 32bit dst pixels.
219 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
220 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
221 // clamping to 255 if necessary.
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000222 return _mm_packus_epi16(resultLo, resultHi);
223}
224
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000225static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000226 __m128i &mask) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000227 // In the following comments, the components of src, dst and mask are
228 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
229 // by an R, G, B, or A suffix. Components of one of the four pixels that
230 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
231 // example is the blue channel of the second destination pixel. Memory
232 // layout is shown for an ARGB byte order in a color value.
233
234 // src and srcA store 8-bit values interleaved with zeros.
235 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
236 // mask stores 16-bit values (shown as high and low bytes) interleaved with
237 // zeros
238 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
239 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
240
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000241 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000242 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000243 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
244 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000245
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000246 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000247 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
248 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000249
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000250 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000251 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
252 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000253
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000254 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000255 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
256 // 8-bit position
257 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
258 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000259 mask = _mm_or_si128(_mm_or_si128(r, g), b);
260
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000261 // Interleave R,G,B into the lower byte of word.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000262 // i.e. split the sixteen 8-bit values from mask into two sets of eight
263 // 16-bit values, padded by zero.
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000264 __m128i maskLo, maskHi;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000265 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000266 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000267 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000268 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
269
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000270 // Upscale from 0..31 to 0..32
271 // (allows to replace division by left-shift further down)
272 // Left-shift each component by 4 and add the result back to that component,
273 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000274 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
275 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
276
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000277 // Interleave R,G,B into the lower byte of the word
278 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000279 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000280 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000281 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
282
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000283 // mask = (src - dst) * mask
284 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
285 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000286
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000287 // mask = (src - dst) * mask >> 5
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000288 maskLo = _mm_srai_epi16(maskLo, 5);
289 maskHi = _mm_srai_epi16(maskHi, 5);
290
291 // Add two pixels into result.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000292 // result = dst + ((src - dst) * mask >> 5)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000293 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
294 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
295
bungeman@google.com27123cd2012-08-21 19:25:42 +0000296 // Pack into 4 32bit dst pixels and force opaque.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000297 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
298 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
299 // clamping to 255 if necessary. Set alpha components to 0xFF.
bungeman@google.com27123cd2012-08-21 19:25:42 +0000300 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
301 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000302}
303
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000304void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
305 SkColor src, int width, SkPMColor) {
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000306 if (width <= 0) {
307 return;
308 }
309
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000310 int srcA = SkColorGetA(src);
311 int srcR = SkColorGetR(src);
312 int srcG = SkColorGetG(src);
313 int srcB = SkColorGetB(src);
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000314
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000315 srcA = SkAlpha255To256(srcA);
316
317 if (width >= 4) {
318 SkASSERT(((size_t)dst & 0x03) == 0);
319 while (((size_t)dst & 0x0F) != 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000320 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
321 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000322 dst++;
323 width--;
324 }
325
326 __m128i *d = reinterpret_cast<__m128i*>(dst);
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000327 // Set alpha to 0xFF and replicate source four times in SSE register.
328 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
329 // Interleave with zeros to get two sets of four 16-bit values.
330 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
331 // Set srcA_sse to contain eight copies of srcA, padded with zero.
332 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
333 __m128i srcA_sse = _mm_set1_epi16(srcA);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000334 while (width >= 4) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000335 // Load four destination pixels into dst_sse.
336 __m128i dst_sse = _mm_load_si128(d);
337 // Load four 16-bit masks into lower half of mask_sse.
338 __m128i mask_sse = _mm_loadl_epi64(
339 reinterpret_cast<const __m128i*>(mask));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000340
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000341 // Check whether masks are equal to 0 and get the highest bit
342 // of each byte of result, if masks are all zero, we will get
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000343 // pack_cmp to 0xFFFF
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000344 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000345 _mm_setzero_si128()));
346
347 // if mask pixels are not all zero, we will blend the dst pixels
348 if (pack_cmp != 0xFFFF) {
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000349 // Unpack 4 16bit mask pixels to
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000350 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
351 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
352 mask_sse = _mm_unpacklo_epi16(mask_sse,
353 _mm_setzero_si128());
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000354
355 // Process 4 32bit dst pixels
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000356 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
357 mask_sse, srcA_sse);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000358 _mm_store_si128(d, result);
359 }
360
361 d++;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000362 mask += 4;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000363 width -= 4;
364 }
365
366 dst = reinterpret_cast<SkPMColor*>(d);
367 }
368
369 while (width > 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000370 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
371 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000372 dst++;
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000373 width--;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000374 }
375}
376
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000377void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
378 SkColor src, int width, SkPMColor opaqueDst) {
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000379 if (width <= 0) {
380 return;
381 }
382
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000383 int srcR = SkColorGetR(src);
384 int srcG = SkColorGetG(src);
385 int srcB = SkColorGetB(src);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000386
387 if (width >= 4) {
388 SkASSERT(((size_t)dst & 0x03) == 0);
389 while (((size_t)dst & 0x0F) != 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000390 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
391 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000392 dst++;
393 width--;
394 }
395
396 __m128i *d = reinterpret_cast<__m128i*>(dst);
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000397 // Set alpha to 0xFF and replicate source four times in SSE register.
398 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
399 // Set srcA_sse to contain eight copies of srcA, padded with zero.
400 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
401 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000402 while (width >= 4) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000403 // Load four destination pixels into dst_sse.
404 __m128i dst_sse = _mm_load_si128(d);
405 // Load four 16-bit masks into lower half of mask_sse.
406 __m128i mask_sse = _mm_loadl_epi64(
407 reinterpret_cast<const __m128i*>(mask));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000408
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000409 // Check whether masks are equal to 0 and get the highest bit
410 // of each byte of result, if masks are all zero, we will get
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000411 // pack_cmp to 0xFFFF
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000412 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000413 _mm_setzero_si128()));
414
415 // if mask pixels are not all zero, we will blend the dst pixels
416 if (pack_cmp != 0xFFFF) {
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000417 // Unpack 4 16bit mask pixels to
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000418 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
419 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
420 mask_sse = _mm_unpacklo_epi16(mask_sse,
421 _mm_setzero_si128());
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000422
423 // Process 4 32bit dst pixels
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000424 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
425 mask_sse);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000426 _mm_store_si128(d, result);
427 }
428
429 d++;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000430 mask += 4;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000431 width -= 4;
432 }
433
434 dst = reinterpret_cast<SkPMColor*>(d);
435 }
436
437 while (width > 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000438 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
439 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000440 dst++;
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000441 width--;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000442 }
443}