blob: 7ce1fc9a80995eb5363d32161b9ed17e97d5a41d [file] [log] [blame]
senorblanco@chromium.org92727612009-11-04 20:51:06 +00001/*
tomhudson@google.com98a5b422012-02-28 16:15:26 +00002 * Copyright 2012 The Android Open Source Project
epoger@google.comec3ed6a2011-07-28 14:26:00 +00003 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
senorblanco@chromium.org92727612009-11-04 20:51:06 +00006 */
7
commit-bot@chromium.org8c4953c2014-04-30 14:58:46 +00008#include <emmintrin.h>
caryclark@google.com83ecdc32012-06-06 12:10:26 +00009#include "SkBitmapProcState_opts_SSE2.h"
commit-bot@chromium.org8c4953c2014-04-30 14:58:46 +000010#include "SkBlitRow_opts_SSE2.h"
senorblanco@chromium.org92727612009-11-04 20:51:06 +000011#include "SkColorPriv.h"
commit-bot@chromium.org47591072014-02-19 03:09:52 +000012#include "SkColor_opts_SSE2.h"
commit-bot@chromium.org27580472014-03-07 03:25:32 +000013#include "SkDither.h"
mtklein1059b1f2016-02-03 07:25:02 -080014#include "SkMSAN.h"
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +000015#include "SkUtils.h"
senorblanco@chromium.org92727612009-11-04 20:51:06 +000016
senorblanco@chromium.org92727612009-11-04 20:51:06 +000017/* SSE2 version of S32_Blend_BlitRow32()
18 * portable version is in core/SkBlitRow_D32.cpp
19 */
senorblanco@chromium.org4e753552009-11-16 21:09:00 +000020void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
21 const SkPMColor* SK_RESTRICT src,
22 int count, U8CPU alpha) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +000023 SkASSERT(alpha <= 255);
24 if (count <= 0) {
25 return;
26 }
27
28 uint32_t src_scale = SkAlpha255To256(alpha);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000029
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000030 if (count >= 4) {
31 SkASSERT(((size_t)dst & 0x03) == 0);
32 while (((size_t)dst & 0x0F) != 0) {
lsalzman40254c22016-08-05 11:48:45 -070033 *dst = SkPMLerp(*src, *dst, src_scale);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000034 src++;
35 dst++;
36 count--;
37 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +000038
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000039 const __m128i *s = reinterpret_cast<const __m128i*>(src);
40 __m128i *d = reinterpret_cast<__m128i*>(dst);
tomhudson@google.com98a5b422012-02-28 16:15:26 +000041
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000042 while (count >= 4) {
43 // Load 4 pixels each of src and dest.
44 __m128i src_pixel = _mm_loadu_si128(s);
45 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000046
lsalzman40254c22016-08-05 11:48:45 -070047 __m128i result = SkPMLerp_SSE2(src_pixel, dst_pixel, src_scale);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000048 _mm_store_si128(d, result);
49 s++;
50 d++;
51 count -= 4;
52 }
53 src = reinterpret_cast<const SkPMColor*>(s);
54 dst = reinterpret_cast<SkPMColor*>(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000055 }
56
senorblanco@chromium.org4e753552009-11-16 21:09:00 +000057 while (count > 0) {
lsalzman40254c22016-08-05 11:48:45 -070058 *dst = SkPMLerp(*src, *dst, src_scale);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000059 src++;
60 dst++;
61 count--;
62 }
63}
64
senorblanco@chromium.org4e753552009-11-16 21:09:00 +000065void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
66 const SkPMColor* SK_RESTRICT src,
67 int count, U8CPU alpha) {
senorblanco@chromium.org92727612009-11-04 20:51:06 +000068 SkASSERT(alpha <= 255);
69 if (count <= 0) {
70 return;
71 }
72
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000073 if (count >= 4) {
74 while (((size_t)dst & 0x0F) != 0) {
75 *dst = SkBlendARGB32(*src, *dst, alpha);
76 src++;
77 dst++;
78 count--;
79 }
senorblanco@chromium.org92727612009-11-04 20:51:06 +000080
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000081 const __m128i *s = reinterpret_cast<const __m128i*>(src);
82 __m128i *d = reinterpret_cast<__m128i*>(dst);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000083 while (count >= 4) {
84 // Load 4 pixels each of src and dest.
85 __m128i src_pixel = _mm_loadu_si128(s);
86 __m128i dst_pixel = _mm_load_si128(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000087
qiankun.miao2253aa92014-11-25 06:35:02 -080088 __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000089 _mm_store_si128(d, result);
90 s++;
91 d++;
92 count -= 4;
93 }
94 src = reinterpret_cast<const SkPMColor*>(s);
95 dst = reinterpret_cast<SkPMColor*>(d);
senorblanco@chromium.org92727612009-11-04 20:51:06 +000096 }
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000097
senorblanco@chromium.org92727612009-11-04 20:51:06 +000098 while (count > 0) {
99 *dst = SkBlendARGB32(*src, *dst, alpha);
100 src++;
101 dst++;
102 count--;
103 }
104}
senorblanco@chromium.orgc3856382010-12-13 15:27:20 +0000105
henrik.smiding70840cb2015-03-20 09:20:46 -0700106void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) {
107 SkASSERT(count > 0);
108
109 uint32_t src_expand = (SkGetPackedG32(src) << 24) |
110 (SkGetPackedR32(src) << 13) |
111 (SkGetPackedB32(src) << 2);
112 unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
113
114 // Check if we have enough pixels to run SIMD
115 if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
116 __m128i* dst_wide;
117 const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2);
118 const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3);
119 const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2);
120 const __m128i scale_wide = _mm_set1_epi16(scale);
121 const __m128i mask_blue = _mm_set1_epi16(SK_B16_MASK);
122 const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT);
123
124 // Align dst to an even 16 byte address (0-7 pixels)
125 while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
126 *dst = SkBlend32_RGB16(src_expand, *dst, scale);
127 dst += 1;
128 count--;
129 }
130
131 dst_wide = reinterpret_cast<__m128i*>(dst);
132 do {
133 // Load eight RGB565 pixels
134 __m128i pixels = _mm_load_si128(dst_wide);
135
136 // Mask out sub-pixels
137 __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT);
138 __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS);
139 pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS);
140 __m128i pixel_B = _mm_and_si128(pixels, mask_blue);
141
142 // Scale with alpha
143 pixel_R = _mm_mullo_epi16(pixel_R, scale_wide);
144 pixel_G = _mm_mullo_epi16(pixel_G, scale_wide);
145 pixel_B = _mm_mullo_epi16(pixel_B, scale_wide);
146
147 // Add src_X_wide and shift down again
148 pixel_R = _mm_add_epi16(pixel_R, src_R_wide);
149 pixel_R = _mm_srli_epi16(pixel_R, 5);
150 pixel_G = _mm_add_epi16(pixel_G, src_G_wide);
151 pixel_B = _mm_add_epi16(pixel_B, src_B_wide);
152 pixel_B = _mm_srli_epi16(pixel_B, 5);
153
154 // Combine into RGB565 and store
155 pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT);
156 pixel_G = _mm_and_si128(pixel_G, mask_green);
157 pixels = _mm_or_si128(pixel_R, pixel_G);
158 pixels = _mm_or_si128(pixels, pixel_B);
159 _mm_store_si128(dst_wide, pixels);
160 count -= 8;
161 dst_wide++;
162 } while (count >= 8);
163
164 dst = reinterpret_cast<uint16_t*>(dst_wide);
165 }
166
167 // Small loop to handle remaining pixels.
168 while (count > 0) {
169 *dst = SkBlend32_RGB16(src_expand, *dst, scale);
170 dst += 1;
171 count--;
172 }
173}
174
// Shift amounts that move the top 5 bits of each RGB16 mask component onto
// the position of the matching component in an SkPMColor. A positive value
// means a left shift, a negative value a right shift by its magnitude.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Each helper applies the shift above to all four 32-bit lanes of x. The
// result is not masked; callers isolate the 5-bit field themselves.
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
205
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000206static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
207 __m128i &mask, __m128i &srcA) {
208 // In the following comments, the components of src, dst and mask are
209 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
210 // by an R, G, B, or A suffix. Components of one of the four pixels that
211 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
212 // example is the blue channel of the second destination pixel. Memory
213 // layout is shown for an ARGB byte order in a color value.
214
215 // src and srcA store 8-bit values interleaved with zeros.
216 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
217 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
218 // srcA, 0, srcA, 0, srcA, 0, srcA, 0)
219 // mask stores 16-bit values (compressed three channels) interleaved with zeros.
220 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
221 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
222 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
223
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000224 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000225 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000226 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
227 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000228
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000229 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000230 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
231 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000232
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000233 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000234 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
235 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000236
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000237 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000238 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
239 // 8-bit position
240 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
241 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000242 mask = _mm_or_si128(_mm_or_si128(r, g), b);
243
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000244 // Interleave R,G,B into the lower byte of word.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000245 // i.e. split the sixteen 8-bit values from mask into two sets of eight
246 // 16-bit values, padded by zero.
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000247 __m128i maskLo, maskHi;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000248 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000249 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000250 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000251 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
252
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000253 // Upscale from 0..31 to 0..32
254 // (allows to replace division by left-shift further down)
255 // Left-shift each component by 4 and add the result back to that component,
256 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000257 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
258 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
259
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000260 // Multiply each component of maskLo and maskHi by srcA
261 maskLo = _mm_mullo_epi16(maskLo, srcA);
262 maskHi = _mm_mullo_epi16(maskHi, srcA);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000263
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000264 // Left shift mask components by 8 (divide by 256)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000265 maskLo = _mm_srli_epi16(maskLo, 8);
266 maskHi = _mm_srli_epi16(maskHi, 8);
267
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000268 // Interleave R,G,B into the lower byte of the word
269 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000270 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000271 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000272 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
273
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000274 // mask = (src - dst) * mask
275 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
276 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000277
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000278 // mask = (src - dst) * mask >> 5
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000279 maskLo = _mm_srai_epi16(maskLo, 5);
280 maskHi = _mm_srai_epi16(maskHi, 5);
281
282 // Add two pixels into result.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000283 // result = dst + ((src - dst) * mask >> 5)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000284 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
285 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
286
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000287 // Pack into 4 32bit dst pixels.
288 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
289 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
290 // clamping to 255 if necessary.
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000291 return _mm_packus_epi16(resultLo, resultHi);
292}
293
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000294static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000295 __m128i &mask) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000296 // In the following comments, the components of src, dst and mask are
297 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
298 // by an R, G, B, or A suffix. Components of one of the four pixels that
299 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
300 // example is the blue channel of the second destination pixel. Memory
301 // layout is shown for an ARGB byte order in a color value.
302
303 // src and srcA store 8-bit values interleaved with zeros.
304 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
305 // mask stores 16-bit values (shown as high and low bytes) interleaved with
306 // zeros
307 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
308 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
309
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000310 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000311 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000312 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
313 _mm_set1_epi32(0x1F << SK_R32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000314
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000315 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000316 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
317 _mm_set1_epi32(0x1F << SK_G32_SHIFT));
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000318
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000319 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
bungeman@google.com8cd5ae72012-07-09 17:44:57 +0000320 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
321 _mm_set1_epi32(0x1F << SK_B32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000322
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000323 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000324 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
325 // 8-bit position
326 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
327 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000328 mask = _mm_or_si128(_mm_or_si128(r, g), b);
329
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000330 // Interleave R,G,B into the lower byte of word.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000331 // i.e. split the sixteen 8-bit values from mask into two sets of eight
332 // 16-bit values, padded by zero.
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000333 __m128i maskLo, maskHi;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000334 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000335 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000336 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000337 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
338
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000339 // Upscale from 0..31 to 0..32
340 // (allows to replace division by left-shift further down)
341 // Left-shift each component by 4 and add the result back to that component,
342 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000343 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
344 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
345
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000346 // Interleave R,G,B into the lower byte of the word
347 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000348 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000349 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000350 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
351
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000352 // mask = (src - dst) * mask
353 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
354 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000355
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000356 // mask = (src - dst) * mask >> 5
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000357 maskLo = _mm_srai_epi16(maskLo, 5);
358 maskHi = _mm_srai_epi16(maskHi, 5);
359
360 // Add two pixels into result.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000361 // result = dst + ((src - dst) * mask >> 5)
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000362 __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
363 __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
364
bungeman@google.com27123cd2012-08-21 19:25:42 +0000365 // Pack into 4 32bit dst pixels and force opaque.
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000366 // resultLo and resultHi contain eight 16-bit components (two pixels) each.
367 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
368 // clamping to 255 if necessary. Set alpha components to 0xFF.
bungeman@google.com27123cd2012-08-21 19:25:42 +0000369 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
370 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000371}
372
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000373void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
374 SkColor src, int width, SkPMColor) {
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000375 if (width <= 0) {
376 return;
377 }
378
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000379 int srcA = SkColorGetA(src);
380 int srcR = SkColorGetR(src);
381 int srcG = SkColorGetG(src);
382 int srcB = SkColorGetB(src);
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000383
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000384 srcA = SkAlpha255To256(srcA);
385
386 if (width >= 4) {
387 SkASSERT(((size_t)dst & 0x03) == 0);
388 while (((size_t)dst & 0x0F) != 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000389 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
390 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000391 dst++;
392 width--;
393 }
394
395 __m128i *d = reinterpret_cast<__m128i*>(dst);
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000396 // Set alpha to 0xFF and replicate source four times in SSE register.
397 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
398 // Interleave with zeros to get two sets of four 16-bit values.
399 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
400 // Set srcA_sse to contain eight copies of srcA, padded with zero.
401 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
402 __m128i srcA_sse = _mm_set1_epi16(srcA);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000403 while (width >= 4) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000404 // Load four destination pixels into dst_sse.
405 __m128i dst_sse = _mm_load_si128(d);
406 // Load four 16-bit masks into lower half of mask_sse.
407 __m128i mask_sse = _mm_loadl_epi64(
408 reinterpret_cast<const __m128i*>(mask));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000409
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000410 // Check whether masks are equal to 0 and get the highest bit
411 // of each byte of result, if masks are all zero, we will get
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000412 // pack_cmp to 0xFFFF
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000413 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000414 _mm_setzero_si128()));
415
416 // if mask pixels are not all zero, we will blend the dst pixels
417 if (pack_cmp != 0xFFFF) {
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000418 // Unpack 4 16bit mask pixels to
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000419 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
420 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
421 mask_sse = _mm_unpacklo_epi16(mask_sse,
422 _mm_setzero_si128());
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000423
424 // Process 4 32bit dst pixels
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000425 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
426 mask_sse, srcA_sse);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000427 _mm_store_si128(d, result);
428 }
429
430 d++;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000431 mask += 4;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000432 width -= 4;
433 }
434
435 dst = reinterpret_cast<SkPMColor*>(d);
436 }
437
438 while (width > 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000439 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
440 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000441 dst++;
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000442 width--;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000443 }
444}
445
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000446void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
447 SkColor src, int width, SkPMColor opaqueDst) {
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000448 if (width <= 0) {
449 return;
450 }
451
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000452 int srcR = SkColorGetR(src);
453 int srcG = SkColorGetG(src);
454 int srcB = SkColorGetB(src);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000455
456 if (width >= 4) {
457 SkASSERT(((size_t)dst & 0x03) == 0);
458 while (((size_t)dst & 0x0F) != 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000459 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
460 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000461 dst++;
462 width--;
463 }
464
465 __m128i *d = reinterpret_cast<__m128i*>(dst);
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000466 // Set alpha to 0xFF and replicate source four times in SSE register.
467 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
468 // Set srcA_sse to contain eight copies of srcA, padded with zero.
469 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
470 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000471 while (width >= 4) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000472 // Load four destination pixels into dst_sse.
473 __m128i dst_sse = _mm_load_si128(d);
474 // Load four 16-bit masks into lower half of mask_sse.
475 __m128i mask_sse = _mm_loadl_epi64(
476 reinterpret_cast<const __m128i*>(mask));
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000477
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000478 // Check whether masks are equal to 0 and get the highest bit
479 // of each byte of result, if masks are all zero, we will get
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000480 // pack_cmp to 0xFFFF
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000481 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000482 _mm_setzero_si128()));
483
484 // if mask pixels are not all zero, we will blend the dst pixels
485 if (pack_cmp != 0xFFFF) {
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000486 // Unpack 4 16bit mask pixels to
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000487 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
488 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
489 mask_sse = _mm_unpacklo_epi16(mask_sse,
490 _mm_setzero_si128());
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000491
492 // Process 4 32bit dst pixels
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000493 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
494 mask_sse);
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000495 _mm_store_si128(d, result);
496 }
497
498 d++;
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000499 mask += 4;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000500 width -= 4;
501 }
502
503 dst = reinterpret_cast<SkPMColor*>(d);
504 }
505
506 while (width > 0) {
commit-bot@chromium.org76e0d132013-07-02 17:40:19 +0000507 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
508 mask++;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000509 dst++;
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000510 width--;
tomhudson@google.comd6770e62012-02-14 16:01:15 +0000511 }
512}
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000513
commit-bot@chromium.org39ce33a2014-02-24 04:23:39 +0000514/* SSE2 version of S32_D565_Opaque()
515 * portable version is in core/SkBlitRow_D16.cpp
516 */
517void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
518 const SkPMColor* SK_RESTRICT src, int count,
519 U8CPU alpha, int /*x*/, int /*y*/) {
520 SkASSERT(255 == alpha);
521
522 if (count <= 0) {
523 return;
524 }
525
526 if (count >= 8) {
527 while (((size_t)dst & 0x0F) != 0) {
528 SkPMColor c = *src++;
529 SkPMColorAssert(c);
530
531 *dst++ = SkPixel32ToPixel16_ToU16(c);
532 count--;
533 }
534
535 const __m128i* s = reinterpret_cast<const __m128i*>(src);
536 __m128i* d = reinterpret_cast<__m128i*>(dst);
commit-bot@chromium.org39ce33a2014-02-24 04:23:39 +0000537
538 while (count >= 8) {
539 // Load 8 pixels of src.
540 __m128i src_pixel1 = _mm_loadu_si128(s++);
541 __m128i src_pixel2 = _mm_loadu_si128(s++);
542
qiankun.miao52e74c62014-11-24 06:59:44 -0800543 __m128i d_pixel = SkPixel32ToPixel16_ToU16_SSE2(src_pixel1, src_pixel2);
commit-bot@chromium.org39ce33a2014-02-24 04:23:39 +0000544 _mm_store_si128(d++, d_pixel);
545 count -= 8;
546 }
547 src = reinterpret_cast<const SkPMColor*>(s);
548 dst = reinterpret_cast<uint16_t*>(d);
549 }
550
551 if (count > 0) {
552 do {
553 SkPMColor c = *src++;
554 SkPMColorAssert(c);
555 *dst++ = SkPixel32ToPixel16_ToU16(c);
556 } while (--count != 0);
557 }
558}
559
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000560/* SSE2 version of S32A_D565_Opaque()
561 * portable version is in core/SkBlitRow_D16.cpp
562 */
563void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
564 const SkPMColor* SK_RESTRICT src,
565 int count, U8CPU alpha, int /*x*/, int /*y*/) {
566 SkASSERT(255 == alpha);
567
568 if (count <= 0) {
569 return;
570 }
571
572 if (count >= 8) {
573 // Make dst 16 bytes alignment
574 while (((size_t)dst & 0x0F) != 0) {
575 SkPMColor c = *src++;
576 if (c) {
577 *dst = SkSrcOver32To16(c, *dst);
578 }
579 dst += 1;
580 count--;
581 }
582
583 const __m128i* s = reinterpret_cast<const __m128i*>(src);
584 __m128i* d = reinterpret_cast<__m128i*>(dst);
585 __m128i var255 = _mm_set1_epi16(255);
586 __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
587 __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
588 __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
589
590 while (count >= 8) {
591 // Load 8 pixels of src.
592 __m128i src_pixel1 = _mm_loadu_si128(s++);
593 __m128i src_pixel2 = _mm_loadu_si128(s++);
594
595 // Check whether src pixels are equal to 0 and get the highest bit
596 // of each byte of result, if src pixels are all zero, src_cmp1 and
597 // src_cmp2 will be 0xFFFF.
598 int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
599 _mm_setzero_si128()));
600 int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
601 _mm_setzero_si128()));
602 if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
603 d++;
604 count -= 8;
605 continue;
606 }
607
608 // Load 8 pixels of dst.
609 __m128i dst_pixel = _mm_load_si128(d);
610
611 // Extract A from src.
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000612 __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000613 sa1 = _mm_srli_epi32(sa1, 24);
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000614 __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000615 sa2 = _mm_srli_epi32(sa2, 24);
616 __m128i sa = _mm_packs_epi32(sa1, sa2);
617
618 // Extract R from src.
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000619 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000620 sr1 = _mm_srli_epi32(sr1, 24);
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000621 __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000622 sr2 = _mm_srli_epi32(sr2, 24);
623 __m128i sr = _mm_packs_epi32(sr1, sr2);
624
625 // Extract G from src.
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000626 __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000627 sg1 = _mm_srli_epi32(sg1, 24);
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000628 __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000629 sg2 = _mm_srli_epi32(sg2, 24);
630 __m128i sg = _mm_packs_epi32(sg1, sg2);
631
632 // Extract B from src.
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000633 __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000634 sb1 = _mm_srli_epi32(sb1, 24);
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000635 __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000636 sb2 = _mm_srli_epi32(sb2, 24);
637 __m128i sb = _mm_packs_epi32(sb1, sb2);
638
639 // Extract R G B from dst.
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000640 __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000641 dr = _mm_and_si128(dr, r16_mask);
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000642 __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000643 dg = _mm_and_si128(dg, g16_mask);
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000644 __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000645 db = _mm_and_si128(db, b16_mask);
646
647 __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa
648
649 // Calculate R G B of result.
650 // Original algorithm is in SkSrcOver32To16().
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000651 dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000652 dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000653 dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000654 dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000655 db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000656 db = _mm_srli_epi16(db, 8 - SK_B16_BITS);
657
658 // Pack R G B into 16-bit color.
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000659 __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
commit-bot@chromium.org47591072014-02-19 03:09:52 +0000660
661 // Store 8 16-bit colors in dst.
662 _mm_store_si128(d++, d_pixel);
663 count -= 8;
664 }
665
666 src = reinterpret_cast<const SkPMColor*>(s);
667 dst = reinterpret_cast<uint16_t*>(d);
668 }
669
670 if (count > 0) {
671 do {
672 SkPMColor c = *src++;
673 SkPMColorAssert(c);
674 if (c) {
675 *dst = SkSrcOver32To16(c, *dst);
676 }
677 dst += 1;
678 } while (--count != 0);
679 }
680}
commit-bot@chromium.org27580472014-03-07 03:25:32 +0000681
682void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
683 const SkPMColor* SK_RESTRICT src,
684 int count, U8CPU alpha, int x, int y) {
685 SkASSERT(255 == alpha);
686
687 if (count <= 0) {
688 return;
689 }
690
691 if (count >= 8) {
692 while (((size_t)dst & 0x0F) != 0) {
693 DITHER_565_SCAN(y);
694 SkPMColor c = *src++;
695 SkPMColorAssert(c);
696
697 unsigned dither = DITHER_VALUE(x);
698 *dst++ = SkDitherRGB32To565(c, dither);
699 DITHER_INC_X(x);
700 count--;
701 }
702
703 unsigned short dither_value[8];
704 __m128i dither;
705#ifdef ENABLE_DITHER_MATRIX_4X4
706 const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
707 dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
708 dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
709 dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
710 dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
711#else
712 const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
713 dither_value[0] = dither_value[4] = (dither_scan
714 >> (((x) & 3) << 2)) & 0xF;
715 dither_value[1] = dither_value[5] = (dither_scan
716 >> (((x + 1) & 3) << 2)) & 0xF;
717 dither_value[2] = dither_value[6] = (dither_scan
718 >> (((x + 2) & 3) << 2)) & 0xF;
719 dither_value[3] = dither_value[7] = (dither_scan
720 >> (((x + 3) & 3) << 2)) & 0xF;
721#endif
722 dither = _mm_loadu_si128((__m128i*) dither_value);
723
724 const __m128i* s = reinterpret_cast<const __m128i*>(src);
725 __m128i* d = reinterpret_cast<__m128i*>(dst);
726
727 while (count >= 8) {
728 // Load 8 pixels of src.
729 __m128i src_pixel1 = _mm_loadu_si128(s++);
730 __m128i src_pixel2 = _mm_loadu_si128(s++);
731
732 // Extract R from src.
733 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
734 sr1 = _mm_srli_epi32(sr1, 24);
735 __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
736 sr2 = _mm_srli_epi32(sr2, 24);
737 __m128i sr = _mm_packs_epi32(sr1, sr2);
738
739 // SkDITHER_R32To565(sr, dither)
740 __m128i sr_offset = _mm_srli_epi16(sr, 5);
741 sr = _mm_add_epi16(sr, dither);
742 sr = _mm_sub_epi16(sr, sr_offset);
743 sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);
744
745 // Extract G from src.
746 __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
747 sg1 = _mm_srli_epi32(sg1, 24);
748 __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
749 sg2 = _mm_srli_epi32(sg2, 24);
750 __m128i sg = _mm_packs_epi32(sg1, sg2);
751
752 // SkDITHER_R32To565(sg, dither)
753 __m128i sg_offset = _mm_srli_epi16(sg, 6);
754 sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
755 sg = _mm_sub_epi16(sg, sg_offset);
756 sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);
757
758 // Extract B from src.
759 __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
760 sb1 = _mm_srli_epi32(sb1, 24);
761 __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
762 sb2 = _mm_srli_epi32(sb2, 24);
763 __m128i sb = _mm_packs_epi32(sb1, sb2);
764
765 // SkDITHER_R32To565(sb, dither)
766 __m128i sb_offset = _mm_srli_epi16(sb, 5);
767 sb = _mm_add_epi16(sb, dither);
768 sb = _mm_sub_epi16(sb, sb_offset);
769 sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);
770
771 // Pack and store 16-bit dst pixel.
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000772 __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
commit-bot@chromium.org27580472014-03-07 03:25:32 +0000773 _mm_store_si128(d++, d_pixel);
774
775 count -= 8;
776 x += 8;
777 }
778
779 src = reinterpret_cast<const SkPMColor*>(s);
780 dst = reinterpret_cast<uint16_t*>(d);
781 }
782
783 if (count > 0) {
784 DITHER_565_SCAN(y);
785 do {
786 SkPMColor c = *src++;
787 SkPMColorAssert(c);
788
789 unsigned dither = DITHER_VALUE(x);
790 *dst++ = SkDitherRGB32To565(c, dither);
791 DITHER_INC_X(x);
792 } while (--count != 0);
793 }
794}
commit-bot@chromium.orgfe089b32014-03-07 13:24:42 +0000795
796/* SSE2 version of S32A_D565_Opaque_Dither()
797 * portable version is in core/SkBlitRow_D16.cpp
798 */
799void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
800 const SkPMColor* SK_RESTRICT src,
801 int count, U8CPU alpha, int x, int y) {
802 SkASSERT(255 == alpha);
803
804 if (count <= 0) {
805 return;
806 }
807
808 if (count >= 8) {
809 while (((size_t)dst & 0x0F) != 0) {
810 DITHER_565_SCAN(y);
811 SkPMColor c = *src++;
812 SkPMColorAssert(c);
813 if (c) {
814 unsigned a = SkGetPackedA32(c);
815
816 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
817
818 unsigned sr = SkGetPackedR32(c);
819 unsigned sg = SkGetPackedG32(c);
820 unsigned sb = SkGetPackedB32(c);
821 sr = SkDITHER_R32_FOR_565(sr, d);
822 sg = SkDITHER_G32_FOR_565(sg, d);
823 sb = SkDITHER_B32_FOR_565(sb, d);
824
825 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
826 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
827 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
828 // now src and dst expanded are in g:11 r:10 x:1 b:10
829 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
830 }
831 dst += 1;
832 DITHER_INC_X(x);
833 count--;
834 }
835
836 unsigned short dither_value[8];
837 __m128i dither, dither_cur;
838#ifdef ENABLE_DITHER_MATRIX_4X4
839 const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
840 dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
841 dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
842 dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
843 dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
844#else
845 const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
846 dither_value[0] = dither_value[4] = (dither_scan
847 >> (((x) & 3) << 2)) & 0xF;
848 dither_value[1] = dither_value[5] = (dither_scan
849 >> (((x + 1) & 3) << 2)) & 0xF;
850 dither_value[2] = dither_value[6] = (dither_scan
851 >> (((x + 2) & 3) << 2)) & 0xF;
852 dither_value[3] = dither_value[7] = (dither_scan
853 >> (((x + 3) & 3) << 2)) & 0xF;
854#endif
855 dither = _mm_loadu_si128((__m128i*) dither_value);
856
857 const __m128i* s = reinterpret_cast<const __m128i*>(src);
858 __m128i* d = reinterpret_cast<__m128i*>(dst);
859 __m128i var256 = _mm_set1_epi16(256);
860 __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
861 __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
862 __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);
863
864 while (count >= 8) {
865 // Load 8 pixels of src and dst.
866 __m128i src_pixel1 = _mm_loadu_si128(s++);
867 __m128i src_pixel2 = _mm_loadu_si128(s++);
868 __m128i dst_pixel = _mm_load_si128(d);
869
870 // Extract A from src.
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000871 __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
commit-bot@chromium.orgfe089b32014-03-07 13:24:42 +0000872 sa1 = _mm_srli_epi32(sa1, 24);
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000873 __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
commit-bot@chromium.orgfe089b32014-03-07 13:24:42 +0000874 sa2 = _mm_srli_epi32(sa2, 24);
875 __m128i sa = _mm_packs_epi32(sa1, sa2);
876
877 // Calculate current dither value.
878 dither_cur = _mm_mullo_epi16(dither,
879 _mm_add_epi16(sa, _mm_set1_epi16(1)));
880 dither_cur = _mm_srli_epi16(dither_cur, 8);
881
882 // Extract R from src.
883 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
884 sr1 = _mm_srli_epi32(sr1, 24);
885 __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
886 sr2 = _mm_srli_epi32(sr2, 24);
887 __m128i sr = _mm_packs_epi32(sr1, sr2);
888
889 // SkDITHER_R32_FOR_565(sr, d)
890 __m128i sr_offset = _mm_srli_epi16(sr, 5);
891 sr = _mm_add_epi16(sr, dither_cur);
892 sr = _mm_sub_epi16(sr, sr_offset);
893
894 // Expand sr.
895 sr = _mm_slli_epi16(sr, 2);
896
897 // Extract G from src.
898 __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
899 sg1 = _mm_srli_epi32(sg1, 24);
900 __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
901 sg2 = _mm_srli_epi32(sg2, 24);
902 __m128i sg = _mm_packs_epi32(sg1, sg2);
903
904 // sg = SkDITHER_G32_FOR_565(sg, d).
905 __m128i sg_offset = _mm_srli_epi16(sg, 6);
906 sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
907 sg = _mm_sub_epi16(sg, sg_offset);
908
909 // Expand sg.
910 sg = _mm_slli_epi16(sg, 3);
911
912 // Extract B from src.
913 __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
914 sb1 = _mm_srli_epi32(sb1, 24);
915 __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
916 sb2 = _mm_srli_epi32(sb2, 24);
917 __m128i sb = _mm_packs_epi32(sb1, sb2);
918
919 // sb = SkDITHER_B32_FOR_565(sb, d).
920 __m128i sb_offset = _mm_srli_epi16(sb, 5);
921 sb = _mm_add_epi16(sb, dither_cur);
922 sb = _mm_sub_epi16(sb, sb_offset);
923
924 // Expand sb.
925 sb = _mm_slli_epi16(sb, 2);
926
927 // Extract R G B from dst.
928 __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
929 dr = _mm_and_si128(dr, r16_mask);
930 __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
931 dg = _mm_and_si128(dg, g16_mask);
932 __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
933 db = _mm_and_si128(db, b16_mask);
934
935 // SkAlpha255To256(255 - a) >> 3
936 __m128i isa = _mm_sub_epi16(var256, sa);
937 isa = _mm_srli_epi16(isa, 3);
938
939 dr = _mm_mullo_epi16(dr, isa);
940 dr = _mm_add_epi16(dr, sr);
941 dr = _mm_srli_epi16(dr, 5);
942
943 dg = _mm_mullo_epi16(dg, isa);
944 dg = _mm_add_epi16(dg, sg);
945 dg = _mm_srli_epi16(dg, 5);
946
947 db = _mm_mullo_epi16(db, isa);
948 db = _mm_add_epi16(db, sb);
949 db = _mm_srli_epi16(db, 5);
950
951 // Package and store dst pixel.
commit-bot@chromium.orgc524e982014-04-09 15:43:46 +0000952 __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
commit-bot@chromium.orgfe089b32014-03-07 13:24:42 +0000953 _mm_store_si128(d++, d_pixel);
954
955 count -= 8;
956 x += 8;
957 }
958
959 src = reinterpret_cast<const SkPMColor*>(s);
960 dst = reinterpret_cast<uint16_t*>(d);
961 }
962
963 if (count > 0) {
964 DITHER_565_SCAN(y);
965 do {
966 SkPMColor c = *src++;
967 SkPMColorAssert(c);
968 if (c) {
969 unsigned a = SkGetPackedA32(c);
970
971 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
972
973 unsigned sr = SkGetPackedR32(c);
974 unsigned sg = SkGetPackedG32(c);
975 unsigned sb = SkGetPackedB32(c);
976 sr = SkDITHER_R32_FOR_565(sr, d);
977 sg = SkDITHER_G32_FOR_565(sg, d);
978 sb = SkDITHER_B32_FOR_565(sb, d);
979
980 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
981 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
982 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
983 // now src and dst expanded are in g:11 r:10 x:1 b:10
984 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
985 }
986 dst += 1;
987 DITHER_INC_X(x);
988 } while (--count != 0);
989 }
990}