senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 1 | /* |
tomhudson@google.com | 98a5b42 | 2012-02-28 16:15:26 +0000 | [diff] [blame] | 2 | * Copyright 2012 The Android Open Source Project |
epoger@google.com | ec3ed6a | 2011-07-28 14:26:00 +0000 | [diff] [blame] | 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license that can be |
| 5 | * found in the LICENSE file. |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 6 | */ |
| 7 | |
epoger@google.com | ec3ed6a | 2011-07-28 14:26:00 +0000 | [diff] [blame] | 8 | |
senorblanco@chromium.org | 4e75355 | 2009-11-16 21:09:00 +0000 | [diff] [blame] | 9 | #include "SkBlitRow_opts_SSE2.h" |
caryclark@google.com | 83ecdc3 | 2012-06-06 12:10:26 +0000 | [diff] [blame] | 10 | #include "SkBitmapProcState_opts_SSE2.h" |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 11 | #include "SkColorPriv.h" |
senorblanco@chromium.org | c385638 | 2010-12-13 15:27:20 +0000 | [diff] [blame] | 12 | #include "SkUtils.h" |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 13 | |
| 14 | #include <emmintrin.h> |
| 15 | |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 16 | /* SSE2 version of S32_Blend_BlitRow32() |
| 17 | * portable version is in core/SkBlitRow_D32.cpp |
| 18 | */ |
senorblanco@chromium.org | 4e75355 | 2009-11-16 21:09:00 +0000 | [diff] [blame] | 19 | void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, |
| 20 | const SkPMColor* SK_RESTRICT src, |
| 21 | int count, U8CPU alpha) { |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 22 | SkASSERT(alpha <= 255); |
| 23 | if (count <= 0) { |
| 24 | return; |
| 25 | } |
| 26 | |
| 27 | uint32_t src_scale = SkAlpha255To256(alpha); |
| 28 | uint32_t dst_scale = 256 - src_scale; |
| 29 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 30 | if (count >= 4) { |
| 31 | SkASSERT(((size_t)dst & 0x03) == 0); |
| 32 | while (((size_t)dst & 0x0F) != 0) { |
| 33 | *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); |
| 34 | src++; |
| 35 | dst++; |
| 36 | count--; |
| 37 | } |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 38 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 39 | const __m128i *s = reinterpret_cast<const __m128i*>(src); |
| 40 | __m128i *d = reinterpret_cast<__m128i*>(dst); |
| 41 | __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); |
tomhudson@google.com | 98a5b42 | 2012-02-28 16:15:26 +0000 | [diff] [blame] | 42 | __m128i ag_mask = _mm_set1_epi32(0xFF00FF00); |
| 43 | |
| 44 | // Move scale factors to upper byte of word |
| 45 | __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); |
| 46 | __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8); |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 47 | while (count >= 4) { |
| 48 | // Load 4 pixels each of src and dest. |
| 49 | __m128i src_pixel = _mm_loadu_si128(s); |
| 50 | __m128i dst_pixel = _mm_load_si128(d); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 51 | |
tomhudson@google.com | 98a5b42 | 2012-02-28 16:15:26 +0000 | [diff] [blame] | 52 | // Interleave Atom port 0/1 operations based on the execution port |
| 53 | // constraints that multiply can only be executed on port 0 (while |
| 54 | // boolean operations can be executed on either port 0 or port 1) |
| 55 | // because GCC currently doesn't do a good job scheduling |
| 56 | // instructions based on these constraints. |
| 57 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 58 | // Get red and blue pixels into lower byte of each word. |
tomhudson@google.com | 98a5b42 | 2012-02-28 16:15:26 +0000 | [diff] [blame] | 59 | // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b) |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 60 | __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 61 | |
tomhudson@google.com | 98a5b42 | 2012-02-28 16:15:26 +0000 | [diff] [blame] | 62 | // Multiply by scale. |
| 63 | // (4 x (0, rs.h, 0, bs.h)) |
| 64 | // where rs.h stands for the higher byte of r * scale, and |
| 65 | // bs.h the higher byte of b * scale. |
| 66 | src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); |
| 67 | |
| 68 | // Get alpha and green pixels into higher byte of each word. |
| 69 | // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0) |
| 70 | __m128i src_ag = _mm_and_si128(ag_mask, src_pixel); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 71 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 72 | // Multiply by scale. |
tomhudson@google.com | 98a5b42 | 2012-02-28 16:15:26 +0000 | [diff] [blame] | 73 | // (4 x (as.h, as.l, gs.h, gs.l)) |
| 74 | src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 75 | |
tomhudson@google.com | 98a5b42 | 2012-02-28 16:15:26 +0000 | [diff] [blame] | 76 | // Clear the lower byte of the a*scale and g*scale results |
| 77 | // (4 x (as.h, 0, gs.h, 0)) |
| 78 | src_ag = _mm_and_si128(src_ag, ag_mask); |
| 79 | |
| 80 | // Operations the destination pixels are the same as on the |
| 81 | // source pixels. See the comments above. |
| 82 | __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); |
| 83 | dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide); |
| 84 | __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel); |
| 85 | dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide); |
| 86 | dst_ag = _mm_and_si128(dst_ag, ag_mask); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 87 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 88 | // Combine back into RGBA. |
tomhudson@google.com | 98a5b42 | 2012-02-28 16:15:26 +0000 | [diff] [blame] | 89 | // (4 x (as.h, rs.h, gs.h, bs.h)) |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 90 | src_pixel = _mm_or_si128(src_rb, src_ag); |
| 91 | dst_pixel = _mm_or_si128(dst_rb, dst_ag); |
| 92 | |
| 93 | // Add result |
| 94 | __m128i result = _mm_add_epi8(src_pixel, dst_pixel); |
| 95 | _mm_store_si128(d, result); |
| 96 | s++; |
| 97 | d++; |
| 98 | count -= 4; |
| 99 | } |
| 100 | src = reinterpret_cast<const SkPMColor*>(s); |
| 101 | dst = reinterpret_cast<SkPMColor*>(d); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 102 | } |
| 103 | |
senorblanco@chromium.org | 4e75355 | 2009-11-16 21:09:00 +0000 | [diff] [blame] | 104 | while (count > 0) { |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 105 | *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); |
| 106 | src++; |
| 107 | dst++; |
| 108 | count--; |
| 109 | } |
| 110 | } |
| 111 | |
senorblanco@chromium.org | 4e75355 | 2009-11-16 21:09:00 +0000 | [diff] [blame] | 112 | void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, |
| 113 | const SkPMColor* SK_RESTRICT src, |
| 114 | int count, U8CPU alpha) { |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 115 | SkASSERT(alpha == 255); |
| 116 | if (count <= 0) { |
| 117 | return; |
| 118 | } |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 119 | |
| 120 | if (count >= 4) { |
| 121 | SkASSERT(((size_t)dst & 0x03) == 0); |
| 122 | while (((size_t)dst & 0x0F) != 0) { |
| 123 | *dst = SkPMSrcOver(*src, *dst); |
| 124 | src++; |
| 125 | dst++; |
| 126 | count--; |
| 127 | } |
| 128 | |
| 129 | const __m128i *s = reinterpret_cast<const __m128i*>(src); |
| 130 | __m128i *d = reinterpret_cast<__m128i*>(dst); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 131 | #ifdef SK_USE_ACCURATE_BLENDING |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 132 | __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); |
| 133 | __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) |
| 134 | __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) |
| 135 | while (count >= 4) { |
| 136 | // Load 4 pixels |
| 137 | __m128i src_pixel = _mm_loadu_si128(s); |
| 138 | __m128i dst_pixel = _mm_load_si128(d); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 139 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 140 | __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); |
senorblanco@chromium.org | f3f0bd7 | 2009-12-10 22:46:31 +0000 | [diff] [blame] | 141 | __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 142 | // Shift alphas down to lower 8 bits of each quad. |
| 143 | __m128i alpha = _mm_srli_epi32(src_pixel, 24); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 144 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 145 | // Copy alpha to upper 3rd byte of each quad |
| 146 | alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 147 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 148 | // Subtract alphas from 255, to get 0..255 |
| 149 | alpha = _mm_sub_epi16(c_255, alpha); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 150 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 151 | // Multiply by red and blue by src alpha. |
| 152 | dst_rb = _mm_mullo_epi16(dst_rb, alpha); |
| 153 | // Multiply by alpha and green by src alpha. |
| 154 | dst_ag = _mm_mullo_epi16(dst_ag, alpha); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 155 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 156 | // dst_rb_low = (dst_rb >> 8) |
| 157 | __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8); |
| 158 | __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 159 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 160 | // dst_rb = (dst_rb + dst_rb_low + 128) >> 8 |
| 161 | dst_rb = _mm_add_epi16(dst_rb, dst_rb_low); |
| 162 | dst_rb = _mm_add_epi16(dst_rb, c_128); |
| 163 | dst_rb = _mm_srli_epi16(dst_rb, 8); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 164 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 165 | // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask |
| 166 | dst_ag = _mm_add_epi16(dst_ag, dst_ag_low); |
| 167 | dst_ag = _mm_add_epi16(dst_ag, c_128); |
| 168 | dst_ag = _mm_andnot_si128(rb_mask, dst_ag); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 169 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 170 | // Combine back into RGBA. |
| 171 | dst_pixel = _mm_or_si128(dst_rb, dst_ag); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 172 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 173 | // Add result |
| 174 | __m128i result = _mm_add_epi8(src_pixel, dst_pixel); |
| 175 | _mm_store_si128(d, result); |
| 176 | s++; |
| 177 | d++; |
| 178 | count -= 4; |
| 179 | } |
| 180 | #else |
| 181 | __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); |
| 182 | __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) |
| 183 | while (count >= 4) { |
| 184 | // Load 4 pixels |
| 185 | __m128i src_pixel = _mm_loadu_si128(s); |
| 186 | __m128i dst_pixel = _mm_load_si128(d); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 187 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 188 | __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); |
senorblanco@chromium.org | f3f0bd7 | 2009-12-10 22:46:31 +0000 | [diff] [blame] | 189 | __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 190 | |
senorblanco@chromium.org | f3f0bd7 | 2009-12-10 22:46:31 +0000 | [diff] [blame] | 191 | // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word) |
| 192 | __m128i alpha = _mm_srli_epi16(src_pixel, 8); |
| 193 | |
| 194 | // (a0, a0, a1, a1, a2, g2, a3, g3) |
| 195 | alpha = _mm_shufflehi_epi16(alpha, 0xF5); |
| 196 | |
| 197 | // (a0, a0, a1, a1, a2, a2, a3, a3) |
| 198 | alpha = _mm_shufflelo_epi16(alpha, 0xF5); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 199 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 200 | // Subtract alphas from 256, to get 1..256 |
| 201 | alpha = _mm_sub_epi16(c_256, alpha); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 202 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 203 | // Multiply by red and blue by src alpha. |
| 204 | dst_rb = _mm_mullo_epi16(dst_rb, alpha); |
| 205 | // Multiply by alpha and green by src alpha. |
| 206 | dst_ag = _mm_mullo_epi16(dst_ag, alpha); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 207 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 208 | // Divide by 256. |
| 209 | dst_rb = _mm_srli_epi16(dst_rb, 8); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 210 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 211 | // Mask out high bits (already in the right place) |
| 212 | dst_ag = _mm_andnot_si128(rb_mask, dst_ag); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 213 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 214 | // Combine back into RGBA. |
| 215 | dst_pixel = _mm_or_si128(dst_rb, dst_ag); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 216 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 217 | // Add result |
| 218 | __m128i result = _mm_add_epi8(src_pixel, dst_pixel); |
| 219 | _mm_store_si128(d, result); |
| 220 | s++; |
| 221 | d++; |
| 222 | count -= 4; |
| 223 | } |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 224 | #endif |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 225 | src = reinterpret_cast<const SkPMColor*>(s); |
| 226 | dst = reinterpret_cast<SkPMColor*>(d); |
| 227 | } |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 228 | |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 229 | while (count > 0) { |
| 230 | *dst = SkPMSrcOver(*src, *dst); |
| 231 | src++; |
| 232 | dst++; |
| 233 | count--; |
| 234 | } |
| 235 | } |
| 236 | |
senorblanco@chromium.org | 4e75355 | 2009-11-16 21:09:00 +0000 | [diff] [blame] | 237 | void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, |
| 238 | const SkPMColor* SK_RESTRICT src, |
| 239 | int count, U8CPU alpha) { |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 240 | SkASSERT(alpha <= 255); |
| 241 | if (count <= 0) { |
| 242 | return; |
| 243 | } |
| 244 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 245 | if (count >= 4) { |
| 246 | while (((size_t)dst & 0x0F) != 0) { |
| 247 | *dst = SkBlendARGB32(*src, *dst, alpha); |
| 248 | src++; |
| 249 | dst++; |
| 250 | count--; |
| 251 | } |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 252 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 253 | uint32_t src_scale = SkAlpha255To256(alpha); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 254 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 255 | const __m128i *s = reinterpret_cast<const __m128i*>(src); |
| 256 | __m128i *d = reinterpret_cast<__m128i*>(dst); |
tomhudson@google.com | 98a5b42 | 2012-02-28 16:15:26 +0000 | [diff] [blame] | 257 | __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 258 | __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); |
| 259 | __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) |
| 260 | while (count >= 4) { |
| 261 | // Load 4 pixels each of src and dest. |
| 262 | __m128i src_pixel = _mm_loadu_si128(s); |
| 263 | __m128i dst_pixel = _mm_load_si128(d); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 264 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 265 | // Get red and blue pixels into lower byte of each word. |
| 266 | __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); |
| 267 | __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 268 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 269 | // Get alpha and green into lower byte of each word. |
| 270 | __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); |
| 271 | __m128i src_ag = _mm_srli_epi16(src_pixel, 8); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 272 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 273 | // Put per-pixel alpha in low byte of each word. |
tomhudson@google.com | 98a5b42 | 2012-02-28 16:15:26 +0000 | [diff] [blame] | 274 | // After the following two statements, the dst_alpha looks like |
| 275 | // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3) |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 276 | __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); |
| 277 | dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 278 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 279 | // dst_alpha = dst_alpha * src_scale |
tomhudson@google.com | 98a5b42 | 2012-02-28 16:15:26 +0000 | [diff] [blame] | 280 | // Because src_scales are in the higher byte of each word and |
| 281 | // we use mulhi here, the resulting alpha values are already |
| 282 | // in the right place and don't need to be divided by 256. |
| 283 | // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3) |
| 284 | dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 285 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 286 | // Subtract alphas from 256, to get 1..256 |
| 287 | dst_alpha = _mm_sub_epi16(c_256, dst_alpha); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 288 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 289 | // Multiply red and blue by dst pixel alpha. |
| 290 | dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); |
| 291 | // Multiply alpha and green by dst pixel alpha. |
| 292 | dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 293 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 294 | // Multiply red and blue by global alpha. |
tomhudson@google.com | 98a5b42 | 2012-02-28 16:15:26 +0000 | [diff] [blame] | 295 | // (4 x (0, rs.h, 0, bs.h)) |
| 296 | // where rs.h stands for the higher byte of r * src_scale, |
| 297 | // and bs.h the higher byte of b * src_scale. |
| 298 | // Again, because we use mulhi, the resuling red and blue |
| 299 | // values are already in the right place and don't need to |
| 300 | // be divided by 256. |
| 301 | src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 302 | // Multiply alpha and green by global alpha. |
tomhudson@google.com | 98a5b42 | 2012-02-28 16:15:26 +0000 | [diff] [blame] | 303 | // (4 x (0, as.h, 0, gs.h)) |
| 304 | src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 305 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 306 | // Divide by 256. |
| 307 | dst_rb = _mm_srli_epi16(dst_rb, 8); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 308 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 309 | // Mask out low bits (goodies already in the right place; no need to divide) |
| 310 | dst_ag = _mm_andnot_si128(rb_mask, dst_ag); |
tomhudson@google.com | 98a5b42 | 2012-02-28 16:15:26 +0000 | [diff] [blame] | 311 | // Shift alpha and green to higher byte of each word. |
| 312 | // (4 x (as.h, 0, gs.h, 0)) |
| 313 | src_ag = _mm_slli_epi16(src_ag, 8); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 314 | |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 315 | // Combine back into RGBA. |
| 316 | dst_pixel = _mm_or_si128(dst_rb, dst_ag); |
| 317 | src_pixel = _mm_or_si128(src_rb, src_ag); |
| 318 | |
| 319 | // Add two pixels into result. |
| 320 | __m128i result = _mm_add_epi8(src_pixel, dst_pixel); |
| 321 | _mm_store_si128(d, result); |
| 322 | s++; |
| 323 | d++; |
| 324 | count -= 4; |
| 325 | } |
| 326 | src = reinterpret_cast<const SkPMColor*>(s); |
| 327 | dst = reinterpret_cast<SkPMColor*>(d); |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 328 | } |
senorblanco@chromium.org | dc7de74 | 2009-11-30 20:00:29 +0000 | [diff] [blame] | 329 | |
senorblanco@chromium.org | 9272761 | 2009-11-04 20:51:06 +0000 | [diff] [blame] | 330 | while (count > 0) { |
| 331 | *dst = SkBlendARGB32(*src, *dst, alpha); |
| 332 | src++; |
| 333 | dst++; |
| 334 | count--; |
| 335 | } |
| 336 | } |
senorblanco@chromium.org | c385638 | 2010-12-13 15:27:20 +0000 | [diff] [blame] | 337 | |
| 338 | /* SSE2 version of Color32() |
| 339 | * portable version is in core/SkBlitRow_D32.cpp |
| 340 | */ |
| 341 | void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count, |
| 342 | SkPMColor color) { |
| 343 | |
| 344 | if (count <= 0) { |
| 345 | return; |
| 346 | } |
| 347 | |
| 348 | if (0 == color) { |
| 349 | if (src != dst) { |
| 350 | memcpy(dst, src, count * sizeof(SkPMColor)); |
| 351 | } |
reed@google.com | c909a1e | 2011-10-25 19:07:23 +0000 | [diff] [blame] | 352 | return; |
senorblanco@chromium.org | c385638 | 2010-12-13 15:27:20 +0000 | [diff] [blame] | 353 | } |
| 354 | |
| 355 | unsigned colorA = SkGetPackedA32(color); |
| 356 | if (255 == colorA) { |
| 357 | sk_memset32(dst, color, count); |
| 358 | } else { |
| 359 | unsigned scale = 256 - SkAlpha255To256(colorA); |
| 360 | |
| 361 | if (count >= 4) { |
| 362 | SkASSERT(((size_t)dst & 0x03) == 0); |
| 363 | while (((size_t)dst & 0x0F) != 0) { |
| 364 | *dst = color + SkAlphaMulQ(*src, scale); |
| 365 | src++; |
| 366 | dst++; |
| 367 | count--; |
| 368 | } |
| 369 | |
| 370 | const __m128i *s = reinterpret_cast<const __m128i*>(src); |
| 371 | __m128i *d = reinterpret_cast<__m128i*>(dst); |
| 372 | __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); |
| 373 | __m128i src_scale_wide = _mm_set1_epi16(scale); |
| 374 | __m128i color_wide = _mm_set1_epi32(color); |
| 375 | while (count >= 4) { |
| 376 | // Load 4 pixels each of src and dest. |
| 377 | __m128i src_pixel = _mm_loadu_si128(s); |
| 378 | |
| 379 | // Get red and blue pixels into lower byte of each word. |
| 380 | __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); |
reed@google.com | 981d479 | 2011-03-09 12:55:47 +0000 | [diff] [blame] | 381 | |
senorblanco@chromium.org | c385638 | 2010-12-13 15:27:20 +0000 | [diff] [blame] | 382 | // Get alpha and green into lower byte of each word. |
| 383 | __m128i src_ag = _mm_srli_epi16(src_pixel, 8); |
| 384 | |
| 385 | // Multiply by scale. |
| 386 | src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); |
| 387 | src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); |
| 388 | |
| 389 | // Divide by 256. |
| 390 | src_rb = _mm_srli_epi16(src_rb, 8); |
| 391 | src_ag = _mm_andnot_si128(rb_mask, src_ag); |
| 392 | |
| 393 | // Combine back into RGBA. |
| 394 | src_pixel = _mm_or_si128(src_rb, src_ag); |
| 395 | |
| 396 | // Add color to result. |
| 397 | __m128i result = _mm_add_epi8(color_wide, src_pixel); |
| 398 | |
| 399 | // Store result. |
| 400 | _mm_store_si128(d, result); |
| 401 | s++; |
| 402 | d++; |
| 403 | count -= 4; |
| 404 | } |
| 405 | src = reinterpret_cast<const SkPMColor*>(s); |
| 406 | dst = reinterpret_cast<SkPMColor*>(d); |
| 407 | } |
| 408 | |
| 409 | while (count > 0) { |
| 410 | *dst = color + SkAlphaMulQ(*src, scale); |
| 411 | src += 1; |
| 412 | dst += 1; |
| 413 | count--; |
reed@google.com | 981d479 | 2011-03-09 12:55:47 +0000 | [diff] [blame] | 414 | } |
senorblanco@chromium.org | c385638 | 2010-12-13 15:27:20 +0000 | [diff] [blame] | 415 | } |
| 416 | } |
reed@google.com | 981d479 | 2011-03-09 12:55:47 +0000 | [diff] [blame] | 417 | |
reed@google.com | edb606c | 2011-10-18 13:56:50 +0000 | [diff] [blame] | 418 | void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr, |
| 419 | size_t maskRB, SkColor origColor, |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 420 | int width, int height) { |
reed@google.com | ee467ee | 2011-03-09 13:23:57 +0000 | [diff] [blame] | 421 | SkPMColor color = SkPreMultiplyColor(origColor); |
reed@google.com | 981d479 | 2011-03-09 12:55:47 +0000 | [diff] [blame] | 422 | size_t dstOffset = dstRB - (width << 2); |
| 423 | size_t maskOffset = maskRB - width; |
| 424 | SkPMColor* dst = (SkPMColor *)device; |
reed@google.com | edb606c | 2011-10-18 13:56:50 +0000 | [diff] [blame] | 425 | const uint8_t* mask = (const uint8_t*)maskPtr; |
reed@google.com | 981d479 | 2011-03-09 12:55:47 +0000 | [diff] [blame] | 426 | do { |
| 427 | int count = width; |
| 428 | if (count >= 4) { |
| 429 | while (((size_t)dst & 0x0F) != 0 && (count > 0)) { |
| 430 | *dst = SkBlendARGB32(color, *dst, *mask); |
| 431 | mask++; |
| 432 | dst++; |
| 433 | count--; |
| 434 | } |
| 435 | __m128i *d = reinterpret_cast<__m128i*>(dst); |
| 436 | __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); |
| 437 | __m128i c_256 = _mm_set1_epi16(256); |
| 438 | __m128i c_1 = _mm_set1_epi16(1); |
| 439 | __m128i src_pixel = _mm_set1_epi32(color); |
| 440 | while (count >= 4) { |
| 441 | // Load 4 pixels each of src and dest. |
| 442 | __m128i dst_pixel = _mm_load_si128(d); |
| 443 | |
| 444 | //set the aphla value |
| 445 | __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\ |
| 446 | 0, *(mask+3),0, \ |
| 447 | *(mask+2),0, *(mask+2),\ |
| 448 | 0,*(mask+1), 0,*(mask+1),\ |
| 449 | 0, *mask,0,*mask); |
| 450 | |
| 451 | //call SkAlpha255To256() |
| 452 | src_scale_wide = _mm_add_epi16(src_scale_wide, c_1); |
| 453 | |
| 454 | // Get red and blue pixels into lower byte of each word. |
| 455 | __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); |
| 456 | __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); |
| 457 | |
| 458 | // Get alpha and green into lower byte of each word. |
| 459 | __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); |
| 460 | __m128i src_ag = _mm_srli_epi16(src_pixel, 8); |
| 461 | |
| 462 | // Put per-pixel alpha in low byte of each word. |
| 463 | __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); |
| 464 | dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); |
| 465 | |
| 466 | // dst_alpha = dst_alpha * src_scale |
| 467 | dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); |
| 468 | |
| 469 | // Divide by 256. |
| 470 | dst_alpha = _mm_srli_epi16(dst_alpha, 8); |
| 471 | |
| 472 | // Subtract alphas from 256, to get 1..256 |
| 473 | dst_alpha = _mm_sub_epi16(c_256, dst_alpha); |
| 474 | // Multiply red and blue by dst pixel alpha. |
| 475 | dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); |
| 476 | // Multiply alpha and green by dst pixel alpha. |
| 477 | dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); |
| 478 | |
| 479 | // Multiply red and blue by global alpha. |
| 480 | src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); |
| 481 | // Multiply alpha and green by global alpha. |
| 482 | src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); |
| 483 | // Divide by 256. |
| 484 | dst_rb = _mm_srli_epi16(dst_rb, 8); |
| 485 | src_rb = _mm_srli_epi16(src_rb, 8); |
| 486 | |
| 487 | // Mask out low bits (goodies already in the right place; no need to divide) |
| 488 | dst_ag = _mm_andnot_si128(rb_mask, dst_ag); |
| 489 | src_ag = _mm_andnot_si128(rb_mask, src_ag); |
| 490 | |
| 491 | // Combine back into RGBA. |
| 492 | dst_pixel = _mm_or_si128(dst_rb, dst_ag); |
| 493 | __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag); |
| 494 | |
| 495 | // Add two pixels into result. |
| 496 | __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel); |
| 497 | _mm_store_si128(d, result); |
| 498 | // load the next 4 pixel |
| 499 | mask = mask + 4; |
| 500 | d++; |
| 501 | count -= 4; |
| 502 | } |
| 503 | dst = reinterpret_cast<SkPMColor *>(d); |
| 504 | } |
| 505 | while(count > 0) { |
| 506 | *dst= SkBlendARGB32(color, *dst, *mask); |
| 507 | dst += 1; |
| 508 | mask++; |
| 509 | count --; |
| 510 | } |
| 511 | dst = (SkPMColor *)((char*)dst + dstOffset); |
| 512 | mask += maskOffset; |
| 513 | } while (--height != 0); |
| 514 | } |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 515 | |
// Shift amounts that move the top 5 bits of each RGB16 mask component onto
// the position of the matching component in an SkPMColor. A positive value
// means a left shift, a negative value a right shift, zero means the bits
// already line up.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Red: apply the shift to all four 32-bit lanes of x. The result is not
// masked; callers select the wanted bits themselves.
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

// Green: same scheme as red.
#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

// Blue: same scheme as red.
#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif
| 546 | |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 547 | static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst, |
| 548 | __m128i &mask, __m128i &srcA) { |
| 549 | // In the following comments, the components of src, dst and mask are |
| 550 | // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked |
| 551 | // by an R, G, B, or A suffix. Components of one of the four pixels that |
| 552 | // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for |
| 553 | // example is the blue channel of the second destination pixel. Memory |
| 554 | // layout is shown for an ARGB byte order in a color value. |
| 555 | |
| 556 | // src and srcA store 8-bit values interleaved with zeros. |
| 557 | // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) |
| 558 | // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0, |
| 559 | // srcA, 0, srcA, 0, srcA, 0, srcA, 0) |
| 560 | // mask stores 16-bit values (compressed three channels) interleaved with zeros. |
| 561 | // Lo and Hi denote the low and high bytes of a 16-bit value, respectively. |
| 562 | // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, |
| 563 | // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) |
| 564 | |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 565 | // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 566 | // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) |
bungeman@google.com | 8cd5ae7 | 2012-07-09 17:44:57 +0000 | [diff] [blame] | 567 | __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), |
| 568 | _mm_set1_epi32(0x1F << SK_R32_SHIFT)); |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 569 | |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 570 | // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) |
bungeman@google.com | 8cd5ae7 | 2012-07-09 17:44:57 +0000 | [diff] [blame] | 571 | __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), |
| 572 | _mm_set1_epi32(0x1F << SK_G32_SHIFT)); |
rmistry@google.com | fbfcd56 | 2012-08-23 18:09:54 +0000 | [diff] [blame] | 573 | |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 574 | // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) |
bungeman@google.com | 8cd5ae7 | 2012-07-09 17:44:57 +0000 | [diff] [blame] | 575 | __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), |
| 576 | _mm_set1_epi32(0x1F << SK_B32_SHIFT)); |
rmistry@google.com | fbfcd56 | 2012-08-23 18:09:54 +0000 | [diff] [blame] | 577 | |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 578 | // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 579 | // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an |
| 580 | // 8-bit position |
| 581 | // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, |
| 582 | // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 583 | mask = _mm_or_si128(_mm_or_si128(r, g), b); |
| 584 | |
rmistry@google.com | fbfcd56 | 2012-08-23 18:09:54 +0000 | [diff] [blame] | 585 | // Interleave R,G,B into the lower byte of word. |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 586 | // i.e. split the sixteen 8-bit values from mask into two sets of eight |
| 587 | // 16-bit values, padded by zero. |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 588 | __m128i maskLo, maskHi; |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 589 | // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 590 | maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 591 | // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 592 | maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); |
| 593 | |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 594 | // Upscale from 0..31 to 0..32 |
| 595 | // (allows to replace division by left-shift further down) |
| 596 | // Left-shift each component by 4 and add the result back to that component, |
| 597 | // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 598 | maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); |
| 599 | maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); |
| 600 | |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 601 | // Multiply each component of maskLo and maskHi by srcA |
| 602 | maskLo = _mm_mullo_epi16(maskLo, srcA); |
| 603 | maskHi = _mm_mullo_epi16(maskHi, srcA); |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 604 | |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 605 | // Left shift mask components by 8 (divide by 256) |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 606 | maskLo = _mm_srli_epi16(maskLo, 8); |
| 607 | maskHi = _mm_srli_epi16(maskHi, 8); |
| 608 | |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 609 | // Interleave R,G,B into the lower byte of the word |
| 610 | // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 611 | __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 612 | // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 613 | __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); |
| 614 | |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 615 | // mask = (src - dst) * mask |
| 616 | maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); |
| 617 | maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 618 | |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 619 | // mask = (src - dst) * mask >> 5 |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 620 | maskLo = _mm_srai_epi16(maskLo, 5); |
| 621 | maskHi = _mm_srai_epi16(maskHi, 5); |
| 622 | |
| 623 | // Add two pixels into result. |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 624 | // result = dst + ((src - dst) * mask >> 5) |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 625 | __m128i resultLo = _mm_add_epi16(dstLo, maskLo); |
| 626 | __m128i resultHi = _mm_add_epi16(dstHi, maskHi); |
| 627 | |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 628 | // Pack into 4 32bit dst pixels. |
| 629 | // resultLo and resultHi contain eight 16-bit components (two pixels) each. |
| 630 | // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), |
| 631 | // clamping to 255 if necessary. |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 632 | return _mm_packus_epi16(resultLo, resultHi); |
| 633 | } |
| 634 | |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 635 | static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst, |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 636 | __m128i &mask) { |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 637 | // In the following comments, the components of src, dst and mask are |
| 638 | // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked |
| 639 | // by an R, G, B, or A suffix. Components of one of the four pixels that |
| 640 | // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for |
| 641 | // example is the blue channel of the second destination pixel. Memory |
| 642 | // layout is shown for an ARGB byte order in a color value. |
| 643 | |
| 644 | // src and srcA store 8-bit values interleaved with zeros. |
| 645 | // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) |
| 646 | // mask stores 16-bit values (shown as high and low bytes) interleaved with |
| 647 | // zeros |
| 648 | // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, |
| 649 | // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) |
| 650 | |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 651 | // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 652 | // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) |
bungeman@google.com | 8cd5ae7 | 2012-07-09 17:44:57 +0000 | [diff] [blame] | 653 | __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), |
| 654 | _mm_set1_epi32(0x1F << SK_R32_SHIFT)); |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 655 | |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 656 | // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) |
bungeman@google.com | 8cd5ae7 | 2012-07-09 17:44:57 +0000 | [diff] [blame] | 657 | __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), |
| 658 | _mm_set1_epi32(0x1F << SK_G32_SHIFT)); |
rmistry@google.com | fbfcd56 | 2012-08-23 18:09:54 +0000 | [diff] [blame] | 659 | |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 660 | // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) |
bungeman@google.com | 8cd5ae7 | 2012-07-09 17:44:57 +0000 | [diff] [blame] | 661 | __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), |
| 662 | _mm_set1_epi32(0x1F << SK_B32_SHIFT)); |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 663 | |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 664 | // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 665 | // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an |
| 666 | // 8-bit position |
| 667 | // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, |
| 668 | // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 669 | mask = _mm_or_si128(_mm_or_si128(r, g), b); |
| 670 | |
rmistry@google.com | fbfcd56 | 2012-08-23 18:09:54 +0000 | [diff] [blame] | 671 | // Interleave R,G,B into the lower byte of word. |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 672 | // i.e. split the sixteen 8-bit values from mask into two sets of eight |
| 673 | // 16-bit values, padded by zero. |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 674 | __m128i maskLo, maskHi; |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 675 | // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 676 | maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 677 | // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 678 | maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); |
| 679 | |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 680 | // Upscale from 0..31 to 0..32 |
| 681 | // (allows to replace division by left-shift further down) |
| 682 | // Left-shift each component by 4 and add the result back to that component, |
| 683 | // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 684 | maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); |
| 685 | maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); |
| 686 | |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 687 | // Interleave R,G,B into the lower byte of the word |
| 688 | // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 689 | __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 690 | // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 691 | __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); |
| 692 | |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 693 | // mask = (src - dst) * mask |
| 694 | maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); |
| 695 | maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 696 | |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 697 | // mask = (src - dst) * mask >> 5 |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 698 | maskLo = _mm_srai_epi16(maskLo, 5); |
| 699 | maskHi = _mm_srai_epi16(maskHi, 5); |
| 700 | |
| 701 | // Add two pixels into result. |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 702 | // result = dst + ((src - dst) * mask >> 5) |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 703 | __m128i resultLo = _mm_add_epi16(dstLo, maskLo); |
| 704 | __m128i resultHi = _mm_add_epi16(dstHi, maskHi); |
| 705 | |
bungeman@google.com | 27123cd | 2012-08-21 19:25:42 +0000 | [diff] [blame] | 706 | // Pack into 4 32bit dst pixels and force opaque. |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 707 | // resultLo and resultHi contain eight 16-bit components (two pixels) each. |
| 708 | // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), |
| 709 | // clamping to 255 if necessary. Set alpha components to 0xFF. |
bungeman@google.com | 27123cd | 2012-08-21 19:25:42 +0000 | [diff] [blame] | 710 | return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi), |
| 711 | _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT)); |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 712 | } |
| 713 | |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 714 | void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[], |
| 715 | SkColor src, int width, SkPMColor) { |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 716 | if (width <= 0) { |
| 717 | return; |
| 718 | } |
| 719 | |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 720 | int srcA = SkColorGetA(src); |
| 721 | int srcR = SkColorGetR(src); |
| 722 | int srcG = SkColorGetG(src); |
| 723 | int srcB = SkColorGetB(src); |
rmistry@google.com | fbfcd56 | 2012-08-23 18:09:54 +0000 | [diff] [blame] | 724 | |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 725 | srcA = SkAlpha255To256(srcA); |
| 726 | |
| 727 | if (width >= 4) { |
| 728 | SkASSERT(((size_t)dst & 0x03) == 0); |
| 729 | while (((size_t)dst & 0x0F) != 0) { |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 730 | *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); |
| 731 | mask++; |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 732 | dst++; |
| 733 | width--; |
| 734 | } |
| 735 | |
| 736 | __m128i *d = reinterpret_cast<__m128i*>(dst); |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 737 | // Set alpha to 0xFF and replicate source four times in SSE register. |
| 738 | __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); |
| 739 | // Interleave with zeros to get two sets of four 16-bit values. |
| 740 | src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); |
| 741 | // Set srcA_sse to contain eight copies of srcA, padded with zero. |
| 742 | // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) |
| 743 | __m128i srcA_sse = _mm_set1_epi16(srcA); |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 744 | while (width >= 4) { |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 745 | // Load four destination pixels into dst_sse. |
| 746 | __m128i dst_sse = _mm_load_si128(d); |
| 747 | // Load four 16-bit masks into lower half of mask_sse. |
| 748 | __m128i mask_sse = _mm_loadl_epi64( |
| 749 | reinterpret_cast<const __m128i*>(mask)); |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 750 | |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 751 | // Check whether masks are equal to 0 and get the highest bit |
| 752 | // of each byte of result, if masks are all zero, we will get |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 753 | // pack_cmp to 0xFFFF |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 754 | int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 755 | _mm_setzero_si128())); |
| 756 | |
| 757 | // if mask pixels are not all zero, we will blend the dst pixels |
| 758 | if (pack_cmp != 0xFFFF) { |
rmistry@google.com | fbfcd56 | 2012-08-23 18:09:54 +0000 | [diff] [blame] | 759 | // Unpack 4 16bit mask pixels to |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 760 | // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, |
| 761 | // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) |
| 762 | mask_sse = _mm_unpacklo_epi16(mask_sse, |
| 763 | _mm_setzero_si128()); |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 764 | |
| 765 | // Process 4 32bit dst pixels |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 766 | __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse, |
| 767 | mask_sse, srcA_sse); |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 768 | _mm_store_si128(d, result); |
| 769 | } |
| 770 | |
| 771 | d++; |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 772 | mask += 4; |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 773 | width -= 4; |
| 774 | } |
| 775 | |
| 776 | dst = reinterpret_cast<SkPMColor*>(d); |
| 777 | } |
| 778 | |
| 779 | while (width > 0) { |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 780 | *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); |
| 781 | mask++; |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 782 | dst++; |
rmistry@google.com | fbfcd56 | 2012-08-23 18:09:54 +0000 | [diff] [blame] | 783 | width--; |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 784 | } |
| 785 | } |
| 786 | |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 787 | void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[], |
| 788 | SkColor src, int width, SkPMColor opaqueDst) { |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 789 | if (width <= 0) { |
| 790 | return; |
| 791 | } |
| 792 | |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 793 | int srcR = SkColorGetR(src); |
| 794 | int srcG = SkColorGetG(src); |
| 795 | int srcB = SkColorGetB(src); |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 796 | |
| 797 | if (width >= 4) { |
| 798 | SkASSERT(((size_t)dst & 0x03) == 0); |
| 799 | while (((size_t)dst & 0x0F) != 0) { |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 800 | *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); |
| 801 | mask++; |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 802 | dst++; |
| 803 | width--; |
| 804 | } |
| 805 | |
| 806 | __m128i *d = reinterpret_cast<__m128i*>(dst); |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 807 | // Set alpha to 0xFF and replicate source four times in SSE register. |
| 808 | __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); |
| 809 | // Set srcA_sse to contain eight copies of srcA, padded with zero. |
| 810 | // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) |
| 811 | src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 812 | while (width >= 4) { |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 813 | // Load four destination pixels into dst_sse. |
| 814 | __m128i dst_sse = _mm_load_si128(d); |
| 815 | // Load four 16-bit masks into lower half of mask_sse. |
| 816 | __m128i mask_sse = _mm_loadl_epi64( |
| 817 | reinterpret_cast<const __m128i*>(mask)); |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 818 | |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 819 | // Check whether masks are equal to 0 and get the highest bit |
| 820 | // of each byte of result, if masks are all zero, we will get |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 821 | // pack_cmp to 0xFFFF |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 822 | int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 823 | _mm_setzero_si128())); |
| 824 | |
| 825 | // if mask pixels are not all zero, we will blend the dst pixels |
| 826 | if (pack_cmp != 0xFFFF) { |
rmistry@google.com | fbfcd56 | 2012-08-23 18:09:54 +0000 | [diff] [blame] | 827 | // Unpack 4 16bit mask pixels to |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 828 | // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, |
| 829 | // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) |
| 830 | mask_sse = _mm_unpacklo_epi16(mask_sse, |
| 831 | _mm_setzero_si128()); |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 832 | |
| 833 | // Process 4 32bit dst pixels |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 834 | __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse, |
| 835 | mask_sse); |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 836 | _mm_store_si128(d, result); |
| 837 | } |
| 838 | |
| 839 | d++; |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 840 | mask += 4; |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 841 | width -= 4; |
| 842 | } |
| 843 | |
| 844 | dst = reinterpret_cast<SkPMColor*>(d); |
| 845 | } |
| 846 | |
| 847 | while (width > 0) { |
commit-bot@chromium.org | 76e0d13 | 2013-07-02 17:40:19 +0000 | [diff] [blame] | 848 | *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); |
| 849 | mask++; |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 850 | dst++; |
rmistry@google.com | fbfcd56 | 2012-08-23 18:09:54 +0000 | [diff] [blame] | 851 | width--; |
tomhudson@google.com | d6770e6 | 2012-02-14 16:01:15 +0000 | [diff] [blame] | 852 | } |
| 853 | } |