blob: 2484123767068aa4dc13b7ca6762661ea2240008 [file] [log] [blame]
/*
 * Copyright 2009 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
7
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +00008#include "SkBitmapProcState_opts_SSE2.h"
Florin Malita99537372017-01-04 13:01:55 -05009#include "SkBitmapProcState_utils.h"
Cary Clarka4083c92017-09-15 11:59:23 -040010#include "SkColorData.h"
reed@google.com9cfc83c2013-07-22 17:18:18 +000011#include "SkPaint.h"
Hal Canaryc640d0d2018-06-13 09:59:02 -040012#include "SkTo.h"
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000013#include "SkUtils.h"
14
Hal Canaryc640d0d2018-06-13 09:59:02 -040015#include <emmintrin.h>
16
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000017void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
18 const uint32_t* xy,
19 int count, uint32_t* colors) {
halcanary96fcdcc2015-08-27 07:41:13 -070020 SkASSERT(count > 0 && colors != nullptr);
reed05a56472016-03-02 09:49:02 -080021 SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
reedad7ae6c2015-06-04 14:12:25 -070022 SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
senorblanco@chromium.orgaa4f0c62009-12-01 13:36:19 +000023 SkASSERT(s.fAlphaScale == 256);
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000024
reedad7ae6c2015-06-04 14:12:25 -070025 const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());
26 size_t rb = s.fPixmap.rowBytes();
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000027 uint32_t XY = *xy++;
28 unsigned y0 = XY >> 14;
29 const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
30 const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
31 unsigned subY = y0 & 0xF;
32
33 // ( 0, 0, 0, 0, 0, 0, 0, 16)
34 __m128i sixteen = _mm_cvtsi32_si128(16);
35
36 // ( 0, 0, 0, 0, 16, 16, 16, 16)
37 sixteen = _mm_shufflelo_epi16(sixteen, 0);
38
39 // ( 0, 0, 0, 0, 0, 0, 0, y)
40 __m128i allY = _mm_cvtsi32_si128(subY);
41
42 // ( 0, 0, 0, 0, y, y, y, y)
43 allY = _mm_shufflelo_epi16(allY, 0);
44
45 // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)
46 __m128i negY = _mm_sub_epi16(sixteen, allY);
47
48 // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
49 allY = _mm_unpacklo_epi64(allY, negY);
50
51 // (16, 16, 16, 16, 16, 16, 16, 16 )
52 sixteen = _mm_shuffle_epi32(sixteen, 0);
53
54 // ( 0, 0, 0, 0, 0, 0, 0, 0)
55 __m128i zero = _mm_setzero_si128();
56 do {
57 uint32_t XX = *xy++; // x0:14 | 4 | x1:14
58 unsigned x0 = XX >> 18;
59 unsigned x1 = XX & 0x3FFF;
60
61 // (0, 0, 0, 0, 0, 0, 0, x)
62 __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
rmistry@google.comfbfcd562012-08-23 18:09:54 +000063
senorblanco@chromium.orgdc7de742009-11-30 20:00:29 +000064 // (0, 0, 0, 0, x, x, x, x)
65 allX = _mm_shufflelo_epi16(allX, 0);
66
67 // (x, x, x, x, x, x, x, x)
68 allX = _mm_shuffle_epi32(allX, 0);
69
70 // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
71 __m128i negX = _mm_sub_epi16(sixteen, allX);
72
73 // Load 4 samples (pixels).
74 __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
75 __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
76 __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
77 __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
78
79 // (0, 0, a00, a10)
80 __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
81
82 // Expand to 16 bits per component.
83 a00a10 = _mm_unpacklo_epi8(a00a10, zero);
84
85 // ((a00 * (16-y)), (a10 * y)).
86 a00a10 = _mm_mullo_epi16(a00a10, allY);
87
88 // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
89 a00a10 = _mm_mullo_epi16(a00a10, negX);
90
91 // (0, 0, a01, a10)
92 __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
93
94 // Expand to 16 bits per component.
95 a01a11 = _mm_unpacklo_epi8(a01a11, zero);
96
97 // (a01 * (16-y)), (a11 * y)
98 a01a11 = _mm_mullo_epi16(a01a11, allY);
99
100 // (a01 * (16-y) * x), (a11 * y * x)
101 a01a11 = _mm_mullo_epi16(a01a11, allX);
102
103 // (a00*w00 + a01*w01, a10*w10 + a11*w11)
104 __m128i sum = _mm_add_epi16(a00a10, a01a11);
105
106 // (DC, a00*w00 + a01*w01)
107 __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
108
109 // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
110 sum = _mm_add_epi16(sum, shifted);
111
112 // Divide each 16 bit component by 256.
113 sum = _mm_srli_epi16(sum, 8);
114
115 // Pack lower 4 16 bit values of sum into lower 4 bytes.
116 sum = _mm_packus_epi16(sum, zero);
117
118 // Extract low int and store.
119 *colors++ = _mm_cvtsi128_si32(sum);
120 } while (--count > 0);
121}
senorblanco@chromium.orgf3f0bd72009-12-10 22:46:31 +0000122
123void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
124 const uint32_t* xy,
125 int count, uint32_t* colors) {
halcanary96fcdcc2015-08-27 07:41:13 -0700126 SkASSERT(count > 0 && colors != nullptr);
reed05a56472016-03-02 09:49:02 -0800127 SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
reedad7ae6c2015-06-04 14:12:25 -0700128 SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
senorblanco@chromium.orgf3f0bd72009-12-10 22:46:31 +0000129 SkASSERT(s.fAlphaScale < 256);
130
reedad7ae6c2015-06-04 14:12:25 -0700131 const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());
132 size_t rb = s.fPixmap.rowBytes();
senorblanco@chromium.orgf3f0bd72009-12-10 22:46:31 +0000133 uint32_t XY = *xy++;
134 unsigned y0 = XY >> 14;
135 const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
136 const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
137 unsigned subY = y0 & 0xF;
138
139 // ( 0, 0, 0, 0, 0, 0, 0, 16)
140 __m128i sixteen = _mm_cvtsi32_si128(16);
141
142 // ( 0, 0, 0, 0, 16, 16, 16, 16)
143 sixteen = _mm_shufflelo_epi16(sixteen, 0);
144
145 // ( 0, 0, 0, 0, 0, 0, 0, y)
146 __m128i allY = _mm_cvtsi32_si128(subY);
147
148 // ( 0, 0, 0, 0, y, y, y, y)
149 allY = _mm_shufflelo_epi16(allY, 0);
150
151 // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)
152 __m128i negY = _mm_sub_epi16(sixteen, allY);
153
154 // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
155 allY = _mm_unpacklo_epi64(allY, negY);
156
157 // (16, 16, 16, 16, 16, 16, 16, 16 )
158 sixteen = _mm_shuffle_epi32(sixteen, 0);
159
160 // ( 0, 0, 0, 0, 0, 0, 0, 0)
161 __m128i zero = _mm_setzero_si128();
162
163 // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
164 __m128i alpha = _mm_set1_epi16(s.fAlphaScale);
165
166 do {
167 uint32_t XX = *xy++; // x0:14 | 4 | x1:14
168 unsigned x0 = XX >> 18;
169 unsigned x1 = XX & 0x3FFF;
170
171 // (0, 0, 0, 0, 0, 0, 0, x)
172 __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000173
senorblanco@chromium.orgf3f0bd72009-12-10 22:46:31 +0000174 // (0, 0, 0, 0, x, x, x, x)
175 allX = _mm_shufflelo_epi16(allX, 0);
176
177 // (x, x, x, x, x, x, x, x)
178 allX = _mm_shuffle_epi32(allX, 0);
179
180 // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
181 __m128i negX = _mm_sub_epi16(sixteen, allX);
182
183 // Load 4 samples (pixels).
184 __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
185 __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
186 __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
187 __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
188
189 // (0, 0, a00, a10)
190 __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
191
192 // Expand to 16 bits per component.
193 a00a10 = _mm_unpacklo_epi8(a00a10, zero);
194
195 // ((a00 * (16-y)), (a10 * y)).
196 a00a10 = _mm_mullo_epi16(a00a10, allY);
197
198 // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
199 a00a10 = _mm_mullo_epi16(a00a10, negX);
200
201 // (0, 0, a01, a10)
202 __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
203
204 // Expand to 16 bits per component.
205 a01a11 = _mm_unpacklo_epi8(a01a11, zero);
206
207 // (a01 * (16-y)), (a11 * y)
208 a01a11 = _mm_mullo_epi16(a01a11, allY);
209
210 // (a01 * (16-y) * x), (a11 * y * x)
211 a01a11 = _mm_mullo_epi16(a01a11, allX);
212
213 // (a00*w00 + a01*w01, a10*w10 + a11*w11)
214 __m128i sum = _mm_add_epi16(a00a10, a01a11);
215
216 // (DC, a00*w00 + a01*w01)
217 __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
218
219 // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
220 sum = _mm_add_epi16(sum, shifted);
221
222 // Divide each 16 bit component by 256.
223 sum = _mm_srli_epi16(sum, 8);
224
225 // Multiply by alpha.
226 sum = _mm_mullo_epi16(sum, alpha);
227
228 // Divide each 16 bit component by 256.
229 sum = _mm_srli_epi16(sum, 8);
230
231 // Pack lower 4 16 bit values of sum into lower 4 bytes.
232 sum = _mm_packus_epi16(sum, zero);
233
234 // Extract low int and store.
235 *colors++ = _mm_cvtsi128_si32(sum);
236 } while (--count > 0);
237}
tomhudson@google.com06a73132012-02-22 18:30:43 +0000238
Mike Reed2eab65b2018-04-17 12:01:10 -0400239// Temporarily go into 64bit so we don't overflow during the add. Since we shift down by 16
240// in the end, the result should always fit back in 32bits.
241static inline int32_t safe_fixed_add_shift(SkFixed a, SkFixed b) {
242 int64_t tmp = a;
243 return SkToS32((tmp + b) >> 16);
244}
245
tomhudson@google.com06a73132012-02-22 18:30:43 +0000246static inline uint32_t ClampX_ClampY_pack_filter(SkFixed f, unsigned max,
247 SkFixed one) {
248 unsigned i = SkClampMax(f >> 16, max);
249 i = (i << 4) | ((f >> 12) & 0xF);
Mike Reed2eab65b2018-04-17 12:01:10 -0400250 return (i << 14) | SkClampMax(safe_fixed_add_shift(f, one), max);
tomhudson@google.com06a73132012-02-22 18:30:43 +0000251}
252
253/* SSE version of ClampX_ClampY_filter_scale()
254 * portable version is in core/SkBitmapProcState_matrix.h
255 */
256void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
257 int count, int x, int y) {
258 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
259 SkMatrix::kScale_Mask)) == 0);
260 SkASSERT(s.fInvKy == 0);
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000261
reedad7ae6c2015-06-04 14:12:25 -0700262 const unsigned maxX = s.fPixmap.width() - 1;
tomhudson@google.com06a73132012-02-22 18:30:43 +0000263 const SkFixed one = s.fFilterOneX;
264 const SkFixed dx = s.fInvSx;
tomhudson@google.com06a73132012-02-22 18:30:43 +0000265
fmalita2404f032016-02-03 05:44:21 -0800266 const SkBitmapProcStateAutoMapper mapper(s, x, y);
fmalitabe5cfa92016-02-03 10:21:33 -0800267 const SkFixed fy = mapper.fixedY();
reedad7ae6c2015-06-04 14:12:25 -0700268 const unsigned maxY = s.fPixmap.height() - 1;
tomhudson@google.com06a73132012-02-22 18:30:43 +0000269 // compute our two Y values up front
270 *xy++ = ClampX_ClampY_pack_filter(fy, maxY, s.fFilterOneY);
271 // now initialize fx
fmalitabe5cfa92016-02-03 10:21:33 -0800272 SkFixed fx = mapper.fixedX();
tomhudson@google.com06a73132012-02-22 18:30:43 +0000273
274 // test if we don't need to apply the tile proc
Florin Malita99537372017-01-04 13:01:55 -0500275 if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
tomhudson@google.com06a73132012-02-22 18:30:43 +0000276 if (count >= 4) {
277 // SSE version of decal_filter_scale
278 while ((size_t(xy) & 0x0F) != 0) {
279 SkASSERT((fx >> (16 + 14)) == 0);
280 *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
281 fx += dx;
282 count--;
283 }
284
285 __m128i wide_1 = _mm_set1_epi32(1);
286 __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
287 __m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
288 fx + dx, fx);
289
290 while (count >= 4) {
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000291 __m128i wide_out;
292
tomhudson@google.com06a73132012-02-22 18:30:43 +0000293 wide_out = _mm_slli_epi32(_mm_srai_epi32(wide_fx, 12), 14);
294 wide_out = _mm_or_si128(wide_out, _mm_add_epi32(
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000295 _mm_srai_epi32(wide_fx, 16), wide_1));
296
tomhudson@google.com06a73132012-02-22 18:30:43 +0000297 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_out);
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000298
tomhudson@google.com06a73132012-02-22 18:30:43 +0000299 xy += 4;
300 fx += dx * 4;
301 wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
302 count -= 4;
303 } // while count >= 4
304 } // if count >= 4
305
306 while (count-- > 0) {
307 SkASSERT((fx >> (16 + 14)) == 0);
308 *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
309 fx += dx;
310 }
311 } else {
312 // SSE2 only support 16bit interger max & min, so only process the case
313 // maxX less than the max 16bit interger. Actually maxX is the bitmap's
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000314 // height, there should be rare bitmap whose height will be greater
tomhudson@google.com06a73132012-02-22 18:30:43 +0000315 // than max 16bit interger in the real world.
316 if ((count >= 4) && (maxX <= 0xFFFF)) {
317 while (((size_t)xy & 0x0F) != 0) {
318 *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
319 fx += dx;
320 count--;
321 }
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000322
tomhudson@google.com06a73132012-02-22 18:30:43 +0000323 __m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
324 fx + dx, fx);
325 __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
326 __m128i wide_one = _mm_set1_epi32(one);
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000327 __m128i wide_maxX = _mm_set1_epi32(maxX);
tomhudson@google.com06a73132012-02-22 18:30:43 +0000328 __m128i wide_mask = _mm_set1_epi32(0xF);
329
330 while (count >= 4) {
331 __m128i wide_i;
332 __m128i wide_lo;
333 __m128i wide_fx1;
334
335 // i = SkClampMax(f>>16,maxX)
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000336 wide_i = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
tomhudson@google.com06a73132012-02-22 18:30:43 +0000337 _mm_setzero_si128());
338 wide_i = _mm_min_epi16(wide_i, wide_maxX);
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000339
Florin Malitad1c550e2016-12-19 10:55:41 -0500340 // i<<4 | EXTRACT_LOW_BITS(fx)
tomhudson@google.com06a73132012-02-22 18:30:43 +0000341 wide_lo = _mm_srli_epi32(wide_fx, 12);
342 wide_lo = _mm_and_si128(wide_lo, wide_mask);
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000343 wide_i = _mm_slli_epi32(wide_i, 4);
344 wide_i = _mm_or_si128(wide_i, wide_lo);
345
tomhudson@google.com06a73132012-02-22 18:30:43 +0000346 // i<<14
347 wide_i = _mm_slli_epi32(wide_i, 14);
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000348
tomhudson@google.com06a73132012-02-22 18:30:43 +0000349 // SkClampMax(((f+one))>>16,max)
350 wide_fx1 = _mm_add_epi32(wide_fx, wide_one);
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000351 wide_fx1 = _mm_max_epi16(_mm_srli_epi32(wide_fx1, 16),
tomhudson@google.com06a73132012-02-22 18:30:43 +0000352 _mm_setzero_si128());
353 wide_fx1 = _mm_min_epi16(wide_fx1, wide_maxX);
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000354
tomhudson@google.com06a73132012-02-22 18:30:43 +0000355 // final combination
356 wide_i = _mm_or_si128(wide_i, wide_fx1);
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000357 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
358
tomhudson@google.com06a73132012-02-22 18:30:43 +0000359 wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000360 fx += dx * 4;
tomhudson@google.com06a73132012-02-22 18:30:43 +0000361 xy += 4;
362 count -= 4;
363 } // while count >= 4
364 } // if count >= 4
365
Mike Reed010ce2b2018-05-09 13:53:59 -0400366 /*
tomhudson@google.com06a73132012-02-22 18:30:43 +0000367 while (count-- > 0) {
368 *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
369 fx += dx;
370 }
Mike Reed010ce2b2018-05-09 13:53:59 -0400371 We'd like to write this as above, but that form allows fx to get 1-iteration too big/small
372 when count is 0, and this can trigger a UBSAN error, even though we won't in fact use that
373 last (undefined) value for fx.
374
375 Here is an alternative that should always be efficient, but seems much harder to read:
376
377 if (count > 0) {
378 for (;;) {
379 *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
380 if (--count == 0) break;
381 fx += dx;
382 }
383 }
384
385 For now, we'll try this variant: more compact than the if/for version, and we hope the
386 compiler will get rid of the integer multiply.
387 */
388 for (int i = 0; i < count; ++i) {
389 *xy++ = ClampX_ClampY_pack_filter(fx + i*dx, maxX, one);
390 }
tomhudson@google.com06a73132012-02-22 18:30:43 +0000391 }
392}
393
394/* SSE version of ClampX_ClampY_nofilter_scale()
395 * portable version is in core/SkBitmapProcState_matrix.h
396 */
397void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
398 uint32_t xy[], int count, int x, int y) {
399 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
400 SkMatrix::kScale_Mask)) == 0);
401
402 // we store y, x, x, x, x, x
reedad7ae6c2015-06-04 14:12:25 -0700403 const unsigned maxX = s.fPixmap.width() - 1;
fmalitaeb543072016-02-02 10:17:24 -0800404 const SkBitmapProcStateAutoMapper mapper(s, x, y);
reedad7ae6c2015-06-04 14:12:25 -0700405 const unsigned maxY = s.fPixmap.height() - 1;
fmalitabe5cfa92016-02-03 10:21:33 -0800406 *xy++ = SkClampMax(mapper.intY(), maxY);
407 SkFixed fx = mapper.fixedX();
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000408
tomhudson@google.com06a73132012-02-22 18:30:43 +0000409 if (0 == maxX) {
410 // all of the following X values must be 0
411 memset(xy, 0, count * sizeof(uint16_t));
412 return;
413 }
414
415 const SkFixed dx = s.fInvSx;
416
417 // test if we don't need to apply the tile proc
418 if ((unsigned)(fx >> 16) <= maxX &&
419 (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
420 // SSE version of decal_nofilter_scale
421 if (count >= 8) {
422 while (((size_t)xy & 0x0F) != 0) {
423 *xy++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
424 fx += 2 * dx;
425 count -= 2;
426 }
427
428 __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
429 __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
430
431 __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
432 fx + dx, fx);
433 __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
434
435 while (count >= 8) {
436 __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
437 __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
438
439 __m128i wide_result = _mm_packs_epi32(wide_out_low,
440 wide_out_high);
441 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000442
tomhudson@google.com06a73132012-02-22 18:30:43 +0000443 wide_low = _mm_add_epi32(wide_low, wide_dx8);
444 wide_high = _mm_add_epi32(wide_high, wide_dx8);
445
446 xy += 4;
447 fx += dx * 8;
448 count -= 8;
449 }
450 } // if count >= 8
451
452 uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
453 while (count-- > 0) {
454 *xx++ = SkToU16(fx >> 16);
455 fx += dx;
456 }
457 } else {
458 // SSE2 only support 16bit interger max & min, so only process the case
459 // maxX less than the max 16bit interger. Actually maxX is the bitmap's
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000460 // height, there should be rare bitmap whose height will be greater
tomhudson@google.com06a73132012-02-22 18:30:43 +0000461 // than max 16bit interger in the real world.
462 if ((count >= 8) && (maxX <= 0xFFFF)) {
463 while (((size_t)xy & 0x0F) != 0) {
mike@reedtribe.org602f2272012-03-14 02:04:40 +0000464 *xy++ = pack_two_shorts(SkClampMax((fx + dx) >> 16, maxX),
465 SkClampMax(fx >> 16, maxX));
tomhudson@google.com06a73132012-02-22 18:30:43 +0000466 fx += 2 * dx;
467 count -= 2;
468 }
469
470 __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
471 __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
472
473 __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
474 fx + dx, fx);
475 __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
476 __m128i wide_maxX = _mm_set1_epi32(maxX);
477
478 while (count >= 8) {
479 __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
480 __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
481
rmistry@google.comfbfcd562012-08-23 18:09:54 +0000482 wide_out_low = _mm_max_epi16(wide_out_low,
tomhudson@google.com06a73132012-02-22 18:30:43 +0000483 _mm_setzero_si128());
484 wide_out_low = _mm_min_epi16(wide_out_low, wide_maxX);
485 wide_out_high = _mm_max_epi16(wide_out_high,
486 _mm_setzero_si128());
487 wide_out_high = _mm_min_epi16(wide_out_high, wide_maxX);
488
489 __m128i wide_result = _mm_packs_epi32(wide_out_low,
490 wide_out_high);
491 _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
492
493 wide_low = _mm_add_epi32(wide_low, wide_dx8);
494 wide_high = _mm_add_epi32(wide_high, wide_dx8);
495
496 xy += 4;
497 fx += dx * 8;
498 count -= 8;
499 }
500 } // if count >= 8
501
502 uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
503 while (count-- > 0) {
504 *xx++ = SkClampMax(fx >> 16, maxX);
505 fx += dx;
506 }
507 }
508}