/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
7
8#ifndef SkSwizzler_opts_DEFINED
9#define SkSwizzler_opts_DEFINED
10
11#include "SkColorPriv.h"
12
13namespace SK_OPTS_NS {
14
// Portable premultiply, channel order preserved.
//
// Reads `count` 32-bit pixels from vsrc (r in bits 0-7, g in 8-15, b in 16-23,
// a in 24-31), scales r, g and b by a using a rounded divide by 255, and
// writes the result to dst with the same channel positions.
static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >>  8,
                r = src[i] >>  0;
        // The uint8_t operands promote to int, so the products cannot overflow.
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)b << 16
               | (uint32_t)g <<  8
               | (uint32_t)r <<  0;
    }
}
31
// Portable premultiply with red/blue swap.
//
// Reads `count` 32-bit pixels from vsrc (r in bits 0-7, g in 8-15, b in 16-23,
// a in 24-31), scales r, g and b by a using a rounded divide by 255, and
// writes them to dst with r and b exchanged (b in bits 0-7, r in 16-23).
static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >>  8,
                r = src[i] >>  0;
        // The uint8_t operands promote to int, so the products cannot overflow.
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}
48
// Portable red/blue swap without premultiplication.
//
// For each of `count` 32-bit pixels, exchanges the byte in bits 0-7 with the
// byte in bits 16-23; the bytes in bits 8-15 and 24-31 are left in place.
static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >>  8,
                r = src[i] >>  0;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}
62
// Portable expansion of packed 3-byte pixels to 32-bit pixels with alpha 0xFF.
//
// vsrc holds `count` pixels of three bytes each (r, g, b in memory order).
// Each output word has r in bits 0-7, g in 8-15, b in 16-23 and 0xFF in 24-31.
static void RGB_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)b    << 16
               | (uint32_t)g    <<  8
               | (uint32_t)r    <<  0;
    }
}
76
// Portable expansion of packed 3-byte pixels to 32-bit pixels with alpha 0xFF,
// exchanging the first and third source bytes.
//
// vsrc holds `count` pixels of three bytes each (r, g, b in memory order).
// Each output word has b in bits 0-7, g in 8-15, r in 16-23 and 0xFF in 24-31.
static void RGB_to_BGR1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)r    << 16
               | (uint32_t)g    <<  8
               | (uint32_t)b    <<  0;
    }
}
90
// Portable expansion of 8-bit gray to 32-bit pixels with alpha 0xFF.
//
// Each of the `count` source bytes is replicated into bits 0-7, 8-15 and
// 16-23 of the output word; bits 24-31 are set to 0xFF.
static void gray_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        dst[i] = (uint32_t)0xFF   << 24
               | (uint32_t)src[i] << 16
               | (uint32_t)src[i] <<  8
               | (uint32_t)src[i] <<  0;
    }
}
100
// Portable expansion of 2-byte gray+alpha pixels to 32-bit pixels, without
// premultiplication.
//
// vsrc holds `count` pixels of two bytes each (gray, then alpha). The gray
// byte is replicated into bits 0-7, 8-15 and 16-23; alpha goes to bits 24-31.
static void grayA_to_RGBA_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}
113
// Portable expansion of 2-byte gray+alpha pixels to premultiplied 32-bit pixels.
//
// vsrc holds `count` pixels of two bytes each (gray, then alpha). Gray is
// scaled by alpha with a rounded divide by 255, then replicated into bits
// 0-7, 8-15 and 16-23; alpha goes to bits 24-31.
static void grayA_to_rgbA_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        // The uint8_t operands promote to int, so the product cannot overflow.
        g = (g*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}
127
msarett3a24f452016-01-13 14:31:59 -0800128#if defined(SK_ARM_HAS_NEON)
129
130// Rounded divide by 255, (x + 127) / 255
131static uint8x8_t div255_round(uint16x8_t x) {
132 // result = (x + 127) / 255
133 // result = (x + 127) / 256 + error1
134 //
135 // error1 = (x + 127) / (255 * 256)
136 // error1 = (x + 127) / (256 * 256) + error2
137 //
138 // error2 = (x + 127) / (255 * 256 * 256)
139 //
140 // The maximum value of error2 is too small to matter. Thus:
141 // result = (x + 127) / 256 + (x + 127) / (256 * 256)
142 // result = ((x + 127) / 256 + x + 127) / 256
143 // result = ((x + 127) >> 8 + x + 127) >> 8
144 //
145 // Use >>> to represent "rounded right shift" which, conveniently,
146 // NEON supports in one instruction.
147 // result = ((x >>> 8) + x) >>> 8
148 //
149 // Note that the second right shift is actually performed as an
150 // "add, round, and narrow back to 8-bits" instruction.
151 return vraddhn_u16(x, vrshrq_n_u16(x, 8));
152}
153
154// Scale a byte by another, (x * y + 127) / 255
155static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
156 return div255_round(vmull_u8(x, y));
157}
158
159template <bool kSwapRB>
mtklein8bf7b792016-01-22 07:42:53 -0800160static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
161 auto src = (const uint32_t*)vsrc;
msarett3a24f452016-01-13 14:31:59 -0800162 while (count >= 8) {
163 // Load 8 pixels.
msarettf1b8b6a2016-01-22 09:54:21 -0800164 uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
msarett3a24f452016-01-13 14:31:59 -0800165
msarettf1b8b6a2016-01-22 09:54:21 -0800166 uint8x8_t a = rgba.val[3],
167 b = rgba.val[2],
168 g = rgba.val[1],
169 r = rgba.val[0];
msarett3a24f452016-01-13 14:31:59 -0800170
171 // Premultiply.
msarett3a24f452016-01-13 14:31:59 -0800172 b = scale(b, a);
mtklein8bf7b792016-01-22 07:42:53 -0800173 g = scale(g, a);
174 r = scale(r, a);
msarett3a24f452016-01-13 14:31:59 -0800175
176 // Store 8 premultiplied pixels.
177 if (kSwapRB) {
msarettf1b8b6a2016-01-22 09:54:21 -0800178 rgba.val[2] = r;
179 rgba.val[1] = g;
180 rgba.val[0] = b;
mtklein8bf7b792016-01-22 07:42:53 -0800181 } else {
msarettf1b8b6a2016-01-22 09:54:21 -0800182 rgba.val[2] = b;
183 rgba.val[1] = g;
184 rgba.val[0] = r;
msarett3a24f452016-01-13 14:31:59 -0800185 }
msarettf1b8b6a2016-01-22 09:54:21 -0800186 vst4_u8((uint8_t*) dst, rgba);
msarett3a24f452016-01-13 14:31:59 -0800187 src += 8;
188 dst += 8;
189 count -= 8;
190 }
191
192 // Call portable code to finish up the tail of [0,8) pixels.
mtklein8bf7b792016-01-22 07:42:53 -0800193 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
msarett3a24f452016-01-13 14:31:59 -0800194 proc(dst, src, count);
195}
196
mtklein8bf7b792016-01-22 07:42:53 -0800197static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
198 premul_should_swapRB<false>(dst, src, count);
msarett3a24f452016-01-13 14:31:59 -0800199}
200
mtklein8bf7b792016-01-22 07:42:53 -0800201static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
202 premul_should_swapRB<true>(dst, src, count);
msarett3a24f452016-01-13 14:31:59 -0800203}
204
mtklein8bf7b792016-01-22 07:42:53 -0800205static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
206 auto src = (const uint32_t*)vsrc;
msarett03108de2016-01-15 11:02:36 -0800207 while (count >= 16) {
208 // Load 16 pixels.
msarettf1b8b6a2016-01-22 09:54:21 -0800209 uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);
msarett03108de2016-01-15 11:02:36 -0800210
211 // Swap r and b.
msarettf1b8b6a2016-01-22 09:54:21 -0800212 SkTSwap(rgba.val[0], rgba.val[2]);
msarett03108de2016-01-15 11:02:36 -0800213
214 // Store 16 pixels.
msarettf1b8b6a2016-01-22 09:54:21 -0800215 vst4q_u8((uint8_t*) dst, rgba);
msarett03108de2016-01-15 11:02:36 -0800216 src += 16;
217 dst += 16;
218 count -= 16;
219 }
220
221 if (count >= 8) {
222 // Load 8 pixels.
msarettf1b8b6a2016-01-22 09:54:21 -0800223 uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
msarett03108de2016-01-15 11:02:36 -0800224
225 // Swap r and b.
msarettf1b8b6a2016-01-22 09:54:21 -0800226 SkTSwap(rgba.val[0], rgba.val[2]);
msarett03108de2016-01-15 11:02:36 -0800227
228 // Store 8 pixels.
msarettf1b8b6a2016-01-22 09:54:21 -0800229 vst4_u8((uint8_t*) dst, rgba);
msarett03108de2016-01-15 11:02:36 -0800230 src += 8;
231 dst += 8;
232 count -= 8;
233 }
234
mtklein8bf7b792016-01-22 07:42:53 -0800235 RGBA_to_BGRA_portable(dst, src, count);
msarett03108de2016-01-15 11:02:36 -0800236}
237
msarettf1b8b6a2016-01-22 09:54:21 -0800238template <bool kSwapRB>
239static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
240 const uint8_t* src = (const uint8_t*) vsrc;
241 while (count >= 16) {
242 // Load 16 pixels.
243 uint8x16x3_t rgb = vld3q_u8(src);
244
245 // Insert an opaque alpha channel and swap if needed.
246 uint8x16x4_t rgba;
247 if (kSwapRB) {
248 rgba.val[0] = rgb.val[2];
249 rgba.val[2] = rgb.val[0];
250 } else {
251 rgba.val[0] = rgb.val[0];
252 rgba.val[2] = rgb.val[2];
253 }
254 rgba.val[1] = rgb.val[1];
255 rgba.val[3] = vdupq_n_u8(0xFF);
256
257 // Store 16 pixels.
258 vst4q_u8((uint8_t*) dst, rgba);
259 src += 16*3;
260 dst += 16;
261 count -= 16;
262 }
263
264 if (count >= 8) {
265 // Load 8 pixels.
266 uint8x8x3_t rgb = vld3_u8(src);
267
268 // Insert an opaque alpha channel and swap if needed.
269 uint8x8x4_t rgba;
270 if (kSwapRB) {
271 rgba.val[0] = rgb.val[2];
272 rgba.val[2] = rgb.val[0];
273 } else {
274 rgba.val[0] = rgb.val[0];
275 rgba.val[2] = rgb.val[2];
276 }
277 rgba.val[1] = rgb.val[1];
278 rgba.val[3] = vdup_n_u8(0xFF);
279
280 // Store 8 pixels.
281 vst4_u8((uint8_t*) dst, rgba);
282 src += 8*3;
283 dst += 8;
284 count -= 8;
285 }
286
287 // Call portable code to finish up the tail of [0,8) pixels.
288 auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
289 proc(dst, src, count);
290}
291
292static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
293 insert_alpha_should_swaprb<false>(dst, src, count);
294}
295
296static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
297 insert_alpha_should_swaprb<true>(dst, src, count);
298}
299
msarett2eff71c2016-02-02 12:59:45 -0800300static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
301 const uint8_t* src = (const uint8_t*) vsrc;
302 while (count >= 16) {
303 // Load 16 pixels.
304 uint8x16_t gray = vld1q_u8(src);
305
306 // Set each of the color channels.
307 uint8x16x4_t rgba;
308 rgba.val[0] = gray;
309 rgba.val[1] = gray;
310 rgba.val[2] = gray;
311 rgba.val[3] = vdupq_n_u8(0xFF);
312
313 // Store 16 pixels.
314 vst4q_u8((uint8_t*) dst, rgba);
315 src += 16;
316 dst += 16;
317 count -= 16;
318 }
319
320 if (count >= 8) {
321 // Load 8 pixels.
322 uint8x8_t gray = vld1_u8(src);
323
324 // Set each of the color channels.
325 uint8x8x4_t rgba;
326 rgba.val[0] = gray;
327 rgba.val[1] = gray;
328 rgba.val[2] = gray;
329 rgba.val[3] = vdup_n_u8(0xFF);
330
331 // Store 8 pixels.
332 vst4_u8((uint8_t*) dst, rgba);
333 src += 8;
334 dst += 8;
335 count -= 8;
336 }
337
338 gray_to_RGB1_portable(dst, src, count);
339}
340
msarett1e060792016-02-03 11:17:43 -0800341template <bool kPremul>
342static void expand_grayA(uint32_t dst[], const void* vsrc, int count) {
343 const uint8_t* src = (const uint8_t*) vsrc;
344 while (count >= 16) {
345 // Load 16 pixels.
346 uint8x16x2_t ga = vld2q_u8(src);
347
348 // Premultiply if requested.
349 if (kPremul) {
350 ga.val[0] = vcombine_u8(
351 scale(vget_low_u8(ga.val[0]), vget_low_u8(ga.val[1])),
352 scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
353 }
354
355 // Set each of the color channels.
356 uint8x16x4_t rgba;
357 rgba.val[0] = ga.val[0];
358 rgba.val[1] = ga.val[0];
359 rgba.val[2] = ga.val[0];
360 rgba.val[3] = ga.val[1];
361
362 // Store 16 pixels.
363 vst4q_u8((uint8_t*) dst, rgba);
364 src += 16*2;
365 dst += 16;
366 count -= 16;
367 }
368
369 if (count >= 8) {
370 // Load 8 pixels.
371 uint8x8x2_t ga = vld2_u8(src);
372
373 // Premultiply if requested.
374 if (kPremul) {
375 ga.val[0] = scale(ga.val[0], ga.val[1]);
376 }
377
378 // Set each of the color channels.
379 uint8x8x4_t rgba;
380 rgba.val[0] = ga.val[0];
381 rgba.val[1] = ga.val[0];
382 rgba.val[2] = ga.val[0];
383 rgba.val[3] = ga.val[1];
384
385 // Store 8 pixels.
386 vst4_u8((uint8_t*) dst, rgba);
387 src += 8*2;
388 dst += 8;
389 count -= 8;
390 }
391
392 auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
393 proc(dst, src, count);
394}
395
396static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
397 expand_grayA<false>(dst, src, count);
398}
399
400static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
401 expand_grayA<true>(dst, src, count);
402}
403
msarett53b9d292016-01-19 13:17:58 -0800404#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
405
msarett09574242016-02-03 15:28:35 -0800406// Scale a byte by another.
407// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
408static __m128i scale(__m128i x, __m128i y) {
409 const __m128i _128 = _mm_set1_epi16(128);
410 const __m128i _257 = _mm_set1_epi16(257);
411
412 // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
413 return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
414}
415
msarett53b9d292016-01-19 13:17:58 -0800416template <bool kSwapRB>
mtklein8bf7b792016-01-22 07:42:53 -0800417static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
418 auto src = (const uint32_t*)vsrc;
msarett53b9d292016-01-19 13:17:58 -0800419
420 auto premul8 = [](__m128i* lo, __m128i* hi) {
421 const __m128i zeros = _mm_setzero_si128();
msarett53b9d292016-01-19 13:17:58 -0800422 __m128i planar;
423 if (kSwapRB) {
424 planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
425 } else {
426 planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
427 }
428
429 // Swizzle the pixels to 8-bit planar.
mtklein8bf7b792016-01-22 07:42:53 -0800430 *lo = _mm_shuffle_epi8(*lo, planar); // rrrrgggg bbbbaaaa
431 *hi = _mm_shuffle_epi8(*hi, planar); // RRRRGGGG BBBBAAAA
432 __m128i rg = _mm_unpacklo_epi32(*lo, *hi), // rrrrRRRR ggggGGGG
433 ba = _mm_unpackhi_epi32(*lo, *hi); // bbbbBBBB aaaaAAAA
msarett53b9d292016-01-19 13:17:58 -0800434
435 // Unpack to 16-bit planar.
mtklein8bf7b792016-01-22 07:42:53 -0800436 __m128i r = _mm_unpacklo_epi8(rg, zeros), // r_r_r_r_ R_R_R_R_
437 g = _mm_unpackhi_epi8(rg, zeros), // g_g_g_g_ G_G_G_G_
438 b = _mm_unpacklo_epi8(ba, zeros), // b_b_b_b_ B_B_B_B_
439 a = _mm_unpackhi_epi8(ba, zeros); // a_a_a_a_ A_A_A_A_
msarett53b9d292016-01-19 13:17:58 -0800440
msarett09574242016-02-03 15:28:35 -0800441 // Premultiply!
442 r = scale(r, a);
443 g = scale(g, a);
444 b = scale(b, a);
msarett53b9d292016-01-19 13:17:58 -0800445
446 // Repack into interlaced pixels.
mtklein8bf7b792016-01-22 07:42:53 -0800447 rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG
448 ba = _mm_or_si128(b, _mm_slli_epi16(a, 8)); // babababa BABABABA
449 *lo = _mm_unpacklo_epi16(rg, ba); // rgbargba rgbargba
450 *hi = _mm_unpackhi_epi16(rg, ba); // RGBARGBA RGBARGBA
msarett53b9d292016-01-19 13:17:58 -0800451 };
452
453 while (count >= 8) {
454 __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
455 hi = _mm_loadu_si128((const __m128i*) (src + 4));
456
457 premul8(&lo, &hi);
458
459 _mm_storeu_si128((__m128i*) (dst + 0), lo);
460 _mm_storeu_si128((__m128i*) (dst + 4), hi);
461
462 src += 8;
463 dst += 8;
464 count -= 8;
465 }
466
467 if (count >= 4) {
468 __m128i lo = _mm_loadu_si128((const __m128i*) src),
469 hi = _mm_setzero_si128();
470
471 premul8(&lo, &hi);
472
473 _mm_storeu_si128((__m128i*) dst, lo);
474
475 src += 4;
476 dst += 4;
477 count -= 4;
478 }
479
480 // Call portable code to finish up the tail of [0,4) pixels.
mtklein8bf7b792016-01-22 07:42:53 -0800481 auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
msarett53b9d292016-01-19 13:17:58 -0800482 proc(dst, src, count);
483}
484
mtklein8bf7b792016-01-22 07:42:53 -0800485static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
486 premul_should_swapRB<false>(dst, src, count);
msarett53b9d292016-01-19 13:17:58 -0800487}
488
mtklein8bf7b792016-01-22 07:42:53 -0800489static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
490 premul_should_swapRB<true>(dst, src, count);
msarett53b9d292016-01-19 13:17:58 -0800491}
492
mtklein8bf7b792016-01-22 07:42:53 -0800493static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
494 auto src = (const uint32_t*)vsrc;
msarett53b9d292016-01-19 13:17:58 -0800495 const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
496
497 while (count >= 4) {
mtklein8bf7b792016-01-22 07:42:53 -0800498 __m128i rgba = _mm_loadu_si128((const __m128i*) src);
499 __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
500 _mm_storeu_si128((__m128i*) dst, bgra);
msarett53b9d292016-01-19 13:17:58 -0800501
502 src += 4;
503 dst += 4;
504 count -= 4;
505 }
506
mtklein8bf7b792016-01-22 07:42:53 -0800507 RGBA_to_BGRA_portable(dst, src, count);
msarett53b9d292016-01-19 13:17:58 -0800508}
509
msarett13aa1a52016-01-22 14:12:38 -0800510template <bool kSwapRB>
511static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
512 const uint8_t* src = (const uint8_t*) vsrc;
513
514 const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
515 __m128i expand;
516 const uint8_t X = 0xFF; // Used a placeholder. The value of X is irrelevant.
517 if (kSwapRB) {
518 expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
519 } else {
520 expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
521 }
522
523 while (count >= 6) {
524 // Load a vector. While this actually contains 5 pixels plus an
525 // extra component, we will discard all but the first four pixels on
526 // this iteration.
527 __m128i rgb = _mm_loadu_si128((const __m128i*) src);
528
529 // Expand the first four pixels to RGBX and then mask to RGB(FF).
530 __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);
531
532 // Store 4 pixels.
533 _mm_storeu_si128((__m128i*) dst, rgba);
534
535 src += 4*3;
536 dst += 4;
537 count -= 4;
538 }
539
540 // Call portable code to finish up the tail of [0,4) pixels.
541 auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
542 proc(dst, src, count);
543}
544
msarettf1b8b6a2016-01-22 09:54:21 -0800545static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
msarett13aa1a52016-01-22 14:12:38 -0800546 insert_alpha_should_swaprb<false>(dst, src, count);
msarettf1b8b6a2016-01-22 09:54:21 -0800547}
548
549static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
msarett13aa1a52016-01-22 14:12:38 -0800550 insert_alpha_should_swaprb<true>(dst, src, count);
msarettf1b8b6a2016-01-22 09:54:21 -0800551}
552
msarett07006512016-02-02 13:41:03 -0800553static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
554 const uint8_t* src = (const uint8_t*) vsrc;
555
556 const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
557 while (count >= 16) {
558 __m128i grays = _mm_loadu_si128((const __m128i*) src);
559
560 __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
561 __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
562 __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
563 __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);
564
565 __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
566 __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
567 __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
568 __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);
569
570 _mm_storeu_si128((__m128i*) (dst + 0), ggga0);
571 _mm_storeu_si128((__m128i*) (dst + 4), ggga1);
572 _mm_storeu_si128((__m128i*) (dst + 8), ggga2);
573 _mm_storeu_si128((__m128i*) (dst + 12), ggga3);
574
575 src += 16;
576 dst += 16;
577 count -= 16;
578 }
579
msarett2eff71c2016-02-02 12:59:45 -0800580 gray_to_RGB1_portable(dst, src, count);
581}
582
msarett09574242016-02-03 15:28:35 -0800583static void grayA_to_RGBA(uint32_t dst[], const void* vsrc, int count) {
584 const uint8_t* src = (const uint8_t*) vsrc;
585 while (count >= 8) {
586 __m128i ga = _mm_loadu_si128((const __m128i*) src);
587
588 __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
589 _mm_slli_epi16(ga, 8));
590
591 __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
592 __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
593
594 _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
595 _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);
596
597 src += 8*2;
598 dst += 8;
599 count -= 8;
600 }
601
msarett1e060792016-02-03 11:17:43 -0800602 grayA_to_RGBA_portable(dst, src, count);
603}
604
msarett09574242016-02-03 15:28:35 -0800605static void grayA_to_rgbA(uint32_t dst[], const void* vsrc, int count) {
606 const uint8_t* src = (const uint8_t*) vsrc;
607 while (count >= 8) {
608 __m128i grayA = _mm_loadu_si128((const __m128i*) src);
609
610 __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
611 __m128i a0 = _mm_srli_epi16(grayA, 8);
612
613 // Premultiply
614 g0 = scale(g0, a0);
615
616 __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
617 __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));
618
619
620 __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
621 __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
622
623 _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
624 _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);
625
626 src += 8*2;
627 dst += 8;
628 count -= 8;
629 }
630
msarett1e060792016-02-03 11:17:43 -0800631 grayA_to_rgbA_portable(dst, src, count);
632}
633
msarett3a24f452016-01-13 14:31:59 -0800634#else
635
mtklein8bf7b792016-01-22 07:42:53 -0800636static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
637 RGBA_to_rgbA_portable(dst, src, count);
msarett3a24f452016-01-13 14:31:59 -0800638}
639
mtklein8bf7b792016-01-22 07:42:53 -0800640static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
641 RGBA_to_bgrA_portable(dst, src, count);
msarett3a24f452016-01-13 14:31:59 -0800642}
643
mtklein8bf7b792016-01-22 07:42:53 -0800644static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {
645 RGBA_to_BGRA_portable(dst, src, count);
msarett3a24f452016-01-13 14:31:59 -0800646}
647
msarettf1b8b6a2016-01-22 09:54:21 -0800648static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
649 RGB_to_RGB1_portable(dst, src, count);
650}
651
652static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
653 RGB_to_BGR1_portable(dst, src, count);
654}
655
msarett2eff71c2016-02-02 12:59:45 -0800656static void gray_to_RGB1(uint32_t dst[], const void* src, int count) {
657 gray_to_RGB1_portable(dst, src, count);
658}
659
msarett1e060792016-02-03 11:17:43 -0800660static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
661 grayA_to_RGBA_portable(dst, src, count);
662}
663
664static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
665 grayA_to_rgbA_portable(dst, src, count);
666}
667
msarett03108de2016-01-15 11:02:36 -0800668#endif
669
msarett3a24f452016-01-13 14:31:59 -0800670}
671
672#endif // SkSwizzler_opts_DEFINED