/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkSwizzler_opts_DEFINED
#define SkSwizzler_opts_DEFINED

#include "SkColorPriv.h"

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    #include <immintrin.h>
#elif defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>
#endif

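// Per-row swizzling procedures, with SSSE3 and NEON fast paths and portable
// fallbacks.  Each procedure converts count pixels from src and writes count
// 32-bit pixels to dst.  In the naming convention, uppercase channel names
// (RGBA) are unpremultiplied and lowercase ones (rgbA) are premultiplied,
// while a trailing "1" (RGB1) marks an alpha channel forced to opaque.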
namespace SK_OPTS_NS {

static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >> 8,
                r = src[i] >> 0;
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)b << 16
               | (uint32_t)g << 8
               | (uint32_t)r << 0;
    }
}
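
// As a worked example of the rounded scale above: for the pixel 0x80FF0000
// (A=0x80, B=0xFF, G=0, R=0), blue becomes (255*128 + 127)/255 = 128, so the
// premultiplied result is 0x80800000.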

static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >> 8,
                r = src[i] >> 0;
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g << 8
               | (uint32_t)b << 0;
    }
}

static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >> 8,
                r = src[i] >> 0;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g << 8
               | (uint32_t)b << 0;
    }
}
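
// A minimal usage sketch (with made-up pixel values) showing the calling
// convention these procedures share.  RGBA_to_rgbA_portable reads each pixel
// before writing it, so converting a buffer in place is fine:
//
//     uint32_t row[2] = { 0x80FF0000, 0xFF336699 };  // packed as A,B,G,R
//     RGBA_to_rgbA_portable(row, row, 2);            // premultiply in place
//     // row[0] == 0x80800000; row[1] is unchanged because its alpha is 0xFF.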

static void RGB_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)b << 16
               | (uint32_t)g << 8
               | (uint32_t)r << 0;
    }
}

static void RGB_to_BGR1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)r << 16
               | (uint32_t)g << 8
               | (uint32_t)b << 0;
    }
}

static void gray_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)src[i] << 16
               | (uint32_t)src[i] << 8
               | (uint32_t)src[i] << 0;
    }
}

static void grayA_to_RGBA_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g << 8
               | (uint32_t)g << 0;
    }
}

static void grayA_to_rgbA_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        g = (g*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g << 8
               | (uint32_t)g << 0;
    }
}

static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t k = src[i] >> 24,
                y = src[i] >> 16,
                m = src[i] >> 8,
                c = src[i] >> 0;
        // See comments in SkSwizzler.cpp for details on the conversion formula.
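        // Briefly: the stored values are already inverted (255 - C, etc.), so
        // the conversion reduces to scaling each inverted color by the
        // inverted key: r = (c*k+127)/255, and likewise for g and b.  A pixel
        // with c = m = y = k = 255 (no ink) therefore maps to white.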
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t) b << 16
               | (uint32_t) g << 8
               | (uint32_t) r << 0;
    }
}

static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t k = src[i] >> 24,
                y = src[i] >> 16,
                m = src[i] >> 8,
                c = src[i] >> 0;
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t) r << 16
               | (uint32_t) g << 8
               | (uint32_t) b << 0;
    }
}

#if defined(SK_ARM_HAS_NEON)

// Rounded divide by 255, (x + 127) / 255
static uint8x8_t div255_round(uint16x8_t x) {
    // result = (x + 127) / 255
    // result = (x + 127) / 256 + error1
    //
    // error1 = (x + 127) / (255 * 256)
    // error1 = (x + 127) / (256 * 256) + error2
    //
    // error2 = (x + 127) / (255 * 256 * 256)
    //
    // The maximum value of error2 is too small to matter.  Thus:
    // result = (x + 127) / 256 + (x + 127) / (256 * 256)
    // result = ((x + 127) / 256 + x + 127) / 256
    // result = ((x + 127) >> 8 + x + 127) >> 8
    //
    // Use >>> to represent "rounded right shift" which, conveniently,
    // NEON supports in one instruction.
    // result = ((x >>> 8) + x) >>> 8
    //
    // Note that the second right shift is actually performed as an
    // "add, round, and narrow back to 8-bits" instruction.
    return vraddhn_u16(x, vrshrq_n_u16(x, 8));
}
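
// A worked check of the derivation: at the maximum input x = 255*255 = 65025,
// the exact answer is (65025 + 127)/255 = 255; the two rounded shifts give
// (65025 >>> 8) = 254, then (65025 + 254) >>> 8 = 255, which matches.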

// Scale a byte by another, (x * y + 127) / 255
static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
    return div255_round(vmull_u8(x, y));
}

template <bool kSwapRB>
static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    while (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        uint8x8_t a = rgba.val[3],
                  b = rgba.val[2],
                  g = rgba.val[1],
                  r = rgba.val[0];

        // Premultiply.
        b = scale(b, a);
        g = scale(g, a);
        r = scale(r, a);

        // Store 8 premultiplied pixels.
        if (kSwapRB) {
            rgba.val[2] = r;
            rgba.val[1] = g;
            rgba.val[0] = b;
        } else {
            rgba.val[2] = b;
            rgba.val[1] = g;
            rgba.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<false>(dst, src, count);
}

static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<true>(dst, src, count);
}

static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);

        // Swap r and b.
        SkTSwap(rgba.val[0], rgba.val[2]);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        // Swap r and b.
        SkTSwap(rgba.val[0], rgba.val[2]);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

template <bool kSwapRB>
static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x3_t rgb = vld3q_u8(src);

        // Insert an opaque alpha channel and swap if needed.
        uint8x16x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdupq_n_u8(0xFF);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16*3;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x3_t rgb = vld3_u8(src);

        // Insert an opaque alpha channel and swap if needed.
        uint8x8x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdup_n_u8(0xFF);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8*3;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}

static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<false>(dst, src, count);
}

static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<true>(dst, src, count);
}

static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16_t gray = vld1q_u8(src);

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdupq_n_u8(0xFF);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8_t gray = vld1_u8(src);

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdup_n_u8(0xFF);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    gray_to_RGB1_portable(dst, src, count);
}

template <bool kPremul>
static void expand_grayA(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x2_t ga = vld2q_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            ga.val[0] = vcombine_u8(
                    scale(vget_low_u8(ga.val[0]),  vget_low_u8(ga.val[1])),
                    scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
        }

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16*2;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x2_t ga = vld2_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            ga.val[0] = scale(ga.val[0], ga.val[1]);
        }

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8*2;
        dst += 8;
        count -= 8;
    }

    auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
    proc(dst, src, count);
}

static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
    expand_grayA<false>(dst, src, count);
}

static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
    expand_grayA<true>(dst, src, count);
}

enum Format { kRGB1, kBGR1 };
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    while (count >= 8) {
        // Load 8 cmyk pixels.
        uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);

        uint8x8_t k = pixels.val[3],
                  y = pixels.val[2],
                  m = pixels.val[1],
                  c = pixels.val[0];

        // Scale to r, g, b.
        uint8x8_t b = scale(y, k);
        uint8x8_t g = scale(m, k);
        uint8x8_t r = scale(c, k);

        // Store 8 rgba pixels with opaque alpha.
        pixels.val[3] = vdup_n_u8(0xFF);
        if (kBGR1 == format) {
            pixels.val[2] = r;
            pixels.val[1] = g;
            pixels.val[0] = b;
        } else {
            pixels.val[2] = b;
            pixels.val[1] = g;
            pixels.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, pixels);
        src += 8;
        dst += 8;
        count -= 8;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kRGB1>(dst, src, count);
}

static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kBGR1>(dst, src, count);
}

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

// Scale a byte by another.
// Inputs are stored in 16-bit lanes, but are no larger than 8 bits.
static __m128i scale(__m128i x, __m128i y) {
    const __m128i _128 = _mm_set1_epi16(128);
    const __m128i _257 = _mm_set1_epi16(257);

    // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
    return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
}
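
// Spot-checking that identity: at x = 255*255 = 65025, ((65025+128)*257) >> 16
// = 16744321 >> 16 = 255 and (65025+127)/255 = 255; at x = 127 both forms give
// 0, and at x = 128 both give 1, so the rounding agrees at the cutoff.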

template <bool kSwapRB>
static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;

    auto premul8 = [](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kSwapRB) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);        // rrrrgggg bbbbaaaa
        *hi = _mm_shuffle_epi8(*hi, planar);        // RRRRGGGG BBBBAAAA
        __m128i rg = _mm_unpacklo_epi32(*lo, *hi),  // rrrrRRRR ggggGGGG
                ba = _mm_unpackhi_epi32(*lo, *hi);  // bbbbBBBB aaaaAAAA

        // Unpack to 16-bit planar.
        __m128i r = _mm_unpacklo_epi8(rg, zeros),   // r_r_r_r_ R_R_R_R_
                g = _mm_unpackhi_epi8(rg, zeros),   // g_g_g_g_ G_G_G_G_
                b = _mm_unpacklo_epi8(ba, zeros),   // b_b_b_b_ B_B_B_B_
                a = _mm_unpackhi_epi8(ba, zeros);   // a_a_a_a_ A_A_A_A_

        // Premultiply!
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interlaced pixels.
        rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG
        ba = _mm_or_si128(b, _mm_slli_epi16(a, 8)); // babababa BABABABA
        *lo = _mm_unpacklo_epi16(rg, ba);           // rgbargba rgbargba
        *hi = _mm_unpackhi_epi16(rg, ba);           // RGBARGBA RGBARGBA
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<false>(dst, src, count);
}

static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<true>(dst, src, count);
}

static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);

    while (count >= 4) {
        __m128i rgba = _mm_loadu_si128((const __m128i*) src);
        __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
        _mm_storeu_si128((__m128i*) dst, bgra);

        src += 4;
        dst += 4;
        count -= 4;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

template <bool kSwapRB>
static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;

    const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
    __m128i expand;
    const uint8_t X = 0xFF;  // Used as a placeholder.  The value of X is irrelevant.
    if (kSwapRB) {
        expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
    } else {
        expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
    }

    while (count >= 6) {
        // Load a vector.  While this actually contains 5 pixels plus an
        // extra component, we will discard all but the first four pixels on
        // this iteration.  Requiring count >= 6 (at least 18 source bytes)
        // guarantees that this 16-byte load never reads past the end of the
        // buffer.
        __m128i rgb = _mm_loadu_si128((const __m128i*) src);

        // Expand the first four pixels to RGBX and then mask to RGB(FF).
        __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);

        // Store 4 pixels.
        _mm_storeu_si128((__m128i*) dst, rgba);

        src += 4*3;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,6) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}

static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<false>(dst, src, count);
}

static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<true>(dst, src, count);
}

static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;

    const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
    while (count >= 16) {
        __m128i grays = _mm_loadu_si128((const __m128i*) src);
648
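        // Each pixel wants the byte pattern g,g,g,0xFF.  Build the 16-bit
        // pairs (g,g) and (g,0xFF), then interleave those pairs.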
        __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
        __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
        __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
        __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);

        __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
        __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
        __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
        __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);

        _mm_storeu_si128((__m128i*) (dst +  0), ggga0);
        _mm_storeu_si128((__m128i*) (dst +  4), ggga1);
        _mm_storeu_si128((__m128i*) (dst +  8), ggga2);
        _mm_storeu_si128((__m128i*) (dst + 12), ggga3);

        src += 16;
        dst += 16;
        count -= 16;
    }

    gray_to_RGB1_portable(dst, src, count);
}

static void grayA_to_RGBA(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 8) {
        __m128i ga = _mm_loadu_si128((const __m128i*) src);

        __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
                                  _mm_slli_epi16(ga, 8));

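        // Each 16-bit lane of ga holds the bytes (g,a), and gg duplicates the
        // gray byte into both halves, so interleaving 16-bit lanes of gg and
        // ga produces the byte pattern g,g,g,a for each pixel.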
        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    grayA_to_RGBA_portable(dst, src, count);
}

static void grayA_to_rgbA(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 8) {
        __m128i grayA = _mm_loadu_si128((const __m128i*) src);

        __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
        __m128i a0 = _mm_srli_epi16(grayA, 8);

        // Premultiply.
        g0 = scale(g0, a0);

        __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
        __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));

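        // As above: gg repeats the premultiplied gray and ga pairs it with
        // alpha, so the interleave below emits g,g,g,a per pixel.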
        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    grayA_to_rgbA_portable(dst, src, count);
}

enum Format { kRGB1, kBGR1 };
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;

    auto convert8 = [](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kBGR1 == format) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);        // ccccmmmm yyyykkkk
        *hi = _mm_shuffle_epi8(*hi, planar);        // CCCCMMMM YYYYKKKK
        __m128i cm = _mm_unpacklo_epi32(*lo, *hi),  // ccccCCCC mmmmMMMM
                yk = _mm_unpackhi_epi32(*lo, *hi);  // yyyyYYYY kkkkKKKK

        // Unpack to 16-bit planar.
        __m128i c = _mm_unpacklo_epi8(cm, zeros),   // c_c_c_c_ C_C_C_C_
                m = _mm_unpackhi_epi8(cm, zeros),   // m_m_m_m_ M_M_M_M_
                y = _mm_unpacklo_epi8(yk, zeros),   // y_y_y_y_ Y_Y_Y_Y_
                k = _mm_unpackhi_epi8(yk, zeros);   // k_k_k_k_ K_K_K_K_

        // Scale to r, g, b.
        __m128i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        // Repack into interlaced pixels, forcing alpha to opaque.
        __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),              // rgrgrgrg RGRGRGRG
                ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00)); // b1b1b1b1 B1B1B1B1
        *lo = _mm_unpacklo_epi16(rg, ba);                                // rgb1rgb1 rgb1rgb1
        *hi = _mm_unpackhi_epi16(rg, ba);                                // RGB1RGB1 RGB1RGB1
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kRGB1>(dst, src, count);
}

static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kBGR1>(dst, src, count);
}

#else

static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
    RGBA_to_rgbA_portable(dst, src, count);
}

static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
    RGBA_to_bgrA_portable(dst, src, count);
}

static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {
    RGBA_to_BGRA_portable(dst, src, count);
}

static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
    RGB_to_RGB1_portable(dst, src, count);
}

static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
    RGB_to_BGR1_portable(dst, src, count);
}

static void gray_to_RGB1(uint32_t dst[], const void* src, int count) {
    gray_to_RGB1_portable(dst, src, count);
}

static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
    grayA_to_RGBA_portable(dst, src, count);
}

static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
    grayA_to_rgbA_portable(dst, src, count);
}

static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    inverted_CMYK_to_RGB1_portable(dst, src, count);
}

static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    inverted_CMYK_to_BGR1_portable(dst, src, count);
}

#endif

}  // namespace SK_OPTS_NS

#endif // SkSwizzler_opts_DEFINED