/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkSwizzler_opts_DEFINED
#define SkSwizzler_opts_DEFINED

#include "SkColorData.h"

#include <utility>

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    #include <immintrin.h>
#elif defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>
#endif

namespace SK_OPTS_NS {

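// Note: the premultiplying swizzles below use (x*a + 127) / 255, which is x*a/255
// rounded to the nearest integer.  For example, a = 0x80 with x = 0xFF premultiplies
// to (255*128 + 127) / 255 = 128.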
static void RGBA_to_rgbA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >>  8,
                r = src[i] >>  0;
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)b << 16
               | (uint32_t)g <<  8
               | (uint32_t)r <<  0;
    }
}

static void RGBA_to_bgrA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >>  8,
                r = src[i] >>  0;
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}

static void RGBA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >>  8,
                r = src[i] >>  0;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}

static void RGB_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)b << 16
               | (uint32_t)g <<  8
               | (uint32_t)r <<  0;
    }
}

static void RGB_to_BGR1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}

static void gray_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        dst[i] = (uint32_t)0xFF   << 24
               | (uint32_t)src[i] << 16
               | (uint32_t)src[i] <<  8
               | (uint32_t)src[i] <<  0;
    }
}

static void grayA_to_RGBA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}

static void grayA_to_rgbA_portable(uint32_t dst[], const uint8_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        g = (g*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}

static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t k = src[i] >> 24,
                y = src[i] >> 16,
                m = src[i] >>  8,
                c = src[i] >>  0;
        // See comments in SkSwizzler.cpp for details on the conversion formula.
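        // Briefly: an inverted channel stores 255*(1 - value), so
        // R = 255*(1 - C)*(1 - K) works out to (c*k + 127)/255 with rounding,
        // and likewise for G and B.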
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t) b   << 16
               | (uint32_t) g   <<  8
               | (uint32_t) r   <<  0;
    }
}

static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const uint32_t* src, int count) {
    for (int i = 0; i < count; i++) {
        uint8_t k = src[i] >> 24,
                y = src[i] >> 16,
                m = src[i] >>  8,
                c = src[i] >>  0;
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t) r   << 16
               | (uint32_t) g   <<  8
               | (uint32_t) b   <<  0;
    }
}

#if defined(SK_ARM_HAS_NEON)

// Rounded divide by 255, (x + 127) / 255
static uint8x8_t div255_round(uint16x8_t x) {
    // result = (x + 127) / 255
    // result = (x + 127) / 256 + error1
    //
    // error1 = (x + 127) / (255 * 256)
    // error1 = (x + 127) / (256 * 256) + error2
    //
    // error2 = (x + 127) / (255 * 256 * 256)
    //
    // The maximum value of error2 is too small to matter.  Thus:
    // result = (x + 127) / 256 + (x + 127) / (256 * 256)
    // result = ((x + 127) / 256 + x + 127) / 256
    // result = ((x + 127) >> 8 + x + 127) >> 8
    //
    // Use >>> to represent "rounded right shift" which, conveniently,
    // NEON supports in one instruction.
    // result = ((x >>> 8) + x) >>> 8
    //
    // Note that the second right shift is actually performed as an
    // "add, round, and narrow back to 8-bits" instruction.
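    //
    // As a quick check at the largest input, x = 255*255 = 65025:
    //   exact:          (65025 + 127) / 255 = 255
    //   rounded shifts: 65025 >>> 8 = 254, then (65025 + 254) >>> 8 = 255.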
    return vraddhn_u16(x, vrshrq_n_u16(x, 8));
}

// Scale a byte by another, (x * y + 127) / 255
static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
    return div255_round(vmull_u8(x, y));
}

template <bool kSwapRB>
static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count) {
    while (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        uint8x8_t a = rgba.val[3],
                  b = rgba.val[2],
                  g = rgba.val[1],
                  r = rgba.val[0];

        // Premultiply.
        b = scale(b, a);
        g = scale(g, a);
        r = scale(r, a);

        // Store 8 premultiplied pixels.
        if (kSwapRB) {
            rgba.val[2] = r;
            rgba.val[1] = g;
            rgba.val[0] = b;
        } else {
            rgba.val[2] = b;
            rgba.val[1] = g;
            rgba.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB<false>(dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB<true>(dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    using std::swap;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);

        // Swap r and b.
        swap(rgba.val[0], rgba.val[2]);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        // Swap r and b.
        swap(rgba.val[0], rgba.val[2]);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

template <bool kSwapRB>
static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x3_t rgb = vld3q_u8(src);

        // Insert an opaque alpha channel and swap if needed.
        uint8x16x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdupq_n_u8(0xFF);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16*3;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x3_t rgb = vld3_u8(src);

        // Insert an opaque alpha channel and swap if needed.
        uint8x8x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdup_n_u8(0xFF);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8*3;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb<false>(dst, src, count);
}

/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb<true>(dst, src, count);
}

/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16_t gray = vld1q_u8(src);

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdupq_n_u8(0xFF);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8_t gray = vld1_u8(src);

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdup_n_u8(0xFF);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    gray_to_RGB1_portable(dst, src, count);
}

template <bool kPremul>
static void expand_grayA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x2_t ga = vld2q_u8(src);

        // Premultiply if requested.
        if (kPremul) {
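            // scale() operates on 8-lane uint8x8_t vectors, so premultiply the
            // low and high halves of these 16-lane vectors separately.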
            ga.val[0] = vcombine_u8(
                    scale(vget_low_u8(ga.val[0]),  vget_low_u8(ga.val[1])),
                    scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
        }

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16*2;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x2_t ga = vld2_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            ga.val[0] = scale(ga.val[0], ga.val[1]);
        }

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8*2;
        dst += 8;
        count -= 8;
    }

    auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    expand_grayA<false>(dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    expand_grayA<true>(dst, src, count);
}

enum Format { kRGB1, kBGR1 };
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
    while (count >= 8) {
        // Load 8 cmyk pixels.
        uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);

        uint8x8_t k = pixels.val[3],
                  y = pixels.val[2],
                  m = pixels.val[1],
                  c = pixels.val[0];

        // Scale to r, g, b.
        uint8x8_t b = scale(y, k);
        uint8x8_t g = scale(m, k);
        uint8x8_t r = scale(c, k);

        // Store 8 rgba pixels.
        if (kBGR1 == format) {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = r;
            pixels.val[1] = g;
            pixels.val[0] = b;
        } else {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = b;
            pixels.val[1] = g;
            pixels.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, pixels);
        src += 8;
        dst += 8;
        count -= 8;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to<kRGB1>(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to<kBGR1>(dst, src, count);
}

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

// Scale a byte by another.
// Inputs are stored in 16-bit lanes, but are not larger than 8 bits.
static __m128i scale(__m128i x, __m128i y) {
    const __m128i _128 = _mm_set1_epi16(128);
    const __m128i _257 = _mm_set1_epi16(257);

    // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
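    // For example, at the largest input x = 255*255 = 65025:
    //   ((65025+128)*257) >> 16 = 16744321 >> 16 = 255, matching (65025+127)/255 = 255.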
    return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
}

template <bool kSwapRB>
static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count) {

    auto premul8 = [](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kSwapRB) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);        // rrrrgggg bbbbaaaa
        *hi = _mm_shuffle_epi8(*hi, planar);        // RRRRGGGG BBBBAAAA
        __m128i rg = _mm_unpacklo_epi32(*lo, *hi),  // rrrrRRRR ggggGGGG
                ba = _mm_unpackhi_epi32(*lo, *hi);  // bbbbBBBB aaaaAAAA

        // Unpack to 16-bit planar.
        __m128i r = _mm_unpacklo_epi8(rg, zeros),   // r_r_r_r_ R_R_R_R_
                g = _mm_unpackhi_epi8(rg, zeros),   // g_g_g_g_ G_G_G_G_
                b = _mm_unpacklo_epi8(ba, zeros),   // b_b_b_b_ B_B_B_B_
                a = _mm_unpackhi_epi8(ba, zeros);   // a_a_a_a_ A_A_A_A_

        // Premultiply!
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interleaved pixels.
        rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)); // rgrgrgrg RGRGRGRG
        ba = _mm_or_si128(b, _mm_slli_epi16(a, 8)); // babababa BABABABA
        *lo = _mm_unpacklo_epi16(rg, ba);           // rgbargba rgbargba
        *hi = _mm_unpackhi_epi16(rg, ba);           // RGBARGBA RGBARGBA
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB<false>(dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    premul_should_swapRB<true>(dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);

    while (count >= 4) {
        __m128i rgba = _mm_loadu_si128((const __m128i*) src);
        __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
        _mm_storeu_si128((__m128i*) dst, bgra);

        src += 4;
        dst += 4;
        count -= 4;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

template <bool kSwapRB>
static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int count) {
    const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
    __m128i expand;
    const uint8_t X = 0xFF; // Used as a placeholder. The value of X is irrelevant.
    if (kSwapRB) {
        expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
    } else {
        expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
    }

    while (count >= 6) {
        // Load a vector.  While this actually contains 5 pixels plus an
        // extra component, we will discard all but the first four pixels on
        // this iteration.
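        // (With count >= 6 there are at least 18 bytes left in src, so this
        // unaligned 16-byte load cannot read past the end of the buffer.)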
        __m128i rgb = _mm_loadu_si128((const __m128i*) src);

        // Expand the first four pixels to RGBX and then mask to RGB(FF).
        __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);

        // Store 4 pixels.
        _mm_storeu_si128((__m128i*) dst, rgba);

        src += 4*3;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb<false>(dst, src, count);
}

/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    insert_alpha_should_swaprb<true>(dst, src, count);
}

/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
    while (count >= 16) {
        __m128i grays = _mm_loadu_si128((const __m128i*) src);

        __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
        __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
        __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
        __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);

        __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
        __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
        __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
        __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);

        _mm_storeu_si128((__m128i*) (dst +  0), ggga0);
        _mm_storeu_si128((__m128i*) (dst +  4), ggga1);
        _mm_storeu_si128((__m128i*) (dst +  8), ggga2);
        _mm_storeu_si128((__m128i*) (dst + 12), ggga3);

        src += 16;
        dst += 16;
        count -= 16;
    }

    gray_to_RGB1_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 8) {
        __m128i ga = _mm_loadu_si128((const __m128i*) src);

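        // Each 16-bit lane of ga holds (g | a<<8).  Mask off a and OR in g<<8 to
        // duplicate the gray byte into both halves of the lane: gg = (g | g<<8).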
        __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
                                  _mm_slli_epi16(ga, 8));

        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    grayA_to_RGBA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    while (count >= 8) {
        __m128i grayA = _mm_loadu_si128((const __m128i*) src);

        __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
        __m128i a0 = _mm_srli_epi16(grayA, 8);

        // Premultiply
        g0 = scale(g0, a0);

        __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
        __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));

        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    grayA_to_rgbA_portable(dst, src, count);
}

enum Format { kRGB1, kBGR1 };
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
    auto convert8 = [](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kBGR1 == format) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);        // ccccmmmm yyyykkkk
        *hi = _mm_shuffle_epi8(*hi, planar);        // CCCCMMMM YYYYKKKK
        __m128i cm = _mm_unpacklo_epi32(*lo, *hi),  // ccccCCCC mmmmMMMM
                yk = _mm_unpackhi_epi32(*lo, *hi);  // yyyyYYYY kkkkKKKK

        // Unpack to 16-bit planar.
        __m128i c = _mm_unpacklo_epi8(cm, zeros),   // c_c_c_c_ C_C_C_C_
                m = _mm_unpackhi_epi8(cm, zeros),   // m_m_m_m_ M_M_M_M_
                y = _mm_unpacklo_epi8(yk, zeros),   // y_y_y_y_ Y_Y_Y_Y_
                k = _mm_unpackhi_epi8(yk, zeros);   // k_k_k_k_ K_K_K_K_

        // Scale to r, g, b.
        __m128i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        // Repack into interleaved pixels.
        __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),              // rgrgrgrg RGRGRGRG
                ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00)); // b1b1b1b1 B1B1B1B1
        *lo = _mm_unpacklo_epi16(rg, ba);                                // rgbargba rgbargba
        *hi = _mm_unpackhi_epi16(rg, ba);                                // RGB1RGB1 RGB1RGB1
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to<kRGB1>(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_cmyk_to<kBGR1>(dst, src, count);
}

#else

/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_rgbA_portable(dst, src, count);
}

/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_bgrA_portable(dst, src, count);
}

/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    RGBA_to_BGRA_portable(dst, src, count);
}

/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    RGB_to_RGB1_portable(dst, src, count);
}

/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    RGB_to_BGR1_portable(dst, src, count);
}

/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    gray_to_RGB1_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    grayA_to_RGBA_portable(dst, src, count);
}

/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    grayA_to_rgbA_portable(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_CMYK_to_RGB1_portable(dst, src, count);
}

/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    inverted_CMYK_to_BGR1_portable(dst, src, count);
}

#endif

}

#endif // SkSwizzler_opts_DEFINED