blob: 467e5d1940624d6571c7561cafc26e6c8a776fa0 [file] [log] [blame]
msarett3a24f452016-01-13 14:31:59 -08001/*
2 * Copyright 2016 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8#ifndef SkSwizzler_opts_DEFINED
9#define SkSwizzler_opts_DEFINED
10
11#include "SkColorPriv.h"
12
13namespace SK_OPTS_NS {
14
// The variable names in the functions below pretend the input is BGRA,
// but the code works equally well with both RGBA and BGRA.
17
// Premultiplies each 32-bit pixel by its own alpha, keeping channel order.
//
// Alpha lives in the top byte and is copied through unchanged.  Each of the
// three lower bytes is replaced by the rounded product (c * a + 127) / 255.
// Each source pixel is read completely before its destination is written.
// Nothing is written when count <= 0.
static void premul_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) {
    for (int n = 0; n < count; ++n) {
        const uint32_t pixel = src[n];
        const uint32_t alpha = pixel >> 24;

        // Products are at most 255 * 255 + 127, so each quotient fits a byte.
        const uint32_t hi  = (((pixel >> 16) & 0xFF) * alpha + 127) / 255;
        const uint32_t mid = (((pixel >>  8) & 0xFF) * alpha + 127) / 255;
        const uint32_t lo  = (( pixel        & 0xFF) * alpha + 127) / 255;

        dst[n] = (alpha << 24) | (hi << 16) | (mid << 8) | lo;
    }
}
33
// Premultiplies each 32-bit pixel by its own alpha and exchanges the bytes
// at bit positions 16..23 and 0..7 (the red/blue swap).
//
// Alpha lives in the top byte and is copied through unchanged.  Each colour
// byte is scaled with the rounded product (c * a + 127) / 255 before being
// written to its new position.  Nothing is written when count <= 0.
static void premul_swaprb_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) {
    for (int n = 0; n < count; ++n) {
        const uint32_t pixel = src[n];
        const uint32_t alpha = pixel >> 24;

        // Products are at most 255 * 255 + 127, so each quotient fits a byte.
        const uint32_t hi  = (((pixel >> 16) & 0xFF) * alpha + 127) / 255;
        const uint32_t mid = (((pixel >>  8) & 0xFF) * alpha + 127) / 255;
        const uint32_t lo  = (( pixel        & 0xFF) * alpha + 127) / 255;

        // The former high colour byte lands in the low slot and vice versa.
        dst[n] = (alpha << 24) | (lo << 16) | (mid << 8) | hi;
    }
}
49
50#if defined(SK_ARM_HAS_NEON)
51
52// Rounded divide by 255, (x + 127) / 255
53static uint8x8_t div255_round(uint16x8_t x) {
54 // result = (x + 127) / 255
55 // result = (x + 127) / 256 + error1
56 //
57 // error1 = (x + 127) / (255 * 256)
58 // error1 = (x + 127) / (256 * 256) + error2
59 //
60 // error2 = (x + 127) / (255 * 256 * 256)
61 //
62 // The maximum value of error2 is too small to matter. Thus:
63 // result = (x + 127) / 256 + (x + 127) / (256 * 256)
64 // result = ((x + 127) / 256 + x + 127) / 256
65 // result = ((x + 127) >> 8 + x + 127) >> 8
66 //
67 // Use >>> to represent "rounded right shift" which, conveniently,
68 // NEON supports in one instruction.
69 // result = ((x >>> 8) + x) >>> 8
70 //
71 // Note that the second right shift is actually performed as an
72 // "add, round, and narrow back to 8-bits" instruction.
73 return vraddhn_u16(x, vrshrq_n_u16(x, 8));
74}
75
76// Scale a byte by another, (x * y + 127) / 255
77static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
78 return div255_round(vmull_u8(x, y));
79}
80
81template <bool kSwapRB>
82static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) {
83 while (count >= 8) {
84 // Load 8 pixels.
85 uint8x8x4_t bgra = vld4_u8((const uint8_t*) src);
86
87 uint8x8_t a = bgra.val[3],
88 r = bgra.val[2],
89 g = bgra.val[1],
90 b = bgra.val[0];
91
92 // Premultiply.
93 r = scale(r, a);
94 g = scale(g, a);
95 b = scale(b, a);
96
97 // Store 8 premultiplied pixels.
98 if (kSwapRB) {
99 bgra.val[2] = b;
100 bgra.val[1] = g;
101 bgra.val[0] = r;
102 } else {
103 bgra.val[2] = r;
104 bgra.val[1] = g;
105 bgra.val[0] = b;
106 }
107 vst4_u8((uint8_t*) dst, bgra);
108 src += 8;
109 dst += 8;
110 count -= 8;
111 }
112
113 // Call portable code to finish up the tail of [0,8) pixels.
114 auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable;
115 proc(dst, src, count);
116}
117
118static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {
119 premul_xxxa_should_swaprb<false>(dst, src, count);
120}
121
122static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
123 premul_xxxa_should_swaprb<true>(dst, src, count);
124}
125
126#else
127
128static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {
129 premul_xxxa_portable(dst, src, count);
130}
131
132static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
133 premul_swaprb_xxxa_portable(dst, src, count);
134}
135
136#endif
137
// Exchanges the bytes at bit positions 16..23 and 0..7 of every 32-bit pixel
// (the red/blue swap).  The bytes at positions 24..31 and 8..15 are copied
// through unchanged.  Nothing is written when count <= 0.
static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
    for (int n = 0; n < count; ++n) {
        const uint32_t pixel = src[n];

        const uint32_t kept    =  pixel & 0xFF00FF00u;         // bytes 3 and 1
        const uint32_t to_low  = (pixel & 0x00FF0000u) >> 16;  // byte 2 -> byte 0
        const uint32_t to_high = (pixel & 0x000000FFu) << 16;  // byte 0 -> byte 2

        dst[n] = kept | to_high | to_low;
    }
}
150
151}
152
153#endif // SkSwizzler_opts_DEFINED