Blame - src/opts/SkBlitRow_opts_SSE2.cpp - platform/external/skia

2009-11-04 20:51:06 +0000

[diff] [blame]

1

/*

tomhudson@google.com

98a5b42

2012-02-28 16:15:26 +0000

[diff] [blame]

2

epoger@google.com

ec3ed6a

2011-07-28 14:26:00 +0000

[diff] [blame]

3

*

4

* Use of this source code is governed by a BSD-style license that can be

5

* found in the LICENSE file.

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

6

*/

7

commit-bot@chromium.org

8c4953c

2014-04-30 14:58:46 +0000

[diff] [blame]

8

#include <emmintrin.h>

caryclark@google.com

83ecdc3

2012-06-06 12:10:26 +0000

[diff] [blame]

9

#include "SkBitmapProcState_opts_SSE2.h"

commit-bot@chromium.org

8c4953c

2014-04-30 14:58:46 +0000

[diff] [blame]

10

#include "SkBlitRow_opts_SSE2.h"

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

11

#include "SkColorPriv.h"

commit-bot@chromium.org

4759107

2014-02-19 03:09:52 +0000

[diff] [blame]

12

#include "SkColor_opts_SSE2.h"

commit-bot@chromium.org

2758047

2014-03-07 03:25:32 +0000

[diff] [blame]

13

#include "SkDither.h"

mtklein

1059b1f

2016-02-03 07:25:02 -0800

[diff] [blame]

14

#include "SkMSAN.h"

senorblanco@chromium.org

c385638

2010-12-13 15:27:20 +0000

[diff] [blame]

15

#include "SkUtils.h"

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

16

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

17

/* SSE2 version of S32_Blend_BlitRow32()

18

* portable version is in core/SkBlitRow_D32.cpp

19

*/

senorblanco@chromium.org

4e75355

2009-11-16 21:09:00 +0000

[diff] [blame]

20

void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,

21

const SkPMColor* SK_RESTRICT src,

22

int count, U8CPU alpha) {

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

23

SkASSERT(alpha <= 255);

if (count <= 0) {

return;

}

uint32_t src_scale = SkAlpha255To256(alpha);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

29

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

30

if (count >= 4) {

31

SkASSERT(((size_t)dst & 0x03) == 0);

32

while (((size_t)dst & 0x0F) != 0) {

lsalzman

40254c2

2016-08-05 11:48:45 -0700

[diff] [blame]

33

*dst = SkPMLerp(*src, *dst, src_scale);

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

src++;

dst++;

count--;

}

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

38

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

39

const __m128i *s = reinterpret_cast<const __m128i*>(src);

40

__m128i *d = reinterpret_cast<__m128i*>(dst);

tomhudson@google.com

98a5b42

2012-02-28 16:15:26 +0000

[diff] [blame]

41

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

42

while (count >= 4) {

43

// Load 4 pixels each of src and dest.

44

__m128i src_pixel = _mm_loadu_si128(s);

45

__m128i dst_pixel = _mm_load_si128(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

46

lsalzman

40254c2

2016-08-05 11:48:45 -0700

[diff] [blame]

47

__m128i result = SkPMLerp_SSE2(src_pixel, dst_pixel, src_scale);

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

48

_mm_store_si128(d, result);

s++;

d++;

count -= 4;

}

src = reinterpret_cast<const SkPMColor*>(s);

54

dst = reinterpret_cast<SkPMColor*>(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

55

}

56

senorblanco@chromium.org

4e75355

2009-11-16 21:09:00 +0000

[diff] [blame]

57

while (count > 0) {

lsalzman

40254c2

2016-08-05 11:48:45 -0700

[diff] [blame]

58

*dst = SkPMLerp(*src, *dst, src_scale);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

src++;

dst++;

count--;

}

}

senorblanco@chromium.org

4e75355

2009-11-16 21:09:00 +0000

[diff] [blame]

65

void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,

66

const SkPMColor* SK_RESTRICT src,

67

int count, U8CPU alpha) {

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

68

SkASSERT(alpha <= 255);

if (count <= 0) {

return;

}

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

73

if (count >= 4) {

74

while (((size_t)dst & 0x0F) != 0) {

75

*dst = SkBlendARGB32(*src, *dst, alpha);

src++;

dst++;

count--;

}

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

80

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

81

const __m128i *s = reinterpret_cast<const __m128i*>(src);

82

__m128i *d = reinterpret_cast<__m128i*>(dst);

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

83

while (count >= 4) {

84

// Load 4 pixels each of src and dest.

85

__m128i src_pixel = _mm_loadu_si128(s);

86

__m128i dst_pixel = _mm_load_si128(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

87

qiankun.miao

2253aa9

2014-11-25 06:35:02 -0800

[diff] [blame]

88

__m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

89

_mm_store_si128(d, result);

s++;

d++;

count -= 4;

}

src = reinterpret_cast<const SkPMColor*>(s);

95

dst = reinterpret_cast<SkPMColor*>(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

96

}

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

97

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

98

while (count > 0) {

99

*dst = SkBlendARGB32(*src, *dst, alpha);

src++;

dst++;

count--;

}

}

senorblanco@chromium.org

c385638

2010-12-13 15:27:20 +0000

[diff] [blame]

105

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

106

// Shift amounts that move the top 5 bits of each component of a packed
// 16-bit mask so that they start at bit SK_x32_SHIFT, i.e. line up with the
// corresponding component of an SkPMColor. A positive value means a left
// shift, a negative value a right shift.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// The SkPacked?16x5ToUnmasked?32x5_SSE2(x) helpers apply the shift above to
// every 32-bit lane of x. "Unmasked" means the bits of the other components
// are still present in the result; the caller is expected to AND with
// (0x1F << SK_?32_SHIFT) afterwards.

#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif

136

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

137

static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,

138

__m128i &mask, __m128i &srcA) {

139

// In the following comments, the components of src, dst and mask are

140

// abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked

141

// by an R, G, B, or A suffix. Components of one of the four pixels that

142

// are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for

143

// example is the blue channel of the second destination pixel. Memory

144

// layout is shown for an ARGB byte order in a color value.

145

146

// src and srcA store 8-bit values interleaved with zeros.

147

// src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

148

// srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,

149

// srcA, 0, srcA, 0, srcA, 0, srcA, 0)

150

// mask stores 16-bit values (compressed three channels) interleaved with zeros.

151

// Lo and Hi denote the low and high bytes of a 16-bit value, respectively.

152

// mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

153

// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

154

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

155

// Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

156

// r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

157

__m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),

158

_mm_set1_epi32(0x1F << SK_R32_SHIFT));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

159

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

160

// g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

161

__m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),

162

_mm_set1_epi32(0x1F << SK_G32_SHIFT));

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

163

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

164

// b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

165

__m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),

166

_mm_set1_epi32(0x1F << SK_B32_SHIFT));

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

167

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

168

// Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

169

// Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an

170

// 8-bit position

171

// mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,

172

// 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

173

mask = _mm_or_si128(_mm_or_si128(r, g), b);

174

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

175

// Interleave R,G,B into the lower byte of word.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

176

// i.e. split the sixteen 8-bit values from mask into two sets of eight

177

// 16-bit values, padded by zero.

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

178

__m128i maskLo, maskHi;

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

179

// maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

180

maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

181

// maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

182

maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

183

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

184

// Upscale from 0..31 to 0..32

185

// (allows to replace division by left-shift further down)

186

// Left-shift each component by 4 and add the result back to that component,

187

// mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

188

maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));

189

maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

190

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

191

// Multiply each component of maskLo and maskHi by srcA

192

maskLo = _mm_mullo_epi16(maskLo, srcA);

193

maskHi = _mm_mullo_epi16(maskHi, srcA);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

194

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

195

// Left shift mask components by 8 (divide by 256)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

196

maskLo = _mm_srli_epi16(maskLo, 8);

197

maskHi = _mm_srli_epi16(maskHi, 8);

198

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

199

// Interleave R,G,B into the lower byte of the word

200

// dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

201

__m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

202

// dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

203

__m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

204

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

205

// mask = (src - dst) * mask

206

maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));

207

maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

208

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

209

// mask = (src - dst) * mask >> 5

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

210

maskLo = _mm_srai_epi16(maskLo, 5);

211

maskHi = _mm_srai_epi16(maskHi, 5);

212

213

// Add two pixels into result.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

214

// result = dst + ((src - dst) * mask >> 5)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

215

__m128i resultLo = _mm_add_epi16(dstLo, maskLo);

216

__m128i resultHi = _mm_add_epi16(dstHi, maskHi);

217

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

218

// Pack into 4 32bit dst pixels.

219

// resultLo and resultHi contain eight 16-bit components (two pixels) each.

220

// Merge into one SSE regsiter with sixteen 8-bit values (four pixels),

221

// clamping to 255 if necessary.

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

222

return _mm_packus_epi16(resultLo, resultHi);

223

}

224

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

225

static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

226

__m128i &mask) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

227

// In the following comments, the components of src, dst and mask are

228

// abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked

229

// by an R, G, B, or A suffix. Components of one of the four pixels that

230

// are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for

231

// example is the blue channel of the second destination pixel. Memory

232

// layout is shown for an ARGB byte order in a color value.

233

234

// src and srcA store 8-bit values interleaved with zeros.

235

// src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

236

// mask stores 16-bit values (shown as high and low bytes) interleaved with

237

// zeros

238

// mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

239

// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

240

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

241

// Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

242

// r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

243

__m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),

244

_mm_set1_epi32(0x1F << SK_R32_SHIFT));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

245

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

246

// g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

247

__m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),

248

_mm_set1_epi32(0x1F << SK_G32_SHIFT));

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

249

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

250

// b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

251

__m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),

252

_mm_set1_epi32(0x1F << SK_B32_SHIFT));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

253

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

254

// Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

255

// Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an

256

// 8-bit position

257

// mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,

258

// 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

259

mask = _mm_or_si128(_mm_or_si128(r, g), b);

260

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

261

// Interleave R,G,B into the lower byte of word.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

262

// i.e. split the sixteen 8-bit values from mask into two sets of eight

263

// 16-bit values, padded by zero.

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

264

__m128i maskLo, maskHi;

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

265

// maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

266

maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

267

// maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

268

maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

269

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

270

// Upscale from 0..31 to 0..32

271

// (allows to replace division by left-shift further down)

272

// Left-shift each component by 4 and add the result back to that component,

273

// mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

274

maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));

275

maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

276

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

277

// Interleave R,G,B into the lower byte of the word

278

// dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

279

__m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

280

// dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

281

__m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

282

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

283

// mask = (src - dst) * mask

284

maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));

285

maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

286

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

287

// mask = (src - dst) * mask >> 5

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

288

maskLo = _mm_srai_epi16(maskLo, 5);

289

maskHi = _mm_srai_epi16(maskHi, 5);

290

291

// Add two pixels into result.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

292

// result = dst + ((src - dst) * mask >> 5)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

293

__m128i resultLo = _mm_add_epi16(dstLo, maskLo);

294

__m128i resultHi = _mm_add_epi16(dstHi, maskHi);

295

bungeman@google.com

27123cd

2012-08-21 19:25:42 +0000

[diff] [blame]

296

// Pack into 4 32bit dst pixels and force opaque.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

297

// resultLo and resultHi contain eight 16-bit components (two pixels) each.

298

// Merge into one SSE regsiter with sixteen 8-bit values (four pixels),

299

// clamping to 255 if necessary. Set alpha components to 0xFF.

bungeman@google.com

27123cd

2012-08-21 19:25:42 +0000

[diff] [blame]

300

return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),

301

_mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

302

}

303

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

304

void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],

305

SkColor src, int width, SkPMColor) {

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

if (width <= 0) {

return;

}

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

310

int srcA = SkColorGetA(src);

311

int srcR = SkColorGetR(src);

312

int srcG = SkColorGetG(src);

313

int srcB = SkColorGetB(src);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

314

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

315

srcA = SkAlpha255To256(srcA);

316

317

if (width >= 4) {

318

SkASSERT(((size_t)dst & 0x03) == 0);

319

while (((size_t)dst & 0x0F) != 0) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

320

*dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);

321

mask++;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

dst++;

width--;

}

__m128i *d = reinterpret_cast<__m128i*>(dst);

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

327

// Set alpha to 0xFF and replicate source four times in SSE register.

328

__m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));

329

// Interleave with zeros to get two sets of four 16-bit values.

330

src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());

331

// Set srcA_sse to contain eight copies of srcA, padded with zero.

332

// src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

333

__m128i srcA_sse = _mm_set1_epi16(srcA);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

334

while (width >= 4) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

335

// Load four destination pixels into dst_sse.

336

__m128i dst_sse = _mm_load_si128(d);

337

// Load four 16-bit masks into lower half of mask_sse.

338

__m128i mask_sse = _mm_loadl_epi64(

339

reinterpret_cast<const __m128i*>(mask));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

340

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

341

// Check whether masks are equal to 0 and get the highest bit

342

// of each byte of result, if masks are all zero, we will get

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

343

// pack_cmp to 0xFFFF

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

344

int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

345

_mm_setzero_si128()));

346

347

// if mask pixels are not all zero, we will blend the dst pixels

348

if (pack_cmp != 0xFFFF) {

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

349

// Unpack 4 16bit mask pixels to

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

350

// mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

351

// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

352

mask_sse = _mm_unpacklo_epi16(mask_sse,

353

_mm_setzero_si128());

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

354

355

// Process 4 32bit dst pixels

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

356

__m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,

357

mask_sse, srcA_sse);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

358

_mm_store_si128(d, result);

359

}

360

361

d++;

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

362

mask += 4;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

width -= 4;

}

dst = reinterpret_cast<SkPMColor*>(d);

367

}

368

369

while (width > 0) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

370

*dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);

371

mask++;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

372

dst++;

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

373

width--;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

374

}

375

}

376

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

377

void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],

378

SkColor src, int width, SkPMColor opaqueDst) {

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

if (width <= 0) {

return;

}

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

383

int srcR = SkColorGetR(src);

384

int srcG = SkColorGetG(src);

385

int srcB = SkColorGetB(src);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

386

387

if (width >= 4) {

388

SkASSERT(((size_t)dst & 0x03) == 0);

389

while (((size_t)dst & 0x0F) != 0) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

390

*dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);

391

mask++;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

dst++;

width--;

}

__m128i *d = reinterpret_cast<__m128i*>(dst);

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

397

// Set alpha to 0xFF and replicate source four times in SSE register.

398

__m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));

399

// Set srcA_sse to contain eight copies of srcA, padded with zero.

400

// src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

401

src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

402

while (width >= 4) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

403

// Load four destination pixels into dst_sse.

404

__m128i dst_sse = _mm_load_si128(d);

405

// Load four 16-bit masks into lower half of mask_sse.

406

__m128i mask_sse = _mm_loadl_epi64(

407

reinterpret_cast<const __m128i*>(mask));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

408

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

409

// Check whether masks are equal to 0 and get the highest bit

410

// of each byte of result, if masks are all zero, we will get

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

411

// pack_cmp to 0xFFFF

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

412

int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

413

_mm_setzero_si128()));

414

415

// if mask pixels are not all zero, we will blend the dst pixels

416

if (pack_cmp != 0xFFFF) {

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

417

// Unpack 4 16bit mask pixels to

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

418

// mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

419

// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

420

mask_sse = _mm_unpacklo_epi16(mask_sse,

421

_mm_setzero_si128());

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

422

423

// Process 4 32bit dst pixels

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

424

__m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,

425

mask_sse);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

426

_mm_store_si128(d, result);

427

}

428

429

d++;

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

430

mask += 4;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

width -= 4;

}

dst = reinterpret_cast<SkPMColor*>(d);

435

}

436

437

while (width > 0) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

438

*dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);

439

mask++;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

440

dst++;

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

441

width--;

tomhudson@google.com