Blame - src/opts/SkBlitRow_opts_SSE2.cpp - platform/external/skia

2009-11-04 20:51:06 +0000

[diff] [blame]

1

/*

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

2

epoger@google.com

ec3ed6a

2011-07-28 14:26:00 +0000

[diff] [blame]

3

*

4

* Use of this source code is governed by a BSD-style license that can be

5

* found in the LICENSE file.

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

6

*/

7

epoger@google.com

ec3ed6a

2011-07-28 14:26:00 +0000

[diff] [blame]

8

senorblanco@chromium.org

2009-11-16 21:09:00 +0000

[diff] [blame]

9

#include "SkBlitRow_opts_SSE2.h"

caryclark@google.com

83ecdc3

2012-06-06 12:10:26 +0000

[diff] [blame]

10

#include "SkBitmapProcState_opts_SSE2.h"

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

11

#include "SkColorPriv.h"

commit-bot@chromium.org

4759107

2014-02-19 03:09:52 +0000

[diff] [blame]

12

#include "SkColor_opts_SSE2.h"

commit-bot@chromium.org

2758047

2014-03-07 03:25:32 +0000

[diff] [blame]

13

#include "SkDither.h"

senorblanco@chromium.org

2010-12-13 15:27:20 +0000

[diff] [blame]

14

#include "SkUtils.h"

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

15

16

#include <emmintrin.h>

17

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

18

/* SSE2 version of S32_Blend_BlitRow32()

19

* portable version is in core/SkBlitRow_D32.cpp

20

*/

senorblanco@chromium.org

2009-11-16 21:09:00 +0000

[diff] [blame]

21

void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,

22

const SkPMColor* SK_RESTRICT src,

23

int count, U8CPU alpha) {

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

24

SkASSERT(alpha <= 255);

if (count <= 0) {

return;

}

uint32_t src_scale = SkAlpha255To256(alpha);

30

uint32_t dst_scale = 256 - src_scale;

31

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

32

if (count >= 4) {

33

SkASSERT(((size_t)dst & 0x03) == 0);

34

while (((size_t)dst & 0x0F) != 0) {

35

*dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);

src++;

dst++;

count--;

}

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

40

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

41

const __m128i *s = reinterpret_cast<const __m128i*>(src);

42

__m128i *d = reinterpret_cast<__m128i*>(dst);

43

__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

44

__m128i ag_mask = _mm_set1_epi32(0xFF00FF00);

45

46

// Move scale factors to upper byte of word

47

__m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);

48

__m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

49

while (count >= 4) {

50

// Load 4 pixels each of src and dest.

51

__m128i src_pixel = _mm_loadu_si128(s);

52

__m128i dst_pixel = _mm_load_si128(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

53

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

54

// Interleave Atom port 0/1 operations based on the execution port

55

// constraints that multiply can only be executed on port 0 (while

56

// boolean operations can be executed on either port 0 or port 1)

57

// because GCC currently doesn't do a good job scheduling

58

// instructions based on these constraints.

59

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

60

// Get red and blue pixels into lower byte of each word.

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

61

// (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

62

__m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

63

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

64

// Multiply by scale.

65

// (4 x (0, rs.h, 0, bs.h))

66

// where rs.h stands for the higher byte of r * scale, and

67

// bs.h the higher byte of b * scale.

68

src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);

69

70

// Get alpha and green pixels into higher byte of each word.

71

// (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)

72

__m128i src_ag = _mm_and_si128(ag_mask, src_pixel);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

73

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

74

// Multiply by scale.

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

75

// (4 x (as.h, as.l, gs.h, gs.l))

76

src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

77

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

78

// Clear the lower byte of the a*scale and g*scale results

79

// (4 x (as.h, 0, gs.h, 0))

80

src_ag = _mm_and_si128(src_ag, ag_mask);

81

82

// Operations the destination pixels are the same as on the

83

// source pixels. See the comments above.

84

__m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

85

dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);

86

__m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);

87

dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);

88

dst_ag = _mm_and_si128(dst_ag, ag_mask);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

89

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

90

// Combine back into RGBA.

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

91

// (4 x (as.h, rs.h, gs.h, bs.h))

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

92

src_pixel = _mm_or_si128(src_rb, src_ag);

93

dst_pixel = _mm_or_si128(dst_rb, dst_ag);

94

95

// Add result

96

__m128i result = _mm_add_epi8(src_pixel, dst_pixel);

97

_mm_store_si128(d, result);

s++;

d++;

count -= 4;

}

src = reinterpret_cast<const SkPMColor*>(s);

103

dst = reinterpret_cast<SkPMColor*>(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

104

}

105

senorblanco@chromium.org

2009-11-16 21:09:00 +0000

[diff] [blame]

106

while (count > 0) {

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

107

*dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);

src++;

dst++;

count--;

}

}

senorblanco@chromium.org

2009-11-16 21:09:00 +0000

[diff] [blame]

114

void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,

115

const SkPMColor* SK_RESTRICT src,

116

int count, U8CPU alpha) {

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

117

SkASSERT(alpha == 255);

118

if (count <= 0) {

119

return;

120

}

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

121

122

if (count >= 4) {

123

SkASSERT(((size_t)dst & 0x03) == 0);

124

while (((size_t)dst & 0x0F) != 0) {

125

*dst = SkPMSrcOver(*src, *dst);

src++;

dst++;

count--;

}

const __m128i *s = reinterpret_cast<const __m128i*>(src);

132

__m128i *d = reinterpret_cast<__m128i*>(dst);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

133

#ifdef SK_USE_ACCURATE_BLENDING

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

134

__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

135

__m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)

136

__m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)

137

while (count >= 4) {

138

// Load 4 pixels

139

__m128i src_pixel = _mm_loadu_si128(s);

140

__m128i dst_pixel = _mm_load_si128(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

141

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

142

__m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

senorblanco@chromium.org

f3f0bd7

2009-12-10 22:46:31 +0000

[diff] [blame]

143

__m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

144

// Shift alphas down to lower 8 bits of each quad.

145

__m128i alpha = _mm_srli_epi32(src_pixel, 24);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

146

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

147

// Copy alpha to upper 3rd byte of each quad

148

alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

149

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

150

// Subtract alphas from 255, to get 0..255

151

alpha = _mm_sub_epi16(c_255, alpha);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

152

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

153

// Multiply by red and blue by src alpha.

154

dst_rb = _mm_mullo_epi16(dst_rb, alpha);

155

// Multiply by alpha and green by src alpha.

156

dst_ag = _mm_mullo_epi16(dst_ag, alpha);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

157

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

158

// dst_rb_low = (dst_rb >> 8)

159

__m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);

160

__m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

161

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

162

// dst_rb = (dst_rb + dst_rb_low + 128) >> 8

163

dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);

164

dst_rb = _mm_add_epi16(dst_rb, c_128);

165

dst_rb = _mm_srli_epi16(dst_rb, 8);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

166

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

167

// dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask

168

dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);

169

dst_ag = _mm_add_epi16(dst_ag, c_128);

170

dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

171

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

172

// Combine back into RGBA.

173

dst_pixel = _mm_or_si128(dst_rb, dst_ag);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

174

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

175

// Add result

176

__m128i result = _mm_add_epi8(src_pixel, dst_pixel);

177

_mm_store_si128(d, result);

s++;

d++;

count -= 4;

}

#else

__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

184

__m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)

185

while (count >= 4) {

186

// Load 4 pixels

187

__m128i src_pixel = _mm_loadu_si128(s);

188

__m128i dst_pixel = _mm_load_si128(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

189

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

190

__m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

senorblanco@chromium.org

f3f0bd7

2009-12-10 22:46:31 +0000

[diff] [blame]

191

__m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

192

senorblanco@chromium.org

f3f0bd7

2009-12-10 22:46:31 +0000

[diff] [blame]

193

// (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word)

194

__m128i alpha = _mm_srli_epi16(src_pixel, 8);

195

196

// (a0, a0, a1, a1, a2, g2, a3, g3)

197

alpha = _mm_shufflehi_epi16(alpha, 0xF5);

198

199

// (a0, a0, a1, a1, a2, a2, a3, a3)

200

alpha = _mm_shufflelo_epi16(alpha, 0xF5);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

201

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

202

// Subtract alphas from 256, to get 1..256

203

alpha = _mm_sub_epi16(c_256, alpha);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

204

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

205

// Multiply by red and blue by src alpha.

206

dst_rb = _mm_mullo_epi16(dst_rb, alpha);

207

// Multiply by alpha and green by src alpha.

208

dst_ag = _mm_mullo_epi16(dst_ag, alpha);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

209

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

210

// Divide by 256.

211

dst_rb = _mm_srli_epi16(dst_rb, 8);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

212

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

213

// Mask out high bits (already in the right place)

214

dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

215

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

216

// Combine back into RGBA.

217

dst_pixel = _mm_or_si128(dst_rb, dst_ag);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

218

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

219

// Add result

220

__m128i result = _mm_add_epi8(src_pixel, dst_pixel);

221

_mm_store_si128(d, result);

s++;

d++;

count -= 4;

}

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

226

#endif

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

227

src = reinterpret_cast<const SkPMColor*>(s);

228

dst = reinterpret_cast<SkPMColor*>(d);

229

}

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

230

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

231

while (count > 0) {

232

*dst = SkPMSrcOver(*src, *dst);

src++;

dst++;

count--;

}

}

senorblanco@chromium.org

2009-11-16 21:09:00 +0000

[diff] [blame]

239

void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,

240

const SkPMColor* SK_RESTRICT src,

241

int count, U8CPU alpha) {

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

242

SkASSERT(alpha <= 255);

if (count <= 0) {

return;

}

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

247

if (count >= 4) {

248

while (((size_t)dst & 0x0F) != 0) {

249

*dst = SkBlendARGB32(*src, *dst, alpha);

src++;

dst++;

count--;

}

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

254

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

255

uint32_t src_scale = SkAlpha255To256(alpha);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

256

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

257

const __m128i *s = reinterpret_cast<const __m128i*>(src);

258

__m128i *d = reinterpret_cast<__m128i*>(dst);

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

259

__m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

260

__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

261

__m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit)

262

while (count >= 4) {

263

// Load 4 pixels each of src and dest.

264

__m128i src_pixel = _mm_loadu_si128(s);

265

__m128i dst_pixel = _mm_load_si128(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

266

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

267

// Get red and blue pixels into lower byte of each word.

268

__m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

269

__m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

270

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

271

// Get alpha and green into lower byte of each word.

272

__m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);

273

__m128i src_ag = _mm_srli_epi16(src_pixel, 8);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

274

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

275

// Put per-pixel alpha in low byte of each word.

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

276

// After the following two statements, the dst_alpha looks like

277

// (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

278

__m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);

279

dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

280

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

281

// dst_alpha = dst_alpha * src_scale

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

282

// Because src_scales are in the higher byte of each word and

283

// we use mulhi here, the resulting alpha values are already

284

// in the right place and don't need to be divided by 256.

285

// (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)

286

dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

287

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

288

// Subtract alphas from 256, to get 1..256

289

dst_alpha = _mm_sub_epi16(c_256, dst_alpha);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

290

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

291

// Multiply red and blue by dst pixel alpha.

292

dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);

293

// Multiply alpha and green by dst pixel alpha.

294

dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

295

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

296

// Multiply red and blue by global alpha.

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

297

// (4 x (0, rs.h, 0, bs.h))

298

// where rs.h stands for the higher byte of r * src_scale,

299

// and bs.h the higher byte of b * src_scale.

300

// Again, because we use mulhi, the resuling red and blue

301

// values are already in the right place and don't need to

302

// be divided by 256.

303

src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

304

// Multiply alpha and green by global alpha.

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

305

// (4 x (0, as.h, 0, gs.h))

306

src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

307

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

308

// Divide by 256.

309

dst_rb = _mm_srli_epi16(dst_rb, 8);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

310

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

311

// Mask out low bits (goodies already in the right place; no need to divide)

312

dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

313

// Shift alpha and green to higher byte of each word.

314

// (4 x (as.h, 0, gs.h, 0))

315

src_ag = _mm_slli_epi16(src_ag, 8);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

316

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

317

// Combine back into RGBA.

318

dst_pixel = _mm_or_si128(dst_rb, dst_ag);

319

src_pixel = _mm_or_si128(src_rb, src_ag);

320

321

// Add two pixels into result.

322

__m128i result = _mm_add_epi8(src_pixel, dst_pixel);

323

_mm_store_si128(d, result);

s++;

d++;

count -= 4;

}

src = reinterpret_cast<const SkPMColor*>(s);

329

dst = reinterpret_cast<SkPMColor*>(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

330

}

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

331

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

332

while (count > 0) {

333

*dst = SkBlendARGB32(*src, *dst, alpha);

src++;

dst++;

count--;

}

}

senorblanco@chromium.org

2010-12-13 15:27:20 +0000

[diff] [blame]

339

340

/* SSE2 version of Color32()

341

* portable version is in core/SkBlitRow_D32.cpp

342

*/

343

void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,

SkPMColor color) {

if (count <= 0) {

return;

}

if (0 == color) {

if (src != dst) {

memcpy(dst, src, count * sizeof(SkPMColor));

353

}

reed@google.com

c909a1e

2011-10-25 19:07:23 +0000

[diff] [blame]

354

return;

senorblanco@chromium.org

2010-12-13 15:27:20 +0000

[diff] [blame]

355

}

356

357

unsigned colorA = SkGetPackedA32(color);

358

if (255 == colorA) {

359

sk_memset32(dst, color, count);

360

} else {

361

unsigned scale = 256 - SkAlpha255To256(colorA);

362

363

if (count >= 4) {

364

SkASSERT(((size_t)dst & 0x03) == 0);

365

while (((size_t)dst & 0x0F) != 0) {

366

*dst = color + SkAlphaMulQ(*src, scale);

src++;

dst++;

count--;

}

const __m128i *s = reinterpret_cast<const __m128i*>(src);

373

__m128i *d = reinterpret_cast<__m128i*>(dst);

374

__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

375

__m128i src_scale_wide = _mm_set1_epi16(scale);

376

__m128i color_wide = _mm_set1_epi32(color);

377

while (count >= 4) {

378

// Load 4 pixels each of src and dest.

379

__m128i src_pixel = _mm_loadu_si128(s);

380

381

// Get red and blue pixels into lower byte of each word.

382

__m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

reed@google.com

2011-03-09 12:55:47 +0000

[diff] [blame]

383

senorblanco@chromium.org

2010-12-13 15:27:20 +0000

[diff] [blame]

384

// Get alpha and green into lower byte of each word.

385

__m128i src_ag = _mm_srli_epi16(src_pixel, 8);

386

387

// Multiply by scale.

388

src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);

389

src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);

390

391

// Divide by 256.

392

src_rb = _mm_srli_epi16(src_rb, 8);

393

src_ag = _mm_andnot_si128(rb_mask, src_ag);

394

395

// Combine back into RGBA.

396

src_pixel = _mm_or_si128(src_rb, src_ag);

397

398

// Add color to result.

399

__m128i result = _mm_add_epi8(color_wide, src_pixel);

400

401

// Store result.

402

_mm_store_si128(d, result);

s++;

d++;

count -= 4;

}

src = reinterpret_cast<const SkPMColor*>(s);

408

dst = reinterpret_cast<SkPMColor*>(d);

}

while (count > 0) {

*dst = color + SkAlphaMulQ(*src, scale);

413

src += 1;

414

dst += 1;

415

count--;

reed@google.com

2011-03-09 12:55:47 +0000

[diff] [blame]

416

}

senorblanco@chromium.org

2010-12-13 15:27:20 +0000

[diff] [blame]

417

}

418

}

reed@google.com

2011-03-09 12:55:47 +0000

[diff] [blame]

419

reed@google.com

edb606c

2011-10-18 13:56:50 +0000

[diff] [blame]

420

void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,

421

size_t maskRB, SkColor origColor,

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

422

int width, int height) {

reed@google.com

ee467ee

2011-03-09 13:23:57 +0000

[diff] [blame]

423

SkPMColor color = SkPreMultiplyColor(origColor);

reed@google.com

2011-03-09 12:55:47 +0000

[diff] [blame]

424

size_t dstOffset = dstRB - (width << 2);

425

size_t maskOffset = maskRB - width;

426

SkPMColor* dst = (SkPMColor *)device;

reed@google.com

edb606c

2011-10-18 13:56:50 +0000

[diff] [blame]

427

const uint8_t* mask = (const uint8_t*)maskPtr;

reed@google.com

2011-03-09 12:55:47 +0000

[diff] [blame]

do {

int count = width;

if (count >= 4) {

while (((size_t)dst & 0x0F) != 0 && (count > 0)) {

432

*dst = SkBlendARGB32(color, *dst, *mask);

mask++;

dst++;

count--;

}

__m128i *d = reinterpret_cast<__m128i*>(dst);

438

__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

439

__m128i c_256 = _mm_set1_epi16(256);

440

__m128i c_1 = _mm_set1_epi16(1);

441

__m128i src_pixel = _mm_set1_epi32(color);

442

while (count >= 4) {

443

// Load 4 pixels each of src and dest.

444

__m128i dst_pixel = _mm_load_si128(d);

445

446

//set the aphla value

447

__m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\

448

0, *(mask+3),0, \

449

*(mask+2),0, *(mask+2),\

450

0,*(mask+1), 0,*(mask+1),\

451

0, *mask,0,*mask);

452

453

//call SkAlpha255To256()

454

src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);

455

456

// Get red and blue pixels into lower byte of each word.

457

__m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

458

__m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

459

460

// Get alpha and green into lower byte of each word.

461

__m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);

462

__m128i src_ag = _mm_srli_epi16(src_pixel, 8);

463

464

// Put per-pixel alpha in low byte of each word.

465

__m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);

466

dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);

467

468

// dst_alpha = dst_alpha * src_scale

469

dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);

470

471

// Divide by 256.

472

dst_alpha = _mm_srli_epi16(dst_alpha, 8);

473

474

// Subtract alphas from 256, to get 1..256

475

dst_alpha = _mm_sub_epi16(c_256, dst_alpha);

476

// Multiply red and blue by dst pixel alpha.

477

dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);

478

// Multiply alpha and green by dst pixel alpha.

479

dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);

480

481

// Multiply red and blue by global alpha.

482

src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);

483

// Multiply alpha and green by global alpha.

484

src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);

485

// Divide by 256.

486

dst_rb = _mm_srli_epi16(dst_rb, 8);

487

src_rb = _mm_srli_epi16(src_rb, 8);

488

489

// Mask out low bits (goodies already in the right place; no need to divide)

490

dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

491

src_ag = _mm_andnot_si128(rb_mask, src_ag);

492

493

// Combine back into RGBA.

494

dst_pixel = _mm_or_si128(dst_rb, dst_ag);

495

__m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);

496

497

// Add two pixels into result.

498

__m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);

499

_mm_store_si128(d, result);

500

// load the next 4 pixel

mask = mask + 4;

d++;

count -= 4;

}

dst = reinterpret_cast<SkPMColor *>(d);

506

}

507

while(count > 0) {

508

*dst= SkBlendARGB32(color, *dst, *mask);

dst += 1;

mask++;

count --;

}

dst = (SkPMColor *)((char*)dst + dstOffset);

514

mask += maskOffset;

515

} while (--height != 0);

516

}

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

517

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

518

// The following (left) shifts cause the top 5 bits of the mask components to
// line up with the corresponding components in an SkPMColor.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
// A positive amount shifts left, a negative amount shifts right by its
// magnitude, and zero leaves the value untouched; each helper macro below
// picks the matching form at preprocessing time.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Red: move the packed 16-bit mask's red bits to the 32-bit red position.
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

// Green: same scheme as red.
#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

// Blue: same scheme as red.
#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif

548

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

549

static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,

550

__m128i &mask, __m128i &srcA) {

551

// In the following comments, the components of src, dst and mask are

552

// abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked

553

// by an R, G, B, or A suffix. Components of one of the four pixels that

554

// are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for

555

// example is the blue channel of the second destination pixel. Memory

556

// layout is shown for an ARGB byte order in a color value.

557

558

// src and srcA store 8-bit values interleaved with zeros.

559

// src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

560

// srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,

561

// srcA, 0, srcA, 0, srcA, 0, srcA, 0)

562

// mask stores 16-bit values (compressed three channels) interleaved with zeros.

563

// Lo and Hi denote the low and high bytes of a 16-bit value, respectively.

564

// mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

565

// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

566

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

567

// Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

568

// r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

569

__m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),

570

_mm_set1_epi32(0x1F << SK_R32_SHIFT));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

571

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

572

// g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

573

__m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),

574

_mm_set1_epi32(0x1F << SK_G32_SHIFT));

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

575

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

576

// b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

577

__m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),

578

_mm_set1_epi32(0x1F << SK_B32_SHIFT));

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

579

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

580

// Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

581

// Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an

582

// 8-bit position

583

// mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,

584

// 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

585

mask = _mm_or_si128(_mm_or_si128(r, g), b);

586

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

587

// Interleave R,G,B into the lower byte of word.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

588

// i.e. split the sixteen 8-bit values from mask into two sets of eight

589

// 16-bit values, padded by zero.

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

590

__m128i maskLo, maskHi;

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

591

// maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

592

maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

593

// maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

594

maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

595

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

596

// Upscale from 0..31 to 0..32

597

// (allows division to be replaced by a right shift further down)

598

// Right-shift each component by 4 and add the result back to that component,

599

// mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

600

maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));

601

maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

602

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

603

// Multiply each component of maskLo and maskHi by srcA

604

maskLo = _mm_mullo_epi16(maskLo, srcA);

605

maskHi = _mm_mullo_epi16(maskHi, srcA);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

606

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

607

// Right shift mask components by 8 (divide by 256)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

608

maskLo = _mm_srli_epi16(maskLo, 8);

609

maskHi = _mm_srli_epi16(maskHi, 8);

610

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

611

// Interleave R,G,B into the lower byte of the word

612

// dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

613

__m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

614

// dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

615

__m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

616

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

617

// mask = (src - dst) * mask

618

maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));

619

maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

620

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

621

// mask = (src - dst) * mask >> 5

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

622

maskLo = _mm_srai_epi16(maskLo, 5);

623

maskHi = _mm_srai_epi16(maskHi, 5);

624

625

// Add two pixels into result.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

626

// result = dst + ((src - dst) * mask >> 5)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

627

__m128i resultLo = _mm_add_epi16(dstLo, maskLo);

628

__m128i resultHi = _mm_add_epi16(dstHi, maskHi);

629

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

630

// Pack into 4 32bit dst pixels.

631

// resultLo and resultHi contain eight 16-bit components (two pixels) each.

632

// Merge into one SSE register with sixteen 8-bit values (four pixels),

633

// clamping to 255 if necessary.

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

634

return _mm_packus_epi16(resultLo, resultHi);

635

}

636

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

637

static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

638

__m128i &mask) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

639

// In the following comments, the components of src, dst and mask are

640

// abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked

641

// by an R, G, B, or A suffix. Components of one of the four pixels that

642

// are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for

643

// example is the blue channel of the second destination pixel. Memory

644

// layout is shown for an ARGB byte order in a color value.

645

646

// src stores 8-bit values interleaved with zeros.

647

// src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

648

// mask stores 16-bit values (shown as high and low bytes) interleaved with

649

// zeros

650

// mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

651

// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

652

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

653

// Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

654

// r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

655

__m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),

656

_mm_set1_epi32(0x1F << SK_R32_SHIFT));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

657

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

658

// g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

659

__m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),

660

_mm_set1_epi32(0x1F << SK_G32_SHIFT));

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

661

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

662

// b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

663

__m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),

664

_mm_set1_epi32(0x1F << SK_B32_SHIFT));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

665

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

666

// Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

667

// Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an

668

// 8-bit position

669

// mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,

670

// 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

671

mask = _mm_or_si128(_mm_or_si128(r, g), b);

672

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

673

// Interleave R,G,B into the lower byte of word.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

674

// i.e. split the sixteen 8-bit values from mask into two sets of eight

675

// 16-bit values, padded by zero.

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

676

__m128i maskLo, maskHi;

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

677

// maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

678

maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

679

// maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

680

maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

681

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

682

// Upscale from 0..31 to 0..32

683

// (allows division to be replaced by a right shift further down)

684

// Right-shift each component by 4 and add the result back to that component,

685

// mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

686

maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));

687

maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

688

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

689

// Interleave R,G,B into the lower byte of the word

690

// dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

691

__m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

692

// dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

693

__m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

694

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

695

// mask = (src - dst) * mask

696

maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));

697

maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

698

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

699

// mask = (src - dst) * mask >> 5

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

700

maskLo = _mm_srai_epi16(maskLo, 5);

701

maskHi = _mm_srai_epi16(maskHi, 5);

702

703

// Add two pixels into result.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

704

// result = dst + ((src - dst) * mask >> 5)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

705

__m128i resultLo = _mm_add_epi16(dstLo, maskLo);

706

__m128i resultHi = _mm_add_epi16(dstHi, maskHi);

707

bungeman@google.com

27123cd

2012-08-21 19:25:42 +0000

[diff] [blame]

708

// Pack into 4 32bit dst pixels and force opaque.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

709

// resultLo and resultHi contain eight 16-bit components (two pixels) each.

710

// Merge into one SSE register with sixteen 8-bit values (four pixels),

711

// clamping to 255 if necessary. Set alpha components to 0xFF.

bungeman@google.com

27123cd

2012-08-21 19:25:42 +0000

[diff] [blame]

712

return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),

713

_mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

714

}

715

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

716

void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],

717

SkColor src, int width, SkPMColor) {

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

if (width <= 0) {

return;

}

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

722

int srcA = SkColorGetA(src);

723

int srcR = SkColorGetR(src);

724

int srcG = SkColorGetG(src);

725

int srcB = SkColorGetB(src);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

726

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

727

srcA = SkAlpha255To256(srcA);

728

729

if (width >= 4) {

730

SkASSERT(((size_t)dst & 0x03) == 0);

731

while (((size_t)dst & 0x0F) != 0) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

732

*dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);

733

mask++;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

dst++;

width--;

}

__m128i *d = reinterpret_cast<__m128i*>(dst);

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

739

// Set alpha to 0xFF and replicate source four times in SSE register.

740

__m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));

741

// Interleave with zeros to get two sets of four 16-bit values.

742

src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());

743

// src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

744

// Set srcA_sse to contain eight copies of srcA, padded with zero.

745

__m128i srcA_sse = _mm_set1_epi16(srcA);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

746

while (width >= 4) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

747

// Load four destination pixels into dst_sse.

748

__m128i dst_sse = _mm_load_si128(d);

749

// Load four 16-bit masks into lower half of mask_sse.

750

__m128i mask_sse = _mm_loadl_epi64(

751

reinterpret_cast<const __m128i*>(mask));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

752

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

753

// Check whether masks are equal to 0 and get the highest bit

754

// of each byte of result, if masks are all zero, we will get

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

755

// pack_cmp equal to 0xFFFF

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

756

int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

757

_mm_setzero_si128()));

758

759

// if mask pixels are not all zero, we will blend the dst pixels

760

if (pack_cmp != 0xFFFF) {

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

761

// Unpack 4 16bit mask pixels to

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

762

// mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

763

// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

764

mask_sse = _mm_unpacklo_epi16(mask_sse,

765

_mm_setzero_si128());

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

766

767

// Process 4 32bit dst pixels

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

768

__m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,

769

mask_sse, srcA_sse);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

770

_mm_store_si128(d, result);

771

}

772

773

d++;

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

774

mask += 4;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

width -= 4;

}

dst = reinterpret_cast<SkPMColor*>(d);

779

}

780

781

while (width > 0) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

782

*dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);

783

mask++;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

784

dst++;

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

785

width--;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

786

}

787

}

788

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

789

void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],

790

SkColor src, int width, SkPMColor opaqueDst) {

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

if (width <= 0) {

return;

}

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

795

int srcR = SkColorGetR(src);

796

int srcG = SkColorGetG(src);

797

int srcB = SkColorGetB(src);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

798

799

if (width >= 4) {

800

SkASSERT(((size_t)dst & 0x03) == 0);

801

while (((size_t)dst & 0x0F) != 0) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

802

*dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);

803

mask++;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

dst++;

width--;

}

__m128i *d = reinterpret_cast<__m128i*>(dst);

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

809

// Set alpha to 0xFF and replicate source four times in SSE register.

810

__m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));

811

// Interleave with zeros to get two sets of four 16-bit values.

812

// src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

813

src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

814

while (width >= 4) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

815

// Load four destination pixels into dst_sse.

816

__m128i dst_sse = _mm_load_si128(d);

817

// Load four 16-bit masks into lower half of mask_sse.

818

__m128i mask_sse = _mm_loadl_epi64(

819

reinterpret_cast<const __m128i*>(mask));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

820

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

821

// Check whether masks are equal to 0 and get the highest bit

822

// of each byte of result, if masks are all zero, we will get

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

823

// pack_cmp equal to 0xFFFF

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

824

int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

825

_mm_setzero_si128()));

826

827

// if mask pixels are not all zero, we will blend the dst pixels

828

if (pack_cmp != 0xFFFF) {

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

829

// Unpack 4 16bit mask pixels to

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

830

// mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

831

// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

832

mask_sse = _mm_unpacklo_epi16(mask_sse,

833

_mm_setzero_si128());

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

834

835

// Process 4 32bit dst pixels

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

836

__m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,

837

mask_sse);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

838

_mm_store_si128(d, result);

839

}

840

841

d++;

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

842

mask += 4;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

width -= 4;

}

dst = reinterpret_cast<SkPMColor*>(d);

847

}

848

849

while (width > 0) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

850

*dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);

851

mask++;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

852

dst++;

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

853

width--;

tomhudson@google.com