Blame - src/opts/SkBlitRow_opts_SSE2.cpp - platform/external/skia

2009-11-04 20:51:06 +0000

[diff] [blame]

1

/*

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

2

epoger@google.com

ec3ed6a

2011-07-28 14:26:00 +0000

[diff] [blame]

3

*

4

* Use of this source code is governed by a BSD-style license that can be

5

* found in the LICENSE file.

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

6

*/

7

epoger@google.com

ec3ed6a

2011-07-28 14:26:00 +0000

[diff] [blame]

8

senorblanco@chromium.org

2009-11-16 21:09:00 +0000

[diff] [blame]

9

#include "SkBlitRow_opts_SSE2.h"

caryclark@google.com

83ecdc3

2012-06-06 12:10:26 +0000

[diff] [blame]

10

#include "SkBitmapProcState_opts_SSE2.h"

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

11

#include "SkColorPriv.h"

senorblanco@chromium.org

2010-12-13 15:27:20 +0000

[diff] [blame]

12

#include "SkUtils.h"

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

13

14

#include <emmintrin.h>

15

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

16

/* SSE2 version of S32_Blend_BlitRow32()

17

* portable version is in core/SkBlitRow_D32.cpp

18

*/

senorblanco@chromium.org

2009-11-16 21:09:00 +0000

[diff] [blame]

19

void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,

20

const SkPMColor* SK_RESTRICT src,

21

int count, U8CPU alpha) {

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

22

SkASSERT(alpha <= 255);

if (count <= 0) {

return;

}

uint32_t src_scale = SkAlpha255To256(alpha);

28

uint32_t dst_scale = 256 - src_scale;

29

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

30

if (count >= 4) {

31

SkASSERT(((size_t)dst & 0x03) == 0);

32

while (((size_t)dst & 0x0F) != 0) {

33

*dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);

src++;

dst++;

count--;

}

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

38

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

39

const __m128i *s = reinterpret_cast<const __m128i*>(src);

40

__m128i *d = reinterpret_cast<__m128i*>(dst);

41

__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

42

__m128i ag_mask = _mm_set1_epi32(0xFF00FF00);

43

44

// Move scale factors to upper byte of word

45

__m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);

46

__m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

47

while (count >= 4) {

48

// Load 4 pixels each of src and dest.

49

__m128i src_pixel = _mm_loadu_si128(s);

50

__m128i dst_pixel = _mm_load_si128(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

51

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

52

// Interleave Atom port 0/1 operations based on the execution port

53

// constraints that multiply can only be executed on port 0 (while

54

// boolean operations can be executed on either port 0 or port 1)

55

// because GCC currently doesn't do a good job scheduling

56

// instructions based on these constraints.

57

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

58

// Get red and blue pixels into lower byte of each word.

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

59

// (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

60

__m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

61

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

62

// Multiply by scale.

63

// (4 x (0, rs.h, 0, bs.h))

64

// where rs.h stands for the higher byte of r * scale, and

65

// bs.h the higher byte of b * scale.

66

src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);

67

68

// Get alpha and green pixels into higher byte of each word.

69

// (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)

70

__m128i src_ag = _mm_and_si128(ag_mask, src_pixel);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

71

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

72

// Multiply by scale.

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

73

// (4 x (as.h, as.l, gs.h, gs.l))

74

src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

75

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

76

// Clear the lower byte of the a*scale and g*scale results

77

// (4 x (as.h, 0, gs.h, 0))

78

src_ag = _mm_and_si128(src_ag, ag_mask);

79

80

// Operations the destination pixels are the same as on the

81

// source pixels. See the comments above.

82

__m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

83

dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);

84

__m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);

85

dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);

86

dst_ag = _mm_and_si128(dst_ag, ag_mask);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

87

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

88

// Combine back into RGBA.

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

89

// (4 x (as.h, rs.h, gs.h, bs.h))

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

90

src_pixel = _mm_or_si128(src_rb, src_ag);

91

dst_pixel = _mm_or_si128(dst_rb, dst_ag);

92

93

// Add result

94

__m128i result = _mm_add_epi8(src_pixel, dst_pixel);

95

_mm_store_si128(d, result);

s++;

d++;

count -= 4;

}

src = reinterpret_cast<const SkPMColor*>(s);

101

dst = reinterpret_cast<SkPMColor*>(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

102

}

103

senorblanco@chromium.org

2009-11-16 21:09:00 +0000

[diff] [blame]

104

while (count > 0) {

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

105

*dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);

src++;

dst++;

count--;

}

}

senorblanco@chromium.org

2009-11-16 21:09:00 +0000

[diff] [blame]

112

void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,

113

const SkPMColor* SK_RESTRICT src,

114

int count, U8CPU alpha) {

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

115

SkASSERT(alpha == 255);

116

if (count <= 0) {

117

return;

118

}

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

119

120

if (count >= 4) {

121

SkASSERT(((size_t)dst & 0x03) == 0);

122

while (((size_t)dst & 0x0F) != 0) {

123

*dst = SkPMSrcOver(*src, *dst);

src++;

dst++;

count--;

}

const __m128i *s = reinterpret_cast<const __m128i*>(src);

130

__m128i *d = reinterpret_cast<__m128i*>(dst);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

131

#ifdef SK_USE_ACCURATE_BLENDING

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

132

__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

133

__m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)

134

__m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)

135

while (count >= 4) {

136

// Load 4 pixels

137

__m128i src_pixel = _mm_loadu_si128(s);

138

__m128i dst_pixel = _mm_load_si128(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

139

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

140

__m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

senorblanco@chromium.org

f3f0bd7

2009-12-10 22:46:31 +0000

[diff] [blame]

141

__m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

142

// Shift alphas down to lower 8 bits of each quad.

143

__m128i alpha = _mm_srli_epi32(src_pixel, 24);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

144

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

145

// Copy alpha to upper 3rd byte of each quad

146

alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

147

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

148

// Subtract alphas from 255, to get 0..255

149

alpha = _mm_sub_epi16(c_255, alpha);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

150

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

151

// Multiply by red and blue by src alpha.

152

dst_rb = _mm_mullo_epi16(dst_rb, alpha);

153

// Multiply by alpha and green by src alpha.

154

dst_ag = _mm_mullo_epi16(dst_ag, alpha);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

155

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

156

// dst_rb_low = (dst_rb >> 8)

157

__m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);

158

__m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

159

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

160

// dst_rb = (dst_rb + dst_rb_low + 128) >> 8

161

dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);

162

dst_rb = _mm_add_epi16(dst_rb, c_128);

163

dst_rb = _mm_srli_epi16(dst_rb, 8);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

164

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

165

// dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask

166

dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);

167

dst_ag = _mm_add_epi16(dst_ag, c_128);

168

dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

169

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

170

// Combine back into RGBA.

171

dst_pixel = _mm_or_si128(dst_rb, dst_ag);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

172

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

173

// Add result

174

__m128i result = _mm_add_epi8(src_pixel, dst_pixel);

175

_mm_store_si128(d, result);

s++;

d++;

count -= 4;

}

#else

__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

182

__m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)

183

while (count >= 4) {

184

// Load 4 pixels

185

__m128i src_pixel = _mm_loadu_si128(s);

186

__m128i dst_pixel = _mm_load_si128(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

187

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

188

__m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

senorblanco@chromium.org

f3f0bd7

2009-12-10 22:46:31 +0000

[diff] [blame]

189

__m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

190

senorblanco@chromium.org

f3f0bd7

2009-12-10 22:46:31 +0000

[diff] [blame]

191

// (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word)

192

__m128i alpha = _mm_srli_epi16(src_pixel, 8);

193

194

// (a0, a0, a1, a1, a2, g2, a3, g3)

195

alpha = _mm_shufflehi_epi16(alpha, 0xF5);

196

197

// (a0, a0, a1, a1, a2, a2, a3, a3)

198

alpha = _mm_shufflelo_epi16(alpha, 0xF5);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

199

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

200

// Subtract alphas from 256, to get 1..256

201

alpha = _mm_sub_epi16(c_256, alpha);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

202

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

203

// Multiply by red and blue by src alpha.

204

dst_rb = _mm_mullo_epi16(dst_rb, alpha);

205

// Multiply by alpha and green by src alpha.

206

dst_ag = _mm_mullo_epi16(dst_ag, alpha);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

207

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

208

// Divide by 256.

209

dst_rb = _mm_srli_epi16(dst_rb, 8);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

210

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

211

// Mask out high bits (already in the right place)

212

dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

213

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

214

// Combine back into RGBA.

215

dst_pixel = _mm_or_si128(dst_rb, dst_ag);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

216

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

217

// Add result

218

__m128i result = _mm_add_epi8(src_pixel, dst_pixel);

219

_mm_store_si128(d, result);

s++;

d++;

count -= 4;

}

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

224

#endif

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

225

src = reinterpret_cast<const SkPMColor*>(s);

226

dst = reinterpret_cast<SkPMColor*>(d);

227

}

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

228

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

229

while (count > 0) {

230

*dst = SkPMSrcOver(*src, *dst);

src++;

dst++;

count--;

}

}

senorblanco@chromium.org

2009-11-16 21:09:00 +0000

[diff] [blame]

237

void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,

238

const SkPMColor* SK_RESTRICT src,

239

int count, U8CPU alpha) {

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

240

SkASSERT(alpha <= 255);

if (count <= 0) {

return;

}

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

245

if (count >= 4) {

246

while (((size_t)dst & 0x0F) != 0) {

247

*dst = SkBlendARGB32(*src, *dst, alpha);

src++;

dst++;

count--;

}

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

252

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

253

uint32_t src_scale = SkAlpha255To256(alpha);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

254

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

255

const __m128i *s = reinterpret_cast<const __m128i*>(src);

256

__m128i *d = reinterpret_cast<__m128i*>(dst);

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

257

__m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

258

__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

259

__m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit)

260

while (count >= 4) {

261

// Load 4 pixels each of src and dest.

262

__m128i src_pixel = _mm_loadu_si128(s);

263

__m128i dst_pixel = _mm_load_si128(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

264

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

265

// Get red and blue pixels into lower byte of each word.

266

__m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

267

__m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

268

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

269

// Get alpha and green into lower byte of each word.

270

__m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);

271

__m128i src_ag = _mm_srli_epi16(src_pixel, 8);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

272

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

273

// Put per-pixel alpha in low byte of each word.

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

274

// After the following two statements, the dst_alpha looks like

275

// (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

276

__m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);

277

dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

278

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

279

// dst_alpha = dst_alpha * src_scale

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

280

// Because src_scales are in the higher byte of each word and

281

// we use mulhi here, the resulting alpha values are already

282

// in the right place and don't need to be divided by 256.

283

// (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)

284

dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

285

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

286

// Subtract alphas from 256, to get 1..256

287

dst_alpha = _mm_sub_epi16(c_256, dst_alpha);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

288

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

289

// Multiply red and blue by dst pixel alpha.

290

dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);

291

// Multiply alpha and green by dst pixel alpha.

292

dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

293

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

294

// Multiply red and blue by global alpha.

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

295

// (4 x (0, rs.h, 0, bs.h))

296

// where rs.h stands for the higher byte of r * src_scale,

297

// and bs.h the higher byte of b * src_scale.

298

// Again, because we use mulhi, the resuling red and blue

299

// values are already in the right place and don't need to

300

// be divided by 256.

301

src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

302

// Multiply alpha and green by global alpha.

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

303

// (4 x (0, as.h, 0, gs.h))

304

src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

305

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

306

// Divide by 256.

307

dst_rb = _mm_srli_epi16(dst_rb, 8);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

308

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

309

// Mask out low bits (goodies already in the right place; no need to divide)

310

dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

311

// Shift alpha and green to higher byte of each word.

312

// (4 x (as.h, 0, gs.h, 0))

313

src_ag = _mm_slli_epi16(src_ag, 8);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

314

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

315

// Combine back into RGBA.

316

dst_pixel = _mm_or_si128(dst_rb, dst_ag);

317

src_pixel = _mm_or_si128(src_rb, src_ag);

318

319

// Add two pixels into result.

320

__m128i result = _mm_add_epi8(src_pixel, dst_pixel);

321

_mm_store_si128(d, result);

s++;

d++;

count -= 4;

}

src = reinterpret_cast<const SkPMColor*>(s);

327

dst = reinterpret_cast<SkPMColor*>(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

328

}

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

329

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

330

while (count > 0) {

331

*dst = SkBlendARGB32(*src, *dst, alpha);

src++;

dst++;

count--;

}

}

senorblanco@chromium.org

2010-12-13 15:27:20 +0000

[diff] [blame]

337

338

/* SSE2 version of Color32()

339

* portable version is in core/SkBlitRow_D32.cpp

340

*/

341

void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,

SkPMColor color) {

if (count <= 0) {

return;

}

if (0 == color) {

if (src != dst) {

memcpy(dst, src, count * sizeof(SkPMColor));

351

}

reed@google.com

c909a1e

2011-10-25 19:07:23 +0000

[diff] [blame]

352

return;

senorblanco@chromium.org

2010-12-13 15:27:20 +0000

[diff] [blame]

353

}

354

355

unsigned colorA = SkGetPackedA32(color);

356

if (255 == colorA) {

357

sk_memset32(dst, color, count);

358

} else {

359

unsigned scale = 256 - SkAlpha255To256(colorA);

360

361

if (count >= 4) {

362

SkASSERT(((size_t)dst & 0x03) == 0);

363

while (((size_t)dst & 0x0F) != 0) {

364

*dst = color + SkAlphaMulQ(*src, scale);

src++;

dst++;

count--;

}

const __m128i *s = reinterpret_cast<const __m128i*>(src);

371

__m128i *d = reinterpret_cast<__m128i*>(dst);

372

__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

373

__m128i src_scale_wide = _mm_set1_epi16(scale);

374

__m128i color_wide = _mm_set1_epi32(color);

375

while (count >= 4) {

376

// Load 4 pixels each of src and dest.

377

__m128i src_pixel = _mm_loadu_si128(s);

378

379

// Get red and blue pixels into lower byte of each word.

380

__m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

reed@google.com

2011-03-09 12:55:47 +0000

[diff] [blame]

381

senorblanco@chromium.org

2010-12-13 15:27:20 +0000

[diff] [blame]

382

// Get alpha and green into lower byte of each word.

383

__m128i src_ag = _mm_srli_epi16(src_pixel, 8);

384

385

// Multiply by scale.

386

src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);

387

src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);

388

389

// Divide by 256.

390

src_rb = _mm_srli_epi16(src_rb, 8);

391

src_ag = _mm_andnot_si128(rb_mask, src_ag);

392

393

// Combine back into RGBA.

394

src_pixel = _mm_or_si128(src_rb, src_ag);

395

396

// Add color to result.

397

__m128i result = _mm_add_epi8(color_wide, src_pixel);

398

399

// Store result.

400

_mm_store_si128(d, result);

s++;

d++;

count -= 4;

}

src = reinterpret_cast<const SkPMColor*>(s);

406

dst = reinterpret_cast<SkPMColor*>(d);

}

while (count > 0) {

*dst = color + SkAlphaMulQ(*src, scale);

411

src += 1;

412

dst += 1;

413

count--;

reed@google.com

2011-03-09 12:55:47 +0000

[diff] [blame]

414

}

senorblanco@chromium.org

2010-12-13 15:27:20 +0000

[diff] [blame]

415

}

416

}

reed@google.com

2011-03-09 12:55:47 +0000

[diff] [blame]

417

reed@google.com

edb606c

2011-10-18 13:56:50 +0000

[diff] [blame]

418

void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,

419

size_t maskRB, SkColor origColor,

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

420

int width, int height) {

reed@google.com

ee467ee

2011-03-09 13:23:57 +0000

[diff] [blame]

421

SkPMColor color = SkPreMultiplyColor(origColor);

reed@google.com

2011-03-09 12:55:47 +0000

[diff] [blame]

422

size_t dstOffset = dstRB - (width << 2);

423

size_t maskOffset = maskRB - width;

424

SkPMColor* dst = (SkPMColor *)device;

reed@google.com

edb606c

2011-10-18 13:56:50 +0000

[diff] [blame]

425

const uint8_t* mask = (const uint8_t*)maskPtr;

reed@google.com

2011-03-09 12:55:47 +0000

[diff] [blame]

do {

int count = width;

if (count >= 4) {

while (((size_t)dst & 0x0F) != 0 && (count > 0)) {

430

*dst = SkBlendARGB32(color, *dst, *mask);

mask++;

dst++;

count--;

}

__m128i *d = reinterpret_cast<__m128i*>(dst);

436

__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

437

__m128i c_256 = _mm_set1_epi16(256);

438

__m128i c_1 = _mm_set1_epi16(1);

439

__m128i src_pixel = _mm_set1_epi32(color);

440

while (count >= 4) {

441

// Load 4 pixels each of src and dest.

442

__m128i dst_pixel = _mm_load_si128(d);

443

444

//set the aphla value

445

__m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\

446

0, *(mask+3),0, \

447

*(mask+2),0, *(mask+2),\

448

0,*(mask+1), 0,*(mask+1),\

449

0, *mask,0,*mask);

450

451

//call SkAlpha255To256()

452

src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);

453

454

// Get red and blue pixels into lower byte of each word.

455

__m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

456

__m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

457

458

// Get alpha and green into lower byte of each word.

459

__m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);

460

__m128i src_ag = _mm_srli_epi16(src_pixel, 8);

461

462

// Put per-pixel alpha in low byte of each word.

463

__m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);

464

dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);

465

466

// dst_alpha = dst_alpha * src_scale

467

dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);

468

469

// Divide by 256.

470

dst_alpha = _mm_srli_epi16(dst_alpha, 8);

471

472

// Subtract alphas from 256, to get 1..256

473

dst_alpha = _mm_sub_epi16(c_256, dst_alpha);

474

// Multiply red and blue by dst pixel alpha.

475

dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);

476

// Multiply alpha and green by dst pixel alpha.

477

dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);

478

479

// Multiply red and blue by global alpha.

480

src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);

481

// Multiply alpha and green by global alpha.

482

src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);

483

// Divide by 256.

484

dst_rb = _mm_srli_epi16(dst_rb, 8);

485

src_rb = _mm_srli_epi16(src_rb, 8);

486

487

// Mask out low bits (goodies already in the right place; no need to divide)

488

dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

489

src_ag = _mm_andnot_si128(rb_mask, src_ag);

490

491

// Combine back into RGBA.

492

dst_pixel = _mm_or_si128(dst_rb, dst_ag);

493

__m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);

494

495

// Add two pixels into result.

496

__m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);

497

_mm_store_si128(d, result);

498

// load the next 4 pixel

mask = mask + 4;

d++;

count -= 4;

}

dst = reinterpret_cast<SkPMColor *>(d);

504

}

505

while(count > 0) {

506

*dst= SkBlendARGB32(color, *dst, *mask);

dst += 1;

mask++;

count --;

}

dst = (SkPMColor *)((char*)dst + dstOffset);

512

mask += maskOffset;

513

} while (--height != 0);

514

}

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

515

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

516

// Signed shift amounts that move the top 5 bits of each component of a packed
// 16-bit mask pixel onto the matching component position of an SkPMColor.
// A positive value means shift left, a negative value means shift right.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Red: apply the shift above to every 32-bit lane of x. The result is not
// masked; callers isolate the 5 bits they need.
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

// Green: same selection as for red.
#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

// Blue: same selection as for red.
#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif

546

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

547

static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,

548

__m128i &mask, __m128i &srcA) {

549

// In the following comments, the components of src, dst and mask are

550

// abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked

551

// by an R, G, B, or A suffix. Components of one of the four pixels that

552

// are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for

553

// example is the blue channel of the second destination pixel. Memory

554

// layout is shown for an ARGB byte order in a color value.

555

556

// src and srcA store 8-bit values interleaved with zeros.

557

// src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

558

// srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,

559

// srcA, 0, srcA, 0, srcA, 0, srcA, 0)

560

// mask stores 16-bit values (compressed three channels) interleaved with zeros.

561

// Lo and Hi denote the low and high bytes of a 16-bit value, respectively.

562

// mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

563

// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

564

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

565

// Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

566

// r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

567

__m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),

568

_mm_set1_epi32(0x1F << SK_R32_SHIFT));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

569

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

570

// g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

571

__m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),

572

_mm_set1_epi32(0x1F << SK_G32_SHIFT));

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

573

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

574

// b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

575

__m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),

576

_mm_set1_epi32(0x1F << SK_B32_SHIFT));

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

577

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

578

// Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

579

// Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an

580

// 8-bit position

581

// mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,

582

// 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

583

mask = _mm_or_si128(_mm_or_si128(r, g), b);

584

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

585

// Interleave R,G,B into the lower byte of word.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

586

// i.e. split the sixteen 8-bit values from mask into two sets of eight

587

// 16-bit values, padded by zero.

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

588

__m128i maskLo, maskHi;

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

589

// maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

590

maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

591

// maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

592

maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

593

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

594

// Upscale from 0..31 to 0..32

595

// (allows the division to be replaced by a right shift further down)

596

// Right-shift each component by 4 and add the result back to that component,

597

// mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

598

maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));

599

maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

600

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

601

// Multiply each component of maskLo and maskHi by srcA

602

maskLo = _mm_mullo_epi16(maskLo, srcA);

603

maskHi = _mm_mullo_epi16(maskHi, srcA);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

604

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

605

// Right shift mask components by 8 (divide by 256)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

606

maskLo = _mm_srli_epi16(maskLo, 8);

607

maskHi = _mm_srli_epi16(maskHi, 8);

608

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

609

// Interleave R,G,B into the lower byte of the word

610

// dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

611

__m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

612

// dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

613

__m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

614

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

615

// mask = (src - dst) * mask

616

maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));

617

maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

618

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

619

// mask = (src - dst) * mask >> 5

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

620

maskLo = _mm_srai_epi16(maskLo, 5);

621

maskHi = _mm_srai_epi16(maskHi, 5);

622

623

// Add two pixels into result.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

624

// result = dst + ((src - dst) * mask >> 5)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

625

__m128i resultLo = _mm_add_epi16(dstLo, maskLo);

626

__m128i resultHi = _mm_add_epi16(dstHi, maskHi);

627

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

628

// Pack into 4 32bit dst pixels.

629

// resultLo and resultHi contain eight 16-bit components (two pixels) each.

630

// Merge into one SSE register with sixteen 8-bit values (four pixels),

631

// clamping to 255 if necessary.

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

632

return _mm_packus_epi16(resultLo, resultHi);

633

}

634

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

635

static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

636

__m128i &mask) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

637

// In the following comments, the components of src, dst and mask are

638

// abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked

639

// by an R, G, B, or A suffix. Components of one of the four pixels that

640

// are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for

641

// example is the blue channel of the second destination pixel. Memory

642

// layout is shown for an ARGB byte order in a color value.

643

644

// src and srcA store 8-bit values interleaved with zeros.

645

// src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

646

// mask stores 16-bit values (shown as high and low bytes) interleaved with

647

// zeros

648

// mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

649

// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

650

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

651

// Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

652

// r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

653

__m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),

654

_mm_set1_epi32(0x1F << SK_R32_SHIFT));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

655

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

656

// g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

657

__m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),

658

_mm_set1_epi32(0x1F << SK_G32_SHIFT));

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

659

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

660

// b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

661

__m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),

662

_mm_set1_epi32(0x1F << SK_B32_SHIFT));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

663

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

664

// Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

665

// Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an

666

// 8-bit position

667

// mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,

668

// 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

669

mask = _mm_or_si128(_mm_or_si128(r, g), b);

670

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

671

// Interleave R,G,B into the lower byte of word.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

672

// i.e. split the sixteen 8-bit values from mask into two sets of eight

673

// 16-bit values, padded by zero.

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

674

__m128i maskLo, maskHi;

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

675

// maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

676

maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

677

// maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

678

maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

679

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

680

// Upscale from 0..31 to 0..32

681

// (allows to replace division by left-shift further down)

682

// Left-shift each component by 4 and add the result back to that component,

683

// mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

684

maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));

685

maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

686

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

687

// Interleave R,G,B into the lower byte of the word

688

// dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

689

__m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

690

// dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

691

__m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

692

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

693

// mask = (src - dst) * mask

694

maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));

695

maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

696

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

697

// mask = (src - dst) * mask >> 5

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

698

maskLo = _mm_srai_epi16(maskLo, 5);

699

maskHi = _mm_srai_epi16(maskHi, 5);

700

701

// Add two pixels into result.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

702

// result = dst + ((src - dst) * mask >> 5)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

703

__m128i resultLo = _mm_add_epi16(dstLo, maskLo);

704

__m128i resultHi = _mm_add_epi16(dstHi, maskHi);

705

bungeman@google.com

27123cd

2012-08-21 19:25:42 +0000

[diff] [blame]

706

// Pack into 4 32bit dst pixels and force opaque.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

707

// resultLo and resultHi contain eight 16-bit components (two pixels) each.

708

// Merge into one SSE regsiter with sixteen 8-bit values (four pixels),

709

// clamping to 255 if necessary. Set alpha components to 0xFF.

bungeman@google.com

27123cd

2012-08-21 19:25:42 +0000

[diff] [blame]

710

return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),

711

_mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

712

}

713

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

714

void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],

715

SkColor src, int width, SkPMColor) {

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

if (width <= 0) {

return;

}

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

720

int srcA = SkColorGetA(src);

721

int srcR = SkColorGetR(src);

722

int srcG = SkColorGetG(src);

723

int srcB = SkColorGetB(src);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

724

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

725

srcA = SkAlpha255To256(srcA);

726

727

if (width >= 4) {

728

SkASSERT(((size_t)dst & 0x03) == 0);

729

while (((size_t)dst & 0x0F) != 0) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

730

*dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);

731

mask++;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

dst++;

width--;

}

__m128i *d = reinterpret_cast<__m128i*>(dst);

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

737

// Set alpha to 0xFF and replicate source four times in SSE register.

738

__m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));

739

// Interleave with zeros to get two sets of four 16-bit values.

740

src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());

741

// Set srcA_sse to contain eight copies of srcA, padded with zero.

742

// src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

743

__m128i srcA_sse = _mm_set1_epi16(srcA);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

744

while (width >= 4) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

745

// Load four destination pixels into dst_sse.

746

__m128i dst_sse = _mm_load_si128(d);

747

// Load four 16-bit masks into lower half of mask_sse.

748

__m128i mask_sse = _mm_loadl_epi64(

749

reinterpret_cast<const __m128i*>(mask));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

750

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

751

// Check whether masks are equal to 0 and get the highest bit

752

// of each byte of result, if masks are all zero, we will get

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

753

// pack_cmp to 0xFFFF

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

754

int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

755

_mm_setzero_si128()));

756

757

// if mask pixels are not all zero, we will blend the dst pixels

758

if (pack_cmp != 0xFFFF) {

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

759

// Unpack 4 16bit mask pixels to

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

760

// mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

761

// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

762

mask_sse = _mm_unpacklo_epi16(mask_sse,

763

_mm_setzero_si128());

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

764

765

// Process 4 32bit dst pixels

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

766

__m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,

767

mask_sse, srcA_sse);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

768

_mm_store_si128(d, result);

769

}

770

771

d++;

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

772

mask += 4;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

width -= 4;

}

dst = reinterpret_cast<SkPMColor*>(d);

777

}

778

779

while (width > 0) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

780

*dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);

781

mask++;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

782

dst++;

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

783

width--;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

784

}

785

}

786

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

787

void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],

788

SkColor src, int width, SkPMColor opaqueDst) {

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

if (width <= 0) {

return;

}

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

793

int srcR = SkColorGetR(src);

794

int srcG = SkColorGetG(src);

795

int srcB = SkColorGetB(src);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

796

797

if (width >= 4) {

798

SkASSERT(((size_t)dst & 0x03) == 0);

799

while (((size_t)dst & 0x0F) != 0) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

800

*dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);

801

mask++;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

dst++;

width--;

}

__m128i *d = reinterpret_cast<__m128i*>(dst);

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

807

// Set alpha to 0xFF and replicate source four times in SSE register.

808

__m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));

809

// Interleave with zeros to get two sets of four 16-bit values.

810

// src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

811

src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

812

while (width >= 4) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

813

// Load four destination pixels into dst_sse.

814

__m128i dst_sse = _mm_load_si128(d);

815

// Load four 16-bit masks into lower half of mask_sse.

816

__m128i mask_sse = _mm_loadl_epi64(

817

reinterpret_cast<const __m128i*>(mask));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

818

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

819

// Check whether masks are equal to 0 and get the highest bit

820

// of each byte of result, if masks are all zero, we will get

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

821

// pack_cmp to 0xFFFF

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

822

int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

823

_mm_setzero_si128()));

824

825

// if mask pixels are not all zero, we will blend the dst pixels

826

if (pack_cmp != 0xFFFF) {

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

827

// Unpack 4 16bit mask pixels to

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

828

// mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

829

// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

830

mask_sse = _mm_unpacklo_epi16(mask_sse,

831

_mm_setzero_si128());

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

832

833

// Process 4 32bit dst pixels

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

834

__m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,

835

mask_sse);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

836

_mm_store_si128(d, result);

837

}

838

839

d++;

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

840

mask += 4;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

width -= 4;

}

dst = reinterpret_cast<SkPMColor*>(d);

845

}

846

847

while (width > 0) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

848

*dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);

849

mask++;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

850

dst++;

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

851

width--;

tomhudson@google.com