Blame - src/opts/SkBlitRow_opts_SSE2.cpp - platform/external/skia

2009-11-04 20:51:06 +0000

[diff] [blame]

1

/*

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

2

epoger@google.com

ec3ed6a

2011-07-28 14:26:00 +0000

[diff] [blame]

3

*

4

* Use of this source code is governed by a BSD-style license that can be

5

* found in the LICENSE file.

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

6

*/

7

commit-bot@chromium.org

2014-04-30 14:58:46 +0000

[diff] [blame]

8

#include <emmintrin.h>

caryclark@google.com

83ecdc3

2012-06-06 12:10:26 +0000

[diff] [blame]

9

#include "SkBitmapProcState_opts_SSE2.h"

commit-bot@chromium.org

2014-04-30 14:58:46 +0000

[diff] [blame]

10

#include "SkBlitRow_opts_SSE2.h"

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

11

#include "SkColorPriv.h"

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

12

#include "SkColor_opts_SSE2.h"

commit-bot@chromium.org

2758047

2014-03-07 03:25:32 +0000

[diff] [blame]

13

#include "SkDither.h"

senorblanco@chromium.org

2010-12-13 15:27:20 +0000

[diff] [blame]

14

#include "SkUtils.h"

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

15

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

16

/* SSE2 version of S32_Blend_BlitRow32()

17

* portable version is in core/SkBlitRow_D32.cpp

18

*/

senorblanco@chromium.org

2009-11-16 21:09:00 +0000

[diff] [blame]

19

void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,

20

const SkPMColor* SK_RESTRICT src,

21

int count, U8CPU alpha) {

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

22

SkASSERT(alpha <= 255);

if (count <= 0) {

return;

}

uint32_t src_scale = SkAlpha255To256(alpha);

28

uint32_t dst_scale = 256 - src_scale;

29

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

30

if (count >= 4) {

31

SkASSERT(((size_t)dst & 0x03) == 0);

32

while (((size_t)dst & 0x0F) != 0) {

33

*dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);

src++;

dst++;

count--;

}

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

38

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

39

const __m128i *s = reinterpret_cast<const __m128i*>(src);

40

__m128i *d = reinterpret_cast<__m128i*>(dst);

41

__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

42

__m128i ag_mask = _mm_set1_epi32(0xFF00FF00);

43

44

// Move scale factors to upper byte of word

45

__m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);

46

__m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

47

while (count >= 4) {

48

// Load 4 pixels each of src and dest.

49

__m128i src_pixel = _mm_loadu_si128(s);

50

__m128i dst_pixel = _mm_load_si128(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

51

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

52

// Interleave Atom port 0/1 operations based on the execution port

53

// constraints that multiply can only be executed on port 0 (while

54

// boolean operations can be executed on either port 0 or port 1)

55

// because GCC currently doesn't do a good job scheduling

56

// instructions based on these constraints.

57

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

58

// Get red and blue pixels into lower byte of each word.

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

59

// (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

60

__m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

61

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

62

// Multiply by scale.

63

// (4 x (0, rs.h, 0, bs.h))

64

// where rs.h stands for the higher byte of r * scale, and

65

// bs.h the higher byte of b * scale.

66

src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);

67

68

// Get alpha and green pixels into higher byte of each word.

69

// (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)

70

__m128i src_ag = _mm_and_si128(ag_mask, src_pixel);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

71

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

72

// Multiply by scale.

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

73

// (4 x (as.h, as.l, gs.h, gs.l))

74

src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

75

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

76

// Clear the lower byte of the a*scale and g*scale results

77

// (4 x (as.h, 0, gs.h, 0))

78

src_ag = _mm_and_si128(src_ag, ag_mask);

79

80

// Operations the destination pixels are the same as on the

81

// source pixels. See the comments above.

82

__m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

83

dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);

84

__m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);

85

dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);

86

dst_ag = _mm_and_si128(dst_ag, ag_mask);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

87

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

88

// Combine back into RGBA.

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

89

// (4 x (as.h, rs.h, gs.h, bs.h))

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

90

src_pixel = _mm_or_si128(src_rb, src_ag);

91

dst_pixel = _mm_or_si128(dst_rb, dst_ag);

92

93

// Add result

94

__m128i result = _mm_add_epi8(src_pixel, dst_pixel);

95

_mm_store_si128(d, result);

s++;

d++;

count -= 4;

}

src = reinterpret_cast<const SkPMColor*>(s);

101

dst = reinterpret_cast<SkPMColor*>(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

102

}

103

senorblanco@chromium.org

2009-11-16 21:09:00 +0000

[diff] [blame]

104

while (count > 0) {

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

105

*dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);

src++;

dst++;

count--;

}

}

senorblanco@chromium.org

2009-11-16 21:09:00 +0000

[diff] [blame]

112

void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,

113

const SkPMColor* SK_RESTRICT src,

114

int count, U8CPU alpha) {

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

115

SkASSERT(alpha == 255);

116

if (count <= 0) {

117

return;

118

}

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

119

120

if (count >= 4) {

121

SkASSERT(((size_t)dst & 0x03) == 0);

122

while (((size_t)dst & 0x0F) != 0) {

123

*dst = SkPMSrcOver(*src, *dst);

src++;

dst++;

count--;

}

const __m128i *s = reinterpret_cast<const __m128i*>(src);

130

__m128i *d = reinterpret_cast<__m128i*>(dst);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

131

#ifdef SK_USE_ACCURATE_BLENDING

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

132

__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

133

__m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit)

134

__m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit)

135

while (count >= 4) {

136

// Load 4 pixels

137

__m128i src_pixel = _mm_loadu_si128(s);

138

__m128i dst_pixel = _mm_load_si128(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

139

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

140

__m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

senorblanco@chromium.org

f3f0bd7

2009-12-10 22:46:31 +0000

[diff] [blame]

141

__m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

142

// Shift alphas down to lower 8 bits of each quad.

143

__m128i alpha = _mm_srli_epi32(src_pixel, 24);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

144

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

145

// Copy alpha to upper 3rd byte of each quad

146

alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

147

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

148

// Subtract alphas from 255, to get 0..255

149

alpha = _mm_sub_epi16(c_255, alpha);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

150

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

151

// Multiply by red and blue by src alpha.

152

dst_rb = _mm_mullo_epi16(dst_rb, alpha);

153

// Multiply by alpha and green by src alpha.

154

dst_ag = _mm_mullo_epi16(dst_ag, alpha);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

155

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

156

// dst_rb_low = (dst_rb >> 8)

157

__m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);

158

__m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

159

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

160

// dst_rb = (dst_rb + dst_rb_low + 128) >> 8

161

dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);

162

dst_rb = _mm_add_epi16(dst_rb, c_128);

163

dst_rb = _mm_srli_epi16(dst_rb, 8);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

164

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

165

// dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask

166

dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);

167

dst_ag = _mm_add_epi16(dst_ag, c_128);

168

dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

169

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

170

// Combine back into RGBA.

171

dst_pixel = _mm_or_si128(dst_rb, dst_ag);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

172

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

173

// Add result

174

__m128i result = _mm_add_epi8(src_pixel, dst_pixel);

175

_mm_store_si128(d, result);

s++;

d++;

count -= 4;

}

commit-bot@chromium.org

2014-04-30 14:58:46 +0000

[diff] [blame]

180

#else

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

181

__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

182

__m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit)

183

while (count >= 4) {

184

// Load 4 pixels

185

__m128i src_pixel = _mm_loadu_si128(s);

186

__m128i dst_pixel = _mm_load_si128(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

187

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

188

__m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

senorblanco@chromium.org

f3f0bd7

2009-12-10 22:46:31 +0000

[diff] [blame]

189

__m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

190

senorblanco@chromium.org

f3f0bd7

2009-12-10 22:46:31 +0000

[diff] [blame]

191

// (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word)

192

__m128i alpha = _mm_srli_epi16(src_pixel, 8);

193

194

// (a0, a0, a1, a1, a2, g2, a3, g3)

195

alpha = _mm_shufflehi_epi16(alpha, 0xF5);

196

197

// (a0, a0, a1, a1, a2, a2, a3, a3)

198

alpha = _mm_shufflelo_epi16(alpha, 0xF5);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

199

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

200

// Subtract alphas from 256, to get 1..256

201

alpha = _mm_sub_epi16(c_256, alpha);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

202

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

203

// Multiply by red and blue by src alpha.

204

dst_rb = _mm_mullo_epi16(dst_rb, alpha);

205

// Multiply by alpha and green by src alpha.

206

dst_ag = _mm_mullo_epi16(dst_ag, alpha);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

207

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

208

// Divide by 256.

209

dst_rb = _mm_srli_epi16(dst_rb, 8);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

210

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

211

// Mask out high bits (already in the right place)

212

dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

213

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

214

// Combine back into RGBA.

215

dst_pixel = _mm_or_si128(dst_rb, dst_ag);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

216

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

217

// Add result

218

__m128i result = _mm_add_epi8(src_pixel, dst_pixel);

219

_mm_store_si128(d, result);

s++;

d++;

count -= 4;

}

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

224

#endif

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

225

src = reinterpret_cast<const SkPMColor*>(s);

226

dst = reinterpret_cast<SkPMColor*>(d);

227

}

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

228

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

229

while (count > 0) {

230

*dst = SkPMSrcOver(*src, *dst);

src++;

dst++;

count--;

}

}

senorblanco@chromium.org

2009-11-16 21:09:00 +0000

[diff] [blame]

237

void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,

238

const SkPMColor* SK_RESTRICT src,

239

int count, U8CPU alpha) {

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

240

SkASSERT(alpha <= 255);

if (count <= 0) {

return;

}

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

245

if (count >= 4) {

246

while (((size_t)dst & 0x0F) != 0) {

247

*dst = SkBlendARGB32(*src, *dst, alpha);

src++;

dst++;

count--;

}

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

252

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

253

uint32_t src_scale = SkAlpha255To256(alpha);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

254

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

255

const __m128i *s = reinterpret_cast<const __m128i*>(src);

256

__m128i *d = reinterpret_cast<__m128i*>(dst);

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

257

__m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

258

__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

259

__m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit)

260

while (count >= 4) {

261

// Load 4 pixels each of src and dest.

262

__m128i src_pixel = _mm_loadu_si128(s);

263

__m128i dst_pixel = _mm_load_si128(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

264

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

265

// Get red and blue pixels into lower byte of each word.

266

__m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

267

__m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

268

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

269

// Get alpha and green into lower byte of each word.

270

__m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);

271

__m128i src_ag = _mm_srli_epi16(src_pixel, 8);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

272

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

273

// Put per-pixel alpha in low byte of each word.

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

274

// After the following two statements, the dst_alpha looks like

275

// (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

276

__m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);

277

dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

278

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

279

// dst_alpha = dst_alpha * src_scale

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

280

// Because src_scales are in the higher byte of each word and

281

// we use mulhi here, the resulting alpha values are already

282

// in the right place and don't need to be divided by 256.

283

// (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)

284

dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

285

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

286

// Subtract alphas from 256, to get 1..256

287

dst_alpha = _mm_sub_epi16(c_256, dst_alpha);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

288

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

289

// Multiply red and blue by dst pixel alpha.

290

dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);

291

// Multiply alpha and green by dst pixel alpha.

292

dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

293

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

294

// Multiply red and blue by global alpha.

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

295

// (4 x (0, rs.h, 0, bs.h))

296

// where rs.h stands for the higher byte of r * src_scale,

297

// and bs.h the higher byte of b * src_scale.

298

// Again, because we use mulhi, the resuling red and blue

299

// values are already in the right place and don't need to

300

// be divided by 256.

301

src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

302

// Multiply alpha and green by global alpha.

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

303

// (4 x (0, as.h, 0, gs.h))

304

src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

305

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

306

// Divide by 256.

307

dst_rb = _mm_srli_epi16(dst_rb, 8);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

308

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

309

// Mask out low bits (goodies already in the right place; no need to divide)

310

dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

tomhudson@google.com

2012-02-28 16:15:26 +0000

[diff] [blame]

311

// Shift alpha and green to higher byte of each word.

312

// (4 x (as.h, 0, gs.h, 0))

313

src_ag = _mm_slli_epi16(src_ag, 8);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

314

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

315

// Combine back into RGBA.

316

dst_pixel = _mm_or_si128(dst_rb, dst_ag);

317

src_pixel = _mm_or_si128(src_rb, src_ag);

318

319

// Add two pixels into result.

320

__m128i result = _mm_add_epi8(src_pixel, dst_pixel);

321

_mm_store_si128(d, result);

s++;

d++;

count -= 4;

}

src = reinterpret_cast<const SkPMColor*>(s);

327

dst = reinterpret_cast<SkPMColor*>(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

328

}

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

329

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

330

while (count > 0) {

331

*dst = SkBlendARGB32(*src, *dst, alpha);

src++;

dst++;

count--;

}

}

senorblanco@chromium.org

2010-12-13 15:27:20 +0000

[diff] [blame]

337

338

/* SSE2 version of Color32()

339

* portable version is in core/SkBlitRow_D32.cpp

340

*/

341

void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,

342

SkPMColor color) {

senorblanco@chromium.org

2010-12-13 15:27:20 +0000

[diff] [blame]

if (count <= 0) {

return;

}

if (0 == color) {

if (src != dst) {

memcpy(dst, src, count * sizeof(SkPMColor));

350

}

reed@google.com

c909a1e

2011-10-25 19:07:23 +0000

[diff] [blame]

351

return;

senorblanco@chromium.org

2010-12-13 15:27:20 +0000

[diff] [blame]

352

}

353

354

unsigned colorA = SkGetPackedA32(color);

355

if (255 == colorA) {

356

sk_memset32(dst, color, count);

357

} else {

358

unsigned scale = 256 - SkAlpha255To256(colorA);

359

360

if (count >= 4) {

361

SkASSERT(((size_t)dst & 0x03) == 0);

362

while (((size_t)dst & 0x0F) != 0) {

363

*dst = color + SkAlphaMulQ(*src, scale);

src++;

dst++;

count--;

}

const __m128i *s = reinterpret_cast<const __m128i*>(src);

370

__m128i *d = reinterpret_cast<__m128i*>(dst);

371

__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

372

__m128i src_scale_wide = _mm_set1_epi16(scale);

373

__m128i color_wide = _mm_set1_epi32(color);

374

while (count >= 4) {

375

// Load 4 pixels each of src and dest.

376

__m128i src_pixel = _mm_loadu_si128(s);

377

378

// Get red and blue pixels into lower byte of each word.

379

__m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

reed@google.com

2011-03-09 12:55:47 +0000

[diff] [blame]

380

senorblanco@chromium.org

2010-12-13 15:27:20 +0000

[diff] [blame]

381

// Get alpha and green into lower byte of each word.

382

__m128i src_ag = _mm_srli_epi16(src_pixel, 8);

383

384

// Multiply by scale.

385

src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);

386

src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);

387

388

// Divide by 256.

389

src_rb = _mm_srli_epi16(src_rb, 8);

390

src_ag = _mm_andnot_si128(rb_mask, src_ag);

391

392

// Combine back into RGBA.

393

src_pixel = _mm_or_si128(src_rb, src_ag);

394

395

// Add color to result.

396

__m128i result = _mm_add_epi8(color_wide, src_pixel);

397

398

// Store result.

399

_mm_store_si128(d, result);

s++;

d++;

count -= 4;

}

src = reinterpret_cast<const SkPMColor*>(s);

405

dst = reinterpret_cast<SkPMColor*>(d);

commit-bot@chromium.org

2014-04-30 14:58:46 +0000

[diff] [blame]

406

}

senorblanco@chromium.org

2010-12-13 15:27:20 +0000

[diff] [blame]

407

408

while (count > 0) {

409

*dst = color + SkAlphaMulQ(*src, scale);

410

src += 1;

411

dst += 1;

412

count--;

reed@google.com

2011-03-09 12:55:47 +0000

[diff] [blame]

413

}

senorblanco@chromium.org

2010-12-13 15:27:20 +0000

[diff] [blame]

414

}

415

}

reed@google.com

2011-03-09 12:55:47 +0000

[diff] [blame]

416

reed@google.com

edb606c

2011-10-18 13:56:50 +0000

[diff] [blame]

417

void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,

418

size_t maskRB, SkColor origColor,

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

419

int width, int height) {

reed@google.com

ee467ee

2011-03-09 13:23:57 +0000

[diff] [blame]

420

SkPMColor color = SkPreMultiplyColor(origColor);

reed@google.com

2011-03-09 12:55:47 +0000

[diff] [blame]

421

size_t dstOffset = dstRB - (width << 2);

422

size_t maskOffset = maskRB - width;

423

SkPMColor* dst = (SkPMColor *)device;

reed@google.com

edb606c

2011-10-18 13:56:50 +0000

[diff] [blame]

424

const uint8_t* mask = (const uint8_t*)maskPtr;

reed@google.com

2011-03-09 12:55:47 +0000

[diff] [blame]

do {

int count = width;

if (count >= 4) {

while (((size_t)dst & 0x0F) != 0 && (count > 0)) {

429

*dst = SkBlendARGB32(color, *dst, *mask);

mask++;

dst++;

count--;

}

__m128i *d = reinterpret_cast<__m128i*>(dst);

435

__m128i rb_mask = _mm_set1_epi32(0x00FF00FF);

436

__m128i c_256 = _mm_set1_epi16(256);

437

__m128i c_1 = _mm_set1_epi16(1);

438

__m128i src_pixel = _mm_set1_epi32(color);

439

while (count >= 4) {

440

// Load 4 pixels each of src and dest.

441

__m128i dst_pixel = _mm_load_si128(d);

442

443

//set the aphla value

444

__m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\

445

0, *(mask+3),0, \

446

*(mask+2),0, *(mask+2),\

447

0,*(mask+1), 0,*(mask+1),\

448

0, *mask,0,*mask);

449

450

//call SkAlpha255To256()

451

src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);

452

453

// Get red and blue pixels into lower byte of each word.

454

__m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);

455

__m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

456

457

// Get alpha and green into lower byte of each word.

458

__m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);

459

__m128i src_ag = _mm_srli_epi16(src_pixel, 8);

460

461

// Put per-pixel alpha in low byte of each word.

462

__m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);

463

dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);

464

465

// dst_alpha = dst_alpha * src_scale

466

dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);

467

468

// Divide by 256.

469

dst_alpha = _mm_srli_epi16(dst_alpha, 8);

470

471

// Subtract alphas from 256, to get 1..256

472

dst_alpha = _mm_sub_epi16(c_256, dst_alpha);

473

// Multiply red and blue by dst pixel alpha.

474

dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);

475

// Multiply alpha and green by dst pixel alpha.

476

dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);

477

478

// Multiply red and blue by global alpha.

479

src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);

480

// Multiply alpha and green by global alpha.

481

src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);

482

// Divide by 256.

483

dst_rb = _mm_srli_epi16(dst_rb, 8);

484

src_rb = _mm_srli_epi16(src_rb, 8);

485

486

// Mask out low bits (goodies already in the right place; no need to divide)

487

dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

488

src_ag = _mm_andnot_si128(rb_mask, src_ag);

489

490

// Combine back into RGBA.

491

dst_pixel = _mm_or_si128(dst_rb, dst_ag);

492

__m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);

493

494

// Add two pixels into result.

495

__m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);

496

_mm_store_si128(d, result);

497

// load the next 4 pixel

mask = mask + 4;

d++;

count -= 4;

}

dst = reinterpret_cast<SkPMColor *>(d);

503

}

commit-bot@chromium.org

2014-04-30 14:58:46 +0000

[diff] [blame]

504

while (count > 0) {

reed@google.com

2011-03-09 12:55:47 +0000

[diff] [blame]

505

*dst= SkBlendARGB32(color, *dst, *mask);

dst += 1;

mask++;

count --;

}

dst = (SkPMColor *)((char*)dst + dstOffset);

511

mask += maskOffset;

512

} while (--height != 0);

513

}

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

514

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

515

// The following (left) shifts cause the top 5 bits of the mask components to
// line up with the corresponding components in an SkPMColor.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Each SkPacked?16x5ToUnmasked?32x5_SSE2(x) applies the shift computed above
// to every 32-bit lane of x. The direction is chosen by the preprocessor:
// a shift of zero leaves x untouched, a positive shift is a left shift, and a
// negative shift becomes a right shift by the negated amount. The result is
// not masked; callers isolate the 5-bit component themselves.
#if SK_R16x5_R32x5_SHIFT == 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#elif SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#endif

#if SK_G16x5_G32x5_SHIFT == 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#elif SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#endif

#if SK_B16x5_B32x5_SHIFT == 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#elif SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#endif

545

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

546

static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,

547

__m128i &mask, __m128i &srcA) {

548

// In the following comments, the components of src, dst and mask are

549

// abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked

550

// by an R, G, B, or A suffix. Components of one of the four pixels that

551

// are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for

552

// example is the blue channel of the second destination pixel. Memory

553

// layout is shown for an ARGB byte order in a color value.

554

555

// src and srcA store 8-bit values interleaved with zeros.

556

// src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

557

// srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,

558

// srcA, 0, srcA, 0, srcA, 0, srcA, 0)

559

// mask stores 16-bit values (compressed three channels) interleaved with zeros.

560

// Lo and Hi denote the low and high bytes of a 16-bit value, respectively.

561

// mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

562

// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

563

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

564

// Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

565

// r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

566

__m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),

567

_mm_set1_epi32(0x1F << SK_R32_SHIFT));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

568

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

569

// g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

570

__m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),

571

_mm_set1_epi32(0x1F << SK_G32_SHIFT));

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

572

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

573

// b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

574

__m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),

575

_mm_set1_epi32(0x1F << SK_B32_SHIFT));

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

576

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

577

// Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

578

// Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an

579

// 8-bit position

580

// mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,

581

// 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

582

mask = _mm_or_si128(_mm_or_si128(r, g), b);

583

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

584

// Interleave R,G,B into the lower byte of word.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

585

// i.e. split the sixteen 8-bit values from mask into two sets of eight

586

// 16-bit values, padded by zero.

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

587

__m128i maskLo, maskHi;

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

588

// maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

589

maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

590

// maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

591

maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

592

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

593

// Upscale from 0..31 to 0..32

594

// (allows to replace division by left-shift further down)

595

// Right-shift each component by 4 and add the result back to that component,

596

// mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

597

maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));

598

maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

599

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

600

// Multiply each component of maskLo and maskHi by srcA

601

maskLo = _mm_mullo_epi16(maskLo, srcA);

602

maskHi = _mm_mullo_epi16(maskHi, srcA);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

603

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

604

// Right shift mask components by 8 (divide by 256)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

605

maskLo = _mm_srli_epi16(maskLo, 8);

606

maskHi = _mm_srli_epi16(maskHi, 8);

607

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

608

// Interleave R,G,B into the lower byte of the word

609

// dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

610

__m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

611

// dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

612

__m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

613

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

614

// mask = (src - dst) * mask

615

maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));

616

maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

617

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

618

// mask = (src - dst) * mask >> 5

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

619

maskLo = _mm_srai_epi16(maskLo, 5);

620

maskHi = _mm_srai_epi16(maskHi, 5);

621

622

// Add two pixels into result.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

623

// result = dst + ((src - dst) * mask >> 5)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

624

__m128i resultLo = _mm_add_epi16(dstLo, maskLo);

625

__m128i resultHi = _mm_add_epi16(dstHi, maskHi);

626

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

627

// Pack into 4 32bit dst pixels.

628

// resultLo and resultHi contain eight 16-bit components (two pixels) each.

629

// Merge into one SSE register with sixteen 8-bit values (four pixels),

630

// clamping to 255 if necessary.

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

631

return _mm_packus_epi16(resultLo, resultHi);

632

}

633

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

634

static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

635

__m128i &mask) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

636

// In the following comments, the components of src, dst and mask are

637

// abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked

638

// by an R, G, B, or A suffix. Components of one of the four pixels that

639

// are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for

640

// example is the blue channel of the second destination pixel. Memory

641

// layout is shown for an ARGB byte order in a color value.

642

643

// src and srcA store 8-bit values interleaved with zeros.

644

// src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

645

// mask stores 16-bit values (shown as high and low bytes) interleaved with

646

// zeros

647

// mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

648

// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

649

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

650

// Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

651

// r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

652

__m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),

653

_mm_set1_epi32(0x1F << SK_R32_SHIFT));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

654

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

655

// g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

656

__m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),

657

_mm_set1_epi32(0x1F << SK_G32_SHIFT));

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

658

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

659

// b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

660

__m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),

661

_mm_set1_epi32(0x1F << SK_B32_SHIFT));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

662

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

663

// Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

664

// Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an

665

// 8-bit position

666

// mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,

667

// 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

668

mask = _mm_or_si128(_mm_or_si128(r, g), b);

669

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

670

// Interleave R,G,B into the lower byte of word.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

671

// i.e. split the sixteen 8-bit values from mask into two sets of eight

672

// 16-bit values, padded by zero.

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

673

__m128i maskLo, maskHi;

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

674

// maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

675

maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

676

// maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

677

maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

678

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

679

// Upscale from 0..31 to 0..32

680

// (allows to replace division by left-shift further down)

681

// Left-shift each component by 4 and add the result back to that component,

682

// mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

683

maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));

684

maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

685

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

686

// Interleave R,G,B into the lower byte of the word

687

// dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

688

__m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

689

// dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

690

__m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

691

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

692

// mask = (src - dst) * mask

693

maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));

694

maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

695

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

696

// mask = (src - dst) * mask >> 5

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

697

maskLo = _mm_srai_epi16(maskLo, 5);

698

maskHi = _mm_srai_epi16(maskHi, 5);

699

700

// Add two pixels into result.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

701

// result = dst + ((src - dst) * mask >> 5)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

702

__m128i resultLo = _mm_add_epi16(dstLo, maskLo);

703

__m128i resultHi = _mm_add_epi16(dstHi, maskHi);

704

bungeman@google.com

27123cd

2012-08-21 19:25:42 +0000

[diff] [blame]

705

// Pack into 4 32bit dst pixels and force opaque.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

706

// resultLo and resultHi contain eight 16-bit components (two pixels) each.

707

// Merge into one SSE regsiter with sixteen 8-bit values (four pixels),

708

// clamping to 255 if necessary. Set alpha components to 0xFF.

bungeman@google.com

27123cd

2012-08-21 19:25:42 +0000

[diff] [blame]

709

return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),

710

_mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

711

}

712

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

713

void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],

714

SkColor src, int width, SkPMColor) {

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

if (width <= 0) {

return;

}

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

719

int srcA = SkColorGetA(src);

720

int srcR = SkColorGetR(src);

721

int srcG = SkColorGetG(src);

722

int srcB = SkColorGetB(src);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

723

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

724

srcA = SkAlpha255To256(srcA);

725

726

if (width >= 4) {

727

SkASSERT(((size_t)dst & 0x03) == 0);

728

while (((size_t)dst & 0x0F) != 0) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

729

*dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);

730

mask++;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

dst++;

width--;

}

__m128i *d = reinterpret_cast<__m128i*>(dst);

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

736

// Set alpha to 0xFF and replicate source four times in SSE register.

737

__m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));

738

// Interleave with zeros to get two sets of four 16-bit values.

739

src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());

740

// Set srcA_sse to contain eight copies of srcA, padded with zero.

741

// src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

742

__m128i srcA_sse = _mm_set1_epi16(srcA);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

743

while (width >= 4) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

744

// Load four destination pixels into dst_sse.

745

__m128i dst_sse = _mm_load_si128(d);

746

// Load four 16-bit masks into lower half of mask_sse.

747

__m128i mask_sse = _mm_loadl_epi64(

748

reinterpret_cast<const __m128i*>(mask));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

749

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

750

// Check whether masks are equal to 0 and get the highest bit

751

// of each byte of result, if masks are all zero, we will get

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

752

// pack_cmp to 0xFFFF

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

753

int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

754

_mm_setzero_si128()));

755

756

// if mask pixels are not all zero, we will blend the dst pixels

757

if (pack_cmp != 0xFFFF) {

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

758

// Unpack 4 16bit mask pixels to

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

759

// mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

760

// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

761

mask_sse = _mm_unpacklo_epi16(mask_sse,

762

_mm_setzero_si128());

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

763

764

// Process 4 32bit dst pixels

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

765

__m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,

766

mask_sse, srcA_sse);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

767

_mm_store_si128(d, result);

768

}

769

770

d++;

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

771

mask += 4;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

width -= 4;

}

dst = reinterpret_cast<SkPMColor*>(d);

776

}

777

778

while (width > 0) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

779

*dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);

780

mask++;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

781

dst++;

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

782

width--;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

783

}

784

}

785

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

786

void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],

787

SkColor src, int width, SkPMColor opaqueDst) {

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

if (width <= 0) {

return;

}

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

792

int srcR = SkColorGetR(src);

793

int srcG = SkColorGetG(src);

794

int srcB = SkColorGetB(src);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

795

796

if (width >= 4) {

797

SkASSERT(((size_t)dst & 0x03) == 0);

798

while (((size_t)dst & 0x0F) != 0) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

799

*dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);

800

mask++;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

dst++;

width--;

}

__m128i *d = reinterpret_cast<__m128i*>(dst);

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

806

// Set alpha to 0xFF and replicate source four times in SSE register.

807

__m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));

808

// Set srcA_sse to contain eight copies of srcA, padded with zero.

809

// src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

810

src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

811

while (width >= 4) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

812

// Load four destination pixels into dst_sse.

813

__m128i dst_sse = _mm_load_si128(d);

814

// Load four 16-bit masks into lower half of mask_sse.

815

__m128i mask_sse = _mm_loadl_epi64(

816

reinterpret_cast<const __m128i*>(mask));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

817

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

818

// Check whether masks are equal to 0 and get the highest bit

819

// of each byte of result, if masks are all zero, we will get

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

820

// pack_cmp to 0xFFFF

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

821

int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

822

_mm_setzero_si128()));

823

824

// if mask pixels are not all zero, we will blend the dst pixels

825

if (pack_cmp != 0xFFFF) {

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

826

// Unpack 4 16bit mask pixels to

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

827

// mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

828

// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

829

mask_sse = _mm_unpacklo_epi16(mask_sse,

830

_mm_setzero_si128());

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

831

832

// Process 4 32bit dst pixels

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

833

__m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,

834

mask_sse);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

835

_mm_store_si128(d, result);

836

}

837

838

d++;

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

839

mask += 4;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

width -= 4;

}

dst = reinterpret_cast<SkPMColor*>(d);

844

}

845

846

while (width > 0) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

847

*dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);

848

mask++;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

849

dst++;

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

850

width--;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

851

}

852

}

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

853

commit-bot@chromium.org

39ce33a

2014-02-24 04:23:39 +0000

[diff] [blame]

854

/* SSE2 version of S32_D565_Opaque()

855

* portable version is in core/SkBlitRow_D16.cpp

856

*/

857

void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,

858

const SkPMColor* SK_RESTRICT src, int count,

859

U8CPU alpha, int /*x*/, int /*y*/) {

860

SkASSERT(255 == alpha);

if (count <= 0) {

return;

}

if (count >= 8) {

while (((size_t)dst & 0x0F) != 0) {

868

SkPMColor c = *src++;

869

SkPMColorAssert(c);

870

871

*dst++ = SkPixel32ToPixel16_ToU16(c);

count--;

}

const __m128i* s = reinterpret_cast<const __m128i*>(src);

876

__m128i* d = reinterpret_cast<__m128i*>(dst);

877

__m128i r16_mask = _mm_set1_epi32(SK_R16_MASK);

878

__m128i g16_mask = _mm_set1_epi32(SK_G16_MASK);

879

__m128i b16_mask = _mm_set1_epi32(SK_B16_MASK);

880

881

while (count >= 8) {

882

// Load 8 pixels of src.

883

__m128i src_pixel1 = _mm_loadu_si128(s++);

884

__m128i src_pixel2 = _mm_loadu_si128(s++);

885

886

// Calculate result r.

887

__m128i r1 = _mm_srli_epi32(src_pixel1,

888

SK_R32_SHIFT + (8 - SK_R16_BITS));

889

r1 = _mm_and_si128(r1, r16_mask);

890

__m128i r2 = _mm_srli_epi32(src_pixel2,

891

SK_R32_SHIFT + (8 - SK_R16_BITS));

892

r2 = _mm_and_si128(r2, r16_mask);

893

__m128i r = _mm_packs_epi32(r1, r2);

894

895

// Calculate result g.

896

__m128i g1 = _mm_srli_epi32(src_pixel1,

897

SK_G32_SHIFT + (8 - SK_G16_BITS));

898

g1 = _mm_and_si128(g1, g16_mask);

899

__m128i g2 = _mm_srli_epi32(src_pixel2,

900

SK_G32_SHIFT + (8 - SK_G16_BITS));

901

g2 = _mm_and_si128(g2, g16_mask);

902

__m128i g = _mm_packs_epi32(g1, g2);

903

904

// Calculate result b.

905

__m128i b1 = _mm_srli_epi32(src_pixel1,

906

SK_B32_SHIFT + (8 - SK_B16_BITS));

907

b1 = _mm_and_si128(b1, b16_mask);

908

__m128i b2 = _mm_srli_epi32(src_pixel2,

909

SK_B32_SHIFT + (8 - SK_B16_BITS));

910

b2 = _mm_and_si128(b2, b16_mask);

911

__m128i b = _mm_packs_epi32(b1, b2);

912

913

// Store 8 16-bit colors in dst.

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

914

__m128i d_pixel = SkPackRGB16_SSE2(r, g, b);

commit-bot@chromium.org

39ce33a

2014-02-24 04:23:39 +0000

[diff] [blame]

915

_mm_store_si128(d++, d_pixel);

916

count -= 8;

917

}

918

src = reinterpret_cast<const SkPMColor*>(s);

919

dst = reinterpret_cast<uint16_t*>(d);

}

if (count > 0) {

do {

SkPMColor c = *src++;

925

SkPMColorAssert(c);

926

*dst++ = SkPixel32ToPixel16_ToU16(c);

927

} while (--count != 0);

928

}

929

}

930

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

931

/* SSE2 version of S32A_D565_Opaque()

932

* portable version is in core/SkBlitRow_D16.cpp

933

*/

934

void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,

935

const SkPMColor* SK_RESTRICT src,

936

int count, U8CPU alpha, int /*x*/, int /*y*/) {

937

SkASSERT(255 == alpha);

if (count <= 0) {

return;

}

if (count >= 8) {

// Make dst 16 bytes alignment

945

while (((size_t)dst & 0x0F) != 0) {

946

SkPMColor c = *src++;

947

if (c) {

948

*dst = SkSrcOver32To16(c, *dst);

}

dst += 1;

count--;

}

const __m128i* s = reinterpret_cast<const __m128i*>(src);

955

__m128i* d = reinterpret_cast<__m128i*>(dst);

956

__m128i var255 = _mm_set1_epi16(255);

957

__m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);

958

__m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);

959

__m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);

960

961

while (count >= 8) {

962

// Load 8 pixels of src.

963

__m128i src_pixel1 = _mm_loadu_si128(s++);

964

__m128i src_pixel2 = _mm_loadu_si128(s++);

965

966

// Check whether src pixels are equal to 0 and get the highest bit

967

// of each byte of result, if src pixels are all zero, src_cmp1 and

968

// src_cmp2 will be 0xFFFF.

969

int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,

970

_mm_setzero_si128()));

971

int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,

972

_mm_setzero_si128()));

973

if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {

d++;

count -= 8;

continue;

}

// Load 8 pixels of dst.

980

__m128i dst_pixel = _mm_load_si128(d);

981

982

// Extract A from src.

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

983

__m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

984

sa1 = _mm_srli_epi32(sa1, 24);

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

985

__m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

986

sa2 = _mm_srli_epi32(sa2, 24);

987

__m128i sa = _mm_packs_epi32(sa1, sa2);

988

989

// Extract R from src.

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

990

__m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

991

sr1 = _mm_srli_epi32(sr1, 24);

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

992

__m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

993

sr2 = _mm_srli_epi32(sr2, 24);

994

__m128i sr = _mm_packs_epi32(sr1, sr2);

995

996

// Extract G from src.

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

997

__m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

998

sg1 = _mm_srli_epi32(sg1, 24);

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

999

__m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

1000

sg2 = _mm_srli_epi32(sg2, 24);

1001

__m128i sg = _mm_packs_epi32(sg1, sg2);

1002

1003

// Extract B from src.

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

1004

__m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

1005

sb1 = _mm_srli_epi32(sb1, 24);

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

1006

__m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

1007

sb2 = _mm_srli_epi32(sb2, 24);

1008

__m128i sb = _mm_packs_epi32(sb1, sb2);

1009

1010

// Extract R G B from dst.

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

1011

__m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

1012

dr = _mm_and_si128(dr, r16_mask);

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

1013

__m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

1014

dg = _mm_and_si128(dg, g16_mask);

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

1015

__m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

1016

db = _mm_and_si128(db, b16_mask);

1017

1018

__m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa

1019

1020

// Calculate R G B of result.

1021

// Original algorithm is in SkSrcOver32To16().

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

1022

dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

1023

dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

1024

dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

1025

dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

1026

db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

1027

db = _mm_srli_epi16(db, 8 - SK_B16_BITS);

1028

1029

// Pack R G B into 16-bit color.

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

1030

__m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

1031

1032

// Store 8 16-bit colors in dst.

1033

_mm_store_si128(d++, d_pixel);

count -= 8;

}

src = reinterpret_cast<const SkPMColor*>(s);

1038

dst = reinterpret_cast<uint16_t*>(d);

}

if (count > 0) {

do {

SkPMColor c = *src++;

1044

SkPMColorAssert(c);

1045

if (c) {

1046

*dst = SkSrcOver32To16(c, *dst);

1047

}

1048

dst += 1;

1049

} while (--count != 0);

1050

}

1051

}

commit-bot@chromium.org

2758047

2014-03-07 03:25:32 +0000

[diff] [blame]

1052

1053

void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,

1054

const SkPMColor* SK_RESTRICT src,

1055

int count, U8CPU alpha, int x, int y) {

1056

SkASSERT(255 == alpha);

if (count <= 0) {

return;

}

if (count >= 8) {

while (((size_t)dst & 0x0F) != 0) {

1064

DITHER_565_SCAN(y);

1065

SkPMColor c = *src++;

1066

SkPMColorAssert(c);

1067

1068

unsigned dither = DITHER_VALUE(x);

1069

*dst++ = SkDitherRGB32To565(c, dither);

DITHER_INC_X(x);

count--;

}

unsigned short dither_value[8];

1075

__m128i dither;

1076

#ifdef ENABLE_DITHER_MATRIX_4X4

1077

const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];

1078

dither_value[0] = dither_value[4] = dither_scan[(x) & 3];

1079

dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];

1080

dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];

1081

dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];

1082

#else

1083

const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];

1084

dither_value[0] = dither_value[4] = (dither_scan

1085

>> (((x) & 3) << 2)) & 0xF;

1086

dither_value[1] = dither_value[5] = (dither_scan

1087

>> (((x + 1) & 3) << 2)) & 0xF;

1088

dither_value[2] = dither_value[6] = (dither_scan

1089

>> (((x + 2) & 3) << 2)) & 0xF;

1090

dither_value[3] = dither_value[7] = (dither_scan

1091

>> (((x + 3) & 3) << 2)) & 0xF;

1092

#endif

1093

dither = _mm_loadu_si128((__m128i*) dither_value);

1094

1095

const __m128i* s = reinterpret_cast<const __m128i*>(src);

1096

__m128i* d = reinterpret_cast<__m128i*>(dst);

1097

1098

while (count >= 8) {

1099

// Load 8 pixels of src.

1100

__m128i src_pixel1 = _mm_loadu_si128(s++);

1101

__m128i src_pixel2 = _mm_loadu_si128(s++);

1102

1103

// Extract R from src.

1104

__m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));

1105

sr1 = _mm_srli_epi32(sr1, 24);

1106

__m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));

1107

sr2 = _mm_srli_epi32(sr2, 24);

1108

__m128i sr = _mm_packs_epi32(sr1, sr2);

1109

1110

// SkDITHER_R32To565(sr, dither)

1111

__m128i sr_offset = _mm_srli_epi16(sr, 5);

1112

sr = _mm_add_epi16(sr, dither);

1113

sr = _mm_sub_epi16(sr, sr_offset);

1114

sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);

1115

1116

// Extract G from src.

1117

__m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));

1118

sg1 = _mm_srli_epi32(sg1, 24);

1119

__m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));

1120

sg2 = _mm_srli_epi32(sg2, 24);

1121

__m128i sg = _mm_packs_epi32(sg1, sg2);

1122

1123

// SkDITHER_R32To565(sg, dither)

1124

__m128i sg_offset = _mm_srli_epi16(sg, 6);

1125

sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));

1126

sg = _mm_sub_epi16(sg, sg_offset);

1127

sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);

1128

1129

// Extract B from src.

1130

__m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));

1131

sb1 = _mm_srli_epi32(sb1, 24);

1132

__m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));

1133

sb2 = _mm_srli_epi32(sb2, 24);

1134

__m128i sb = _mm_packs_epi32(sb1, sb2);

1135

1136

// SkDITHER_R32To565(sb, dither)

1137

__m128i sb_offset = _mm_srli_epi16(sb, 5);

1138

sb = _mm_add_epi16(sb, dither);

1139

sb = _mm_sub_epi16(sb, sb_offset);

1140

sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);

1141

1142

// Pack and store 16-bit dst pixel.

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

1143

__m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);

commit-bot@chromium.org

2758047

2014-03-07 03:25:32 +0000

[diff] [blame]

1144

_mm_store_si128(d++, d_pixel);

count -= 8;

x += 8;

}

src = reinterpret_cast<const SkPMColor*>(s);

1151

dst = reinterpret_cast<uint16_t*>(d);

}

if (count > 0) {

DITHER_565_SCAN(y);

do {

SkPMColor c = *src++;

1158

SkPMColorAssert(c);

1159

1160

unsigned dither = DITHER_VALUE(x);

1161

*dst++ = SkDitherRGB32To565(c, dither);

1162

DITHER_INC_X(x);

1163

} while (--count != 0);

1164

}

1165

}

commit-bot@chromium.org

2014-03-07 13:24:42 +0000

[diff] [blame]

1166

1167

/* SSE2 version of S32A_D565_Opaque_Dither()

1168

* portable version is in core/SkBlitRow_D16.cpp

1169

*/

1170

void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,

1171

const SkPMColor* SK_RESTRICT src,

1172

int count, U8CPU alpha, int x, int y) {

1173

SkASSERT(255 == alpha);

if (count <= 0) {

return;

}

if (count >= 8) {

while (((size_t)dst & 0x0F) != 0) {

1181

DITHER_565_SCAN(y);

1182

SkPMColor c = *src++;

1183

SkPMColorAssert(c);

1184

if (c) {

1185

unsigned a = SkGetPackedA32(c);

1186

1187

int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));

1188

1189

unsigned sr = SkGetPackedR32(c);

1190

unsigned sg = SkGetPackedG32(c);

1191

unsigned sb = SkGetPackedB32(c);

1192

sr = SkDITHER_R32_FOR_565(sr, d);

1193

sg = SkDITHER_G32_FOR_565(sg, d);

1194

sb = SkDITHER_B32_FOR_565(sb, d);

1195

1196

uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);

1197

uint32_t dst_expanded = SkExpand_rgb_16(*dst);

1198

dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);

1199

// now src and dst expanded are in g:11 r:10 x:1 b:10

1200

*dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);

}

dst += 1;

DITHER_INC_X(x);

count--;

}

unsigned short dither_value[8];

1208

__m128i dither, dither_cur;

1209

#ifdef ENABLE_DITHER_MATRIX_4X4

1210

const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];

1211

dither_value[0] = dither_value[4] = dither_scan[(x) & 3];

1212

dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];

1213

dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];

1214

dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];

1215

#else

1216

const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];

1217

dither_value[0] = dither_value[4] = (dither_scan

1218

>> (((x) & 3) << 2)) & 0xF;

1219

dither_value[1] = dither_value[5] = (dither_scan

1220

>> (((x + 1) & 3) << 2)) & 0xF;

1221

dither_value[2] = dither_value[6] = (dither_scan

1222

>> (((x + 2) & 3) << 2)) & 0xF;

1223

dither_value[3] = dither_value[7] = (dither_scan

1224

>> (((x + 3) & 3) << 2)) & 0xF;

1225

#endif

1226

dither = _mm_loadu_si128((__m128i*) dither_value);

1227

1228

const __m128i* s = reinterpret_cast<const __m128i*>(src);

1229

__m128i* d = reinterpret_cast<__m128i*>(dst);

1230

__m128i var256 = _mm_set1_epi16(256);

1231

__m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);

1232

__m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);

1233

__m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);

1234

1235

while (count >= 8) {

1236

// Load 8 pixels of src and dst.

1237

__m128i src_pixel1 = _mm_loadu_si128(s++);

1238

__m128i src_pixel2 = _mm_loadu_si128(s++);

1239

__m128i dst_pixel = _mm_load_si128(d);

1240

1241

// Extract A from src.

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

1242

__m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));

commit-bot@chromium.org

2014-03-07 13:24:42 +0000

[diff] [blame]

1243

sa1 = _mm_srli_epi32(sa1, 24);

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

1244

__m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));

commit-bot@chromium.org

2014-03-07 13:24:42 +0000

[diff] [blame]

1245

sa2 = _mm_srli_epi32(sa2, 24);

1246

__m128i sa = _mm_packs_epi32(sa1, sa2);

1247

1248

// Calculate current dither value.

1249

dither_cur = _mm_mullo_epi16(dither,

1250

_mm_add_epi16(sa, _mm_set1_epi16(1)));

1251

dither_cur = _mm_srli_epi16(dither_cur, 8);

1252

1253

// Extract R from src.

1254

__m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));

1255

sr1 = _mm_srli_epi32(sr1, 24);

1256

__m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));

1257

sr2 = _mm_srli_epi32(sr2, 24);

1258

__m128i sr = _mm_packs_epi32(sr1, sr2);

1259

1260

// SkDITHER_R32_FOR_565(sr, d)

1261

__m128i sr_offset = _mm_srli_epi16(sr, 5);

1262

sr = _mm_add_epi16(sr, dither_cur);

1263

sr = _mm_sub_epi16(sr, sr_offset);

1264

1265

// Expand sr.

1266

sr = _mm_slli_epi16(sr, 2);

1267

1268

// Extract G from src.

1269

__m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));

1270

sg1 = _mm_srli_epi32(sg1, 24);

1271

__m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));

1272

sg2 = _mm_srli_epi32(sg2, 24);

1273

__m128i sg = _mm_packs_epi32(sg1, sg2);

1274

1275

// sg = SkDITHER_G32_FOR_565(sg, d).

1276

__m128i sg_offset = _mm_srli_epi16(sg, 6);

1277

sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));

1278

sg = _mm_sub_epi16(sg, sg_offset);

1279

1280

// Expand sg.

1281

sg = _mm_slli_epi16(sg, 3);

1282

1283

// Extract B from src.

1284

__m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));

1285

sb1 = _mm_srli_epi32(sb1, 24);

1286

__m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));

1287

sb2 = _mm_srli_epi32(sb2, 24);

1288

__m128i sb = _mm_packs_epi32(sb1, sb2);

1289

1290

// sb = SkDITHER_B32_FOR_565(sb, d).

1291

__m128i sb_offset = _mm_srli_epi16(sb, 5);

1292

sb = _mm_add_epi16(sb, dither_cur);

1293

sb = _mm_sub_epi16(sb, sb_offset);

1294

1295

// Expand sb.

1296

sb = _mm_slli_epi16(sb, 2);

1297

1298

// Extract R G B from dst.

1299

__m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);

1300

dr = _mm_and_si128(dr, r16_mask);

1301

__m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);

1302

dg = _mm_and_si128(dg, g16_mask);

1303

__m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);

1304

db = _mm_and_si128(db, b16_mask);

1305

1306

// SkAlpha255To256(255 - a) >> 3

1307

__m128i isa = _mm_sub_epi16(var256, sa);

1308

isa = _mm_srli_epi16(isa, 3);

1309

1310

dr = _mm_mullo_epi16(dr, isa);

1311

dr = _mm_add_epi16(dr, sr);

1312

dr = _mm_srli_epi16(dr, 5);

1313

1314

dg = _mm_mullo_epi16(dg, isa);

1315

dg = _mm_add_epi16(dg, sg);

1316

dg = _mm_srli_epi16(dg, 5);

1317

1318

db = _mm_mullo_epi16(db, isa);

1319

db = _mm_add_epi16(db, sb);

1320

db = _mm_srli_epi16(db, 5);

1321

1322

// Pack and store 16-bit dst pixel.

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

1323

__m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);

commit-bot@chromium.org