Blame - src/opts/SkBlitRow_opts_SSE2.cpp - platform/external/skqp

2009-11-04 20:51:06 +0000

[diff] [blame]

1

/*

tomhudson@google.com

98a5b42

2012-02-28 16:15:26 +0000

[diff] [blame]

2

epoger@google.com

ec3ed6a

2011-07-28 14:26:00 +0000

[diff] [blame]

3

*

4

* Use of this source code is governed by a BSD-style license that can be

5

* found in the LICENSE file.

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

6

*/

7

commit-bot@chromium.org

8c4953c

2014-04-30 14:58:46 +0000

[diff] [blame]

8

#include <emmintrin.h>

caryclark@google.com

83ecdc3

2012-06-06 12:10:26 +0000

[diff] [blame]

9

#include "SkBitmapProcState_opts_SSE2.h"

commit-bot@chromium.org

8c4953c

2014-04-30 14:58:46 +0000

[diff] [blame]

10

#include "SkBlitRow_opts_SSE2.h"

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

11

#include "SkColorPriv.h"

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

12

#include "SkColor_opts_SSE2.h"

commit-bot@chromium.org

2758047

2014-03-07 03:25:32 +0000

[diff] [blame]

13

#include "SkDither.h"

mtklein

1059b1f

2016-02-03 07:25:02 -0800

[diff] [blame]

14

#include "SkMSAN.h"

senorblanco@chromium.org

c385638

2010-12-13 15:27:20 +0000

[diff] [blame]

15

#include "SkUtils.h"

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

16

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

17

/* SSE2 version of S32_Blend_BlitRow32()

18

* portable version is in core/SkBlitRow_D32.cpp

19

*/

senorblanco@chromium.org

4e75355

2009-11-16 21:09:00 +0000

[diff] [blame]

20

void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,

21

const SkPMColor* SK_RESTRICT src,

22

int count, U8CPU alpha) {

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

23

SkASSERT(alpha <= 255);

if (count <= 0) {

return;

}

uint32_t src_scale = SkAlpha255To256(alpha);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

29

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

30

if (count >= 4) {

31

SkASSERT(((size_t)dst & 0x03) == 0);

32

while (((size_t)dst & 0x0F) != 0) {

lsalzman

40254c2

2016-08-05 11:48:45 -0700

[diff] [blame]

33

*dst = SkPMLerp(*src, *dst, src_scale);

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

src++;

dst++;

count--;

}

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

38

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

39

const __m128i *s = reinterpret_cast<const __m128i*>(src);

40

__m128i *d = reinterpret_cast<__m128i*>(dst);

tomhudson@google.com

98a5b42

2012-02-28 16:15:26 +0000

[diff] [blame]

41

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

42

while (count >= 4) {

43

// Load 4 pixels each of src and dest.

44

__m128i src_pixel = _mm_loadu_si128(s);

45

__m128i dst_pixel = _mm_load_si128(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

46

lsalzman

40254c2

2016-08-05 11:48:45 -0700

[diff] [blame]

47

__m128i result = SkPMLerp_SSE2(src_pixel, dst_pixel, src_scale);

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

48

_mm_store_si128(d, result);

s++;

d++;

count -= 4;

}

src = reinterpret_cast<const SkPMColor*>(s);

54

dst = reinterpret_cast<SkPMColor*>(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

55

}

56

senorblanco@chromium.org

4e75355

2009-11-16 21:09:00 +0000

[diff] [blame]

57

while (count > 0) {

lsalzman

40254c2

2016-08-05 11:48:45 -0700

[diff] [blame]

58

*dst = SkPMLerp(*src, *dst, src_scale);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

src++;

dst++;

count--;

}

}

senorblanco@chromium.org

4e75355

2009-11-16 21:09:00 +0000

[diff] [blame]

65

void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,

66

const SkPMColor* SK_RESTRICT src,

67

int count, U8CPU alpha) {

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

68

SkASSERT(alpha <= 255);

if (count <= 0) {

return;

}

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

73

if (count >= 4) {

74

while (((size_t)dst & 0x0F) != 0) {

75

*dst = SkBlendARGB32(*src, *dst, alpha);

src++;

dst++;

count--;

}

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

80

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

81

const __m128i *s = reinterpret_cast<const __m128i*>(src);

82

__m128i *d = reinterpret_cast<__m128i*>(dst);

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

83

while (count >= 4) {

84

// Load 4 pixels each of src and dest.

85

__m128i src_pixel = _mm_loadu_si128(s);

86

__m128i dst_pixel = _mm_load_si128(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

87

qiankun.miao

2253aa9

2014-11-25 06:35:02 -0800

[diff] [blame]

88

__m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

89

_mm_store_si128(d, result);

s++;

d++;

count -= 4;

}

src = reinterpret_cast<const SkPMColor*>(s);

95

dst = reinterpret_cast<SkPMColor*>(d);

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

96

}

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

97

senorblanco@chromium.org

2009-11-04 20:51:06 +0000

[diff] [blame]

98

while (count > 0) {

99

*dst = SkBlendARGB32(*src, *dst, alpha);

src++;

dst++;

count--;

}

}

senorblanco@chromium.org

c385638

2010-12-13 15:27:20 +0000

[diff] [blame]

105

henrik.smiding

70840cb

2015-03-20 09:20:46 -0700

[diff] [blame]

106

void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) {

107

SkASSERT(count > 0);

108

109

uint32_t src_expand = (SkGetPackedG32(src) << 24) |

110

(SkGetPackedR32(src) << 13) |

111

(SkGetPackedB32(src) << 2);

112

unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;

113

114

// Check if we have enough pixels to run SIMD

115

if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {

116

__m128i* dst_wide;

117

const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2);

118

const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3);

119

const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2);

120

const __m128i scale_wide = _mm_set1_epi16(scale);

121

const __m128i mask_blue = _mm_set1_epi16(SK_B16_MASK);

122

const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT);

123

124

// Align dst to an even 16 byte address (0-7 pixels)

125

while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {

126

*dst = SkBlend32_RGB16(src_expand, *dst, scale);

dst += 1;

count--;

}

dst_wide = reinterpret_cast<__m128i*>(dst);

132

do {

133

// Load eight RGB565 pixels

134

__m128i pixels = _mm_load_si128(dst_wide);

135

136

// Mask out sub-pixels

137

__m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT);

138

__m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS);

139

pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS);

140

__m128i pixel_B = _mm_and_si128(pixels, mask_blue);

141

142

// Scale with alpha

143

pixel_R = _mm_mullo_epi16(pixel_R, scale_wide);

144

pixel_G = _mm_mullo_epi16(pixel_G, scale_wide);

145

pixel_B = _mm_mullo_epi16(pixel_B, scale_wide);

146

147

// Add src_X_wide and shift down again

148

pixel_R = _mm_add_epi16(pixel_R, src_R_wide);

149

pixel_R = _mm_srli_epi16(pixel_R, 5);

150

pixel_G = _mm_add_epi16(pixel_G, src_G_wide);

151

pixel_B = _mm_add_epi16(pixel_B, src_B_wide);

152

pixel_B = _mm_srli_epi16(pixel_B, 5);

153

154

// Combine into RGB565 and store

155

pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT);

156

pixel_G = _mm_and_si128(pixel_G, mask_green);

157

pixels = _mm_or_si128(pixel_R, pixel_G);

158

pixels = _mm_or_si128(pixels, pixel_B);

159

_mm_store_si128(dst_wide, pixels);

160

count -= 8;

161

dst_wide++;

162

} while (count >= 8);

163

164

dst = reinterpret_cast<uint16_t*>(dst_wide);

165

}

166

167

// Small loop to handle remaining pixels.

168

while (count > 0) {

169

*dst = SkBlend32_RGB16(src_expand, *dst, scale);

dst += 1;

count--;

}

}

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

175

// The shift amounts below move the top 5 bits of each RGB16 mask component so
// that they line up with the matching component of an SkPMColor. A positive
// amount is a left shift; a negative amount is applied as a right shift.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Red: shift each 32-bit lane of x by SK_R16x5_R32x5_SHIFT (unmasked result).
#if SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#elif SK_R16x5_R32x5_SHIFT < 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#endif

// Green: shift each 32-bit lane of x by SK_G16x5_G32x5_SHIFT (unmasked result).
#if SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#elif SK_G16x5_G32x5_SHIFT < 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#endif

// Blue: shift each 32-bit lane of x by SK_B16x5_B32x5_SHIFT (unmasked result).
#if SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#elif SK_B16x5_B32x5_SHIFT < 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#endif

205

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

206

static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,

207

__m128i &mask, __m128i &srcA) {

208

// In the following comments, the components of src, dst and mask are

209

// abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked

210

// by an R, G, B, or A suffix. Components of one of the four pixels that

211

// are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for

212

// example is the blue channel of the second destination pixel. Memory

213

// layout is shown for an ARGB byte order in a color value.

214

215

// src and srcA store 8-bit values interleaved with zeros.

216

// src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

217

// srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,

218

// srcA, 0, srcA, 0, srcA, 0, srcA, 0)

219

// mask stores 16-bit values (compressed three channels) interleaved with zeros.

220

// Lo and Hi denote the low and high bytes of a 16-bit value, respectively.

221

// mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

222

// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

223

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

224

// Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

225

// r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

226

__m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),

227

_mm_set1_epi32(0x1F << SK_R32_SHIFT));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

228

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

229

// g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

230

__m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),

231

_mm_set1_epi32(0x1F << SK_G32_SHIFT));

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

232

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

233

// b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

234

__m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),

235

_mm_set1_epi32(0x1F << SK_B32_SHIFT));

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

236

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

237

// Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

238

// Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an

239

// 8-bit position

240

// mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,

241

// 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

242

mask = _mm_or_si128(_mm_or_si128(r, g), b);

243

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

244

// Interleave R,G,B into the lower byte of word.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

245

// i.e. split the sixteen 8-bit values from mask into two sets of eight

246

// 16-bit values, padded by zero.

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

247

__m128i maskLo, maskHi;

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

248

// maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

249

maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

250

// maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

251

maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

252

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

253

// Upscale from 0..31 to 0..32

254

// (allows to replace division by left-shift further down)

255

// Left-shift each component by 4 and add the result back to that component,

256

// mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

257

maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));

258

maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

259

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

260

// Multiply each component of maskLo and maskHi by srcA

261

maskLo = _mm_mullo_epi16(maskLo, srcA);

262

maskHi = _mm_mullo_epi16(maskHi, srcA);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

263

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

264

// Left shift mask components by 8 (divide by 256)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

265

maskLo = _mm_srli_epi16(maskLo, 8);

266

maskHi = _mm_srli_epi16(maskHi, 8);

267

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

268

// Interleave R,G,B into the lower byte of the word

269

// dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

270

__m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

271

// dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

272

__m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

273

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

274

// mask = (src - dst) * mask

275

maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));

276

maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

277

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

278

// mask = (src - dst) * mask >> 5

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

279

maskLo = _mm_srai_epi16(maskLo, 5);

280

maskHi = _mm_srai_epi16(maskHi, 5);

281

282

// Add two pixels into result.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

283

// result = dst + ((src - dst) * mask >> 5)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

284

__m128i resultLo = _mm_add_epi16(dstLo, maskLo);

285

__m128i resultHi = _mm_add_epi16(dstHi, maskHi);

286

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

287

// Pack into 4 32bit dst pixels.

288

// resultLo and resultHi contain eight 16-bit components (two pixels) each.

289

// Merge into one SSE regsiter with sixteen 8-bit values (four pixels),

290

// clamping to 255 if necessary.

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

291

return _mm_packus_epi16(resultLo, resultHi);

292

}

293

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

294

static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

295

__m128i &mask) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

296

// In the following comments, the components of src, dst and mask are

297

// abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked

298

// by an R, G, B, or A suffix. Components of one of the four pixels that

299

// are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for

300

// example is the blue channel of the second destination pixel. Memory

301

// layout is shown for an ARGB byte order in a color value.

302

303

// src and srcA store 8-bit values interleaved with zeros.

304

// src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

305

// mask stores 16-bit values (shown as high and low bytes) interleaved with

306

// zeros

307

// mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

308

// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

309

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

310

// Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

311

// r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

312

__m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),

313

_mm_set1_epi32(0x1F << SK_R32_SHIFT));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

314

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

315

// g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

316

__m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),

317

_mm_set1_epi32(0x1F << SK_G32_SHIFT));

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

318

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

319

// b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)

bungeman@google.com

2012-07-09 17:44:57 +0000

[diff] [blame]

320

__m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),

321

_mm_set1_epi32(0x1F << SK_B32_SHIFT));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

322

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

323

// Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

324

// Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an

325

// 8-bit position

326

// mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,

327

// 0, m2R, m2G, m2B, 0, m3R, m3G, m3B)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

328

mask = _mm_or_si128(_mm_or_si128(r, g), b);

329

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

330

// Interleave R,G,B into the lower byte of word.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

331

// i.e. split the sixteen 8-bit values from mask into two sets of eight

332

// 16-bit values, padded by zero.

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

333

__m128i maskLo, maskHi;

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

334

// maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

335

maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

336

// maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

337

maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

338

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

339

// Upscale from 0..31 to 0..32

340

// (allows to replace division by left-shift further down)

341

// Left-shift each component by 4 and add the result back to that component,

342

// mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

343

maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));

344

maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

345

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

346

// Interleave R,G,B into the lower byte of the word

347

// dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

348

__m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

349

// dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

350

__m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

351

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

352

// mask = (src - dst) * mask

353

maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));

354

maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

355

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

356

// mask = (src - dst) * mask >> 5

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

357

maskLo = _mm_srai_epi16(maskLo, 5);

358

maskHi = _mm_srai_epi16(maskHi, 5);

359

360

// Add two pixels into result.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

361

// result = dst + ((src - dst) * mask >> 5)

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

362

__m128i resultLo = _mm_add_epi16(dstLo, maskLo);

363

__m128i resultHi = _mm_add_epi16(dstHi, maskHi);

364

bungeman@google.com

27123cd

2012-08-21 19:25:42 +0000

[diff] [blame]

365

// Pack into 4 32bit dst pixels and force opaque.

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

366

// resultLo and resultHi contain eight 16-bit components (two pixels) each.

367

// Merge into one SSE regsiter with sixteen 8-bit values (four pixels),

368

// clamping to 255 if necessary. Set alpha components to 0xFF.

bungeman@google.com

27123cd

2012-08-21 19:25:42 +0000

[diff] [blame]

369

return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),

370

_mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

371

}

372

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

373

void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],

374

SkColor src, int width, SkPMColor) {

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

if (width <= 0) {

return;

}

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

379

int srcA = SkColorGetA(src);

380

int srcR = SkColorGetR(src);

381

int srcG = SkColorGetG(src);

382

int srcB = SkColorGetB(src);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

383

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

384

srcA = SkAlpha255To256(srcA);

385

386

if (width >= 4) {

387

SkASSERT(((size_t)dst & 0x03) == 0);

388

while (((size_t)dst & 0x0F) != 0) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

389

*dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);

390

mask++;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

dst++;

width--;

}

__m128i *d = reinterpret_cast<__m128i*>(dst);

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

396

// Set alpha to 0xFF and replicate source four times in SSE register.

397

__m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));

398

// Interleave with zeros to get two sets of four 16-bit values.

399

src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());

400

// Set srcA_sse to contain eight copies of srcA, padded with zero.

401

// src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

402

__m128i srcA_sse = _mm_set1_epi16(srcA);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

403

while (width >= 4) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

404

// Load four destination pixels into dst_sse.

405

__m128i dst_sse = _mm_load_si128(d);

406

// Load four 16-bit masks into lower half of mask_sse.

407

__m128i mask_sse = _mm_loadl_epi64(

408

reinterpret_cast<const __m128i*>(mask));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

409

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

410

// Check whether masks are equal to 0 and get the highest bit

411

// of each byte of result, if masks are all zero, we will get

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

412

// pack_cmp to 0xFFFF

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

413

int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

414

_mm_setzero_si128()));

415

416

// if mask pixels are not all zero, we will blend the dst pixels

417

if (pack_cmp != 0xFFFF) {

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

418

// Unpack 4 16bit mask pixels to

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

419

// mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

420

// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

421

mask_sse = _mm_unpacklo_epi16(mask_sse,

422

_mm_setzero_si128());

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

423

424

// Process 4 32bit dst pixels

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

425

__m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,

426

mask_sse, srcA_sse);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

427

_mm_store_si128(d, result);

428

}

429

430

d++;

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

431

mask += 4;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

width -= 4;

}

dst = reinterpret_cast<SkPMColor*>(d);

436

}

437

438

while (width > 0) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

439

*dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);

440

mask++;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

441

dst++;

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

442

width--;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

443

}

444

}

445

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

446

void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],

447

SkColor src, int width, SkPMColor opaqueDst) {

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

if (width <= 0) {

return;

}

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

452

int srcR = SkColorGetR(src);

453

int srcG = SkColorGetG(src);

454

int srcB = SkColorGetB(src);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

455

456

if (width >= 4) {

457

SkASSERT(((size_t)dst & 0x03) == 0);

458

while (((size_t)dst & 0x0F) != 0) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

459

*dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);

460

mask++;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

dst++;

width--;

}

__m128i *d = reinterpret_cast<__m128i*>(dst);

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

466

// Set alpha to 0xFF and replicate source four times in SSE register.

467

__m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));

468

// Set srcA_sse to contain eight copies of srcA, padded with zero.

469

// src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)

470

src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

471

while (width >= 4) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

472

// Load four destination pixels into dst_sse.

473

__m128i dst_sse = _mm_load_si128(d);

474

// Load four 16-bit masks into lower half of mask_sse.

475

__m128i mask_sse = _mm_loadl_epi64(

476

reinterpret_cast<const __m128i*>(mask));

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

477

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

478

// Check whether masks are equal to 0 and get the highest bit

479

// of each byte of result, if masks are all zero, we will get

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

480

// pack_cmp to 0xFFFF

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

481

int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

482

_mm_setzero_si128()));

483

484

// if mask pixels are not all zero, we will blend the dst pixels

485

if (pack_cmp != 0xFFFF) {

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

486

// Unpack 4 16bit mask pixels to

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

487

// mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,

488

// m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

489

mask_sse = _mm_unpacklo_epi16(mask_sse,

490

_mm_setzero_si128());

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

491

492

// Process 4 32bit dst pixels

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

493

__m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,

494

mask_sse);

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

495

_mm_store_si128(d, result);

496

}

497

498

d++;

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

499

mask += 4;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

width -= 4;

}

dst = reinterpret_cast<SkPMColor*>(d);

504

}

505

506

while (width > 0) {

commit-bot@chromium.org

2013-07-02 17:40:19 +0000

[diff] [blame]

507

*dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);

508

mask++;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

509

dst++;

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

510

width--;

tomhudson@google.com

2012-02-14 16:01:15 +0000

[diff] [blame]

511

}

512

}

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

513

commit-bot@chromium.org

39ce33a

2014-02-24 04:23:39 +0000

[diff] [blame]

514

/* SSE2 version of S32_D565_Opaque()

515

* portable version is in core/SkBlitRow_D16.cpp

516

*/

517

void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,

518

const SkPMColor* SK_RESTRICT src, int count,

519

U8CPU alpha, int /*x*/, int /*y*/) {

520

SkASSERT(255 == alpha);

if (count <= 0) {

return;

}

if (count >= 8) {

while (((size_t)dst & 0x0F) != 0) {

528

SkPMColor c = *src++;

529

SkPMColorAssert(c);

530

531

*dst++ = SkPixel32ToPixel16_ToU16(c);

count--;

}

const __m128i* s = reinterpret_cast<const __m128i*>(src);

536

__m128i* d = reinterpret_cast<__m128i*>(dst);

commit-bot@chromium.org

39ce33a

2014-02-24 04:23:39 +0000

[diff] [blame]

537

538

while (count >= 8) {

539

// Load 8 pixels of src.

540

__m128i src_pixel1 = _mm_loadu_si128(s++);

541

__m128i src_pixel2 = _mm_loadu_si128(s++);

542

qiankun.miao

52e74c6

2014-11-24 06:59:44 -0800

[diff] [blame]

543

__m128i d_pixel = SkPixel32ToPixel16_ToU16_SSE2(src_pixel1, src_pixel2);

commit-bot@chromium.org

39ce33a

2014-02-24 04:23:39 +0000

[diff] [blame]

544

_mm_store_si128(d++, d_pixel);

545

count -= 8;

546

}

547

src = reinterpret_cast<const SkPMColor*>(s);

548

dst = reinterpret_cast<uint16_t*>(d);

}

if (count > 0) {

do {

SkPMColor c = *src++;

554

SkPMColorAssert(c);

555

*dst++ = SkPixel32ToPixel16_ToU16(c);

556

} while (--count != 0);

557

}

558

}

559

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

560

/* SSE2 version of S32A_D565_Opaque()

561

* portable version is in core/SkBlitRow_D16.cpp

562

*/

563

void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,

564

const SkPMColor* SK_RESTRICT src,

565

int count, U8CPU alpha, int /*x*/, int /*y*/) {

566

SkASSERT(255 == alpha);

if (count <= 0) {

return;

}

if (count >= 8) {

// Make dst 16 bytes alignment

574

while (((size_t)dst & 0x0F) != 0) {

575

SkPMColor c = *src++;

576

if (c) {

577

*dst = SkSrcOver32To16(c, *dst);

}

dst += 1;

count--;

}

const __m128i* s = reinterpret_cast<const __m128i*>(src);

584

__m128i* d = reinterpret_cast<__m128i*>(dst);

585

__m128i var255 = _mm_set1_epi16(255);

586

__m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);

587

__m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);

588

__m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);

589

590

while (count >= 8) {

591

// Load 8 pixels of src.

592

__m128i src_pixel1 = _mm_loadu_si128(s++);

593

__m128i src_pixel2 = _mm_loadu_si128(s++);

594

595

// Check whether src pixels are equal to 0 and get the highest bit

596

// of each byte of result, if src pixels are all zero, src_cmp1 and

597

// src_cmp2 will be 0xFFFF.

598

int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,

599

_mm_setzero_si128()));

600

int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,

601

_mm_setzero_si128()));

602

if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {

d++;

count -= 8;

continue;

}

// Load 8 pixels of dst.

609

__m128i dst_pixel = _mm_load_si128(d);

610

611

// Extract A from src.

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

612

__m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

613

sa1 = _mm_srli_epi32(sa1, 24);

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

614

__m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

615

sa2 = _mm_srli_epi32(sa2, 24);

616

__m128i sa = _mm_packs_epi32(sa1, sa2);

617

618

// Extract R from src.

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

619

__m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

620

sr1 = _mm_srli_epi32(sr1, 24);

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

621

__m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

622

sr2 = _mm_srli_epi32(sr2, 24);

623

__m128i sr = _mm_packs_epi32(sr1, sr2);

624

625

// Extract G from src.

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

626

__m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

627

sg1 = _mm_srli_epi32(sg1, 24);

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

628

__m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

629

sg2 = _mm_srli_epi32(sg2, 24);

630

__m128i sg = _mm_packs_epi32(sg1, sg2);

631

632

// Extract B from src.

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

633

__m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

634

sb1 = _mm_srli_epi32(sb1, 24);

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

635

__m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

636

sb2 = _mm_srli_epi32(sb2, 24);

637

__m128i sb = _mm_packs_epi32(sb1, sb2);

638

639

// Extract R G B from dst.

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

640

__m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

641

dr = _mm_and_si128(dr, r16_mask);

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

642

__m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

643

dg = _mm_and_si128(dg, g16_mask);

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

644

__m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

645

db = _mm_and_si128(db, b16_mask);

646

647

__m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa

648

649

// Calculate R G B of result.

650

// Original algorithm is in SkSrcOver32To16().

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

651

dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

652

dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

653

dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

654

dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

655

db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

656

db = _mm_srli_epi16(db, 8 - SK_B16_BITS);

657

658

// Pack R G B into 16-bit color.

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

659

__m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);

commit-bot@chromium.org

2014-02-19 03:09:52 +0000

[diff] [blame]

660

661

// Store 8 16-bit colors in dst.

662

_mm_store_si128(d++, d_pixel);

count -= 8;

}

src = reinterpret_cast<const SkPMColor*>(s);

667

dst = reinterpret_cast<uint16_t*>(d);

}

if (count > 0) {

do {

SkPMColor c = *src++;

673

SkPMColorAssert(c);

674

if (c) {

675

*dst = SkSrcOver32To16(c, *dst);

676

}

677

dst += 1;

678

} while (--count != 0);

679

}

680

}

commit-bot@chromium.org

2758047

2014-03-07 03:25:32 +0000

[diff] [blame]

681

682

void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,

683

const SkPMColor* SK_RESTRICT src,

684

int count, U8CPU alpha, int x, int y) {

685

SkASSERT(255 == alpha);

if (count <= 0) {

return;

}

if (count >= 8) {

while (((size_t)dst & 0x0F) != 0) {

693

DITHER_565_SCAN(y);

694

SkPMColor c = *src++;

695

SkPMColorAssert(c);

696

697

unsigned dither = DITHER_VALUE(x);

698

*dst++ = SkDitherRGB32To565(c, dither);

DITHER_INC_X(x);

count--;

}

unsigned short dither_value[8];

704

__m128i dither;

705

#ifdef ENABLE_DITHER_MATRIX_4X4

706

const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];

707

dither_value[0] = dither_value[4] = dither_scan[(x) & 3];

708

dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];

709

dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];

710

dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];

711

#else

712

const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];

713

dither_value[0] = dither_value[4] = (dither_scan

714

>> (((x) & 3) << 2)) & 0xF;

715

dither_value[1] = dither_value[5] = (dither_scan

716

>> (((x + 1) & 3) << 2)) & 0xF;

717

dither_value[2] = dither_value[6] = (dither_scan

718

>> (((x + 2) & 3) << 2)) & 0xF;

719

dither_value[3] = dither_value[7] = (dither_scan

720

>> (((x + 3) & 3) << 2)) & 0xF;

721

#endif

722

dither = _mm_loadu_si128((__m128i*) dither_value);

723

724

const __m128i* s = reinterpret_cast<const __m128i*>(src);

725

__m128i* d = reinterpret_cast<__m128i*>(dst);

726

727

while (count >= 8) {

728

// Load 8 pixels of src.

729

__m128i src_pixel1 = _mm_loadu_si128(s++);

730

__m128i src_pixel2 = _mm_loadu_si128(s++);

731

732

// Extract R from src.

733

__m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));

734

sr1 = _mm_srli_epi32(sr1, 24);

735

__m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));

736

sr2 = _mm_srli_epi32(sr2, 24);

737

__m128i sr = _mm_packs_epi32(sr1, sr2);

738

739

// SkDITHER_R32To565(sr, dither)

740

__m128i sr_offset = _mm_srli_epi16(sr, 5);

741

sr = _mm_add_epi16(sr, dither);

742

sr = _mm_sub_epi16(sr, sr_offset);

743

sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);

744

745

// Extract G from src.

746

__m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));

747

sg1 = _mm_srli_epi32(sg1, 24);

748

__m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));

749

sg2 = _mm_srli_epi32(sg2, 24);

750

__m128i sg = _mm_packs_epi32(sg1, sg2);

751

752

// SkDITHER_R32To565(sg, dither)

753

__m128i sg_offset = _mm_srli_epi16(sg, 6);

754

sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));

755

sg = _mm_sub_epi16(sg, sg_offset);

756

sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);

757

758

// Extract B from src.

759

__m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));

760

sb1 = _mm_srli_epi32(sb1, 24);

761

__m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));

762

sb2 = _mm_srli_epi32(sb2, 24);

763

__m128i sb = _mm_packs_epi32(sb1, sb2);

764

765

// SkDITHER_R32To565(sb, dither)

766

__m128i sb_offset = _mm_srli_epi16(sb, 5);

767

sb = _mm_add_epi16(sb, dither);

768

sb = _mm_sub_epi16(sb, sb_offset);

769

sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);

770

771

// Pack and store 16-bit dst pixel.

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

772

__m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);

commit-bot@chromium.org

2758047

2014-03-07 03:25:32 +0000

[diff] [blame]

773

_mm_store_si128(d++, d_pixel);

count -= 8;

x += 8;

}

src = reinterpret_cast<const SkPMColor*>(s);

780

dst = reinterpret_cast<uint16_t*>(d);

}

if (count > 0) {

DITHER_565_SCAN(y);

do {

SkPMColor c = *src++;

787

SkPMColorAssert(c);

788

789

unsigned dither = DITHER_VALUE(x);

790

*dst++ = SkDitherRGB32To565(c, dither);

791

DITHER_INC_X(x);

792

} while (--count != 0);

793

}

794

}

commit-bot@chromium.org

2014-03-07 13:24:42 +0000

[diff] [blame]

795

796

/* SSE2 version of S32A_D565_Opaque_Dither()

797

* portable version is in core/SkBlitRow_D16.cpp

798

*/

799

void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,

800

const SkPMColor* SK_RESTRICT src,

801

int count, U8CPU alpha, int x, int y) {

802

SkASSERT(255 == alpha);

if (count <= 0) {

return;

}

if (count >= 8) {

while (((size_t)dst & 0x0F) != 0) {

810

DITHER_565_SCAN(y);

811

SkPMColor c = *src++;

812

SkPMColorAssert(c);

813

if (c) {

814

unsigned a = SkGetPackedA32(c);

815

816

int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));

817

818

unsigned sr = SkGetPackedR32(c);

819

unsigned sg = SkGetPackedG32(c);

820

unsigned sb = SkGetPackedB32(c);

821

sr = SkDITHER_R32_FOR_565(sr, d);

822

sg = SkDITHER_G32_FOR_565(sg, d);

823

sb = SkDITHER_B32_FOR_565(sb, d);

824

825

uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);

826

uint32_t dst_expanded = SkExpand_rgb_16(*dst);

827

dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);

828

// now src and dst expanded are in g:11 r:10 x:1 b:10

829

*dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);

}

dst += 1;

DITHER_INC_X(x);

count--;

}

unsigned short dither_value[8];

837

__m128i dither, dither_cur;

838

#ifdef ENABLE_DITHER_MATRIX_4X4

839

const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];

840

dither_value[0] = dither_value[4] = dither_scan[(x) & 3];

841

dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];

842

dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];

843

dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];

844

#else

845

const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];

846

dither_value[0] = dither_value[4] = (dither_scan

847

>> (((x) & 3) << 2)) & 0xF;

848

dither_value[1] = dither_value[5] = (dither_scan

849

>> (((x + 1) & 3) << 2)) & 0xF;

850

dither_value[2] = dither_value[6] = (dither_scan

851

>> (((x + 2) & 3) << 2)) & 0xF;

852

dither_value[3] = dither_value[7] = (dither_scan

853

>> (((x + 3) & 3) << 2)) & 0xF;

854

#endif

855

dither = _mm_loadu_si128((__m128i*) dither_value);

856

857

const __m128i* s = reinterpret_cast<const __m128i*>(src);

858

__m128i* d = reinterpret_cast<__m128i*>(dst);

859

__m128i var256 = _mm_set1_epi16(256);

860

__m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);

861

__m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);

862

__m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);

863

864

while (count >= 8) {

865

// Load 8 pixels of src and dst.

866

__m128i src_pixel1 = _mm_loadu_si128(s++);

867

__m128i src_pixel2 = _mm_loadu_si128(s++);

868

__m128i dst_pixel = _mm_load_si128(d);

869

870

// Extract A from src.

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

871

__m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));

commit-bot@chromium.org

2014-03-07 13:24:42 +0000

[diff] [blame]

872

sa1 = _mm_srli_epi32(sa1, 24);

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

873

__m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));

commit-bot@chromium.org

2014-03-07 13:24:42 +0000

[diff] [blame]

874

sa2 = _mm_srli_epi32(sa2, 24);

875

__m128i sa = _mm_packs_epi32(sa1, sa2);

876

877

// Calculate current dither value.

878

dither_cur = _mm_mullo_epi16(dither,

879

_mm_add_epi16(sa, _mm_set1_epi16(1)));

880

dither_cur = _mm_srli_epi16(dither_cur, 8);

881

882

// Extract R from src.

883

__m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));

884

sr1 = _mm_srli_epi32(sr1, 24);

885

__m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));

886

sr2 = _mm_srli_epi32(sr2, 24);

887

__m128i sr = _mm_packs_epi32(sr1, sr2);

888

889

// SkDITHER_R32_FOR_565(sr, d)

890

__m128i sr_offset = _mm_srli_epi16(sr, 5);

891

sr = _mm_add_epi16(sr, dither_cur);

892

sr = _mm_sub_epi16(sr, sr_offset);

893

894

// Expand sr.

895

sr = _mm_slli_epi16(sr, 2);

896

897

// Extract G from src.

898

__m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));

899

sg1 = _mm_srli_epi32(sg1, 24);

900

__m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));

901

sg2 = _mm_srli_epi32(sg2, 24);

902

__m128i sg = _mm_packs_epi32(sg1, sg2);

903

904

// sg = SkDITHER_G32_FOR_565(sg, d).

905

__m128i sg_offset = _mm_srli_epi16(sg, 6);

906

sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));

907

sg = _mm_sub_epi16(sg, sg_offset);

908

909

// Expand sg.

910

sg = _mm_slli_epi16(sg, 3);

911

912

// Extract B from src.

913

__m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));

914

sb1 = _mm_srli_epi32(sb1, 24);

915

__m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));

916

sb2 = _mm_srli_epi32(sb2, 24);

917

__m128i sb = _mm_packs_epi32(sb1, sb2);

918

919

// sb = SkDITHER_B32_FOR_565(sb, d).

920

__m128i sb_offset = _mm_srli_epi16(sb, 5);

921

sb = _mm_add_epi16(sb, dither_cur);

922

sb = _mm_sub_epi16(sb, sb_offset);

923

924

// Expand sb.

925

sb = _mm_slli_epi16(sb, 2);

926

927

// Extract R G B from dst.

928

__m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);

929

dr = _mm_and_si128(dr, r16_mask);

930

__m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);

931

dg = _mm_and_si128(dg, g16_mask);

932

__m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);

933

db = _mm_and_si128(db, b16_mask);

934

935

// SkAlpha255To256(255 - a) >> 3

936

__m128i isa = _mm_sub_epi16(var256, sa);

937

isa = _mm_srli_epi16(isa, 3);

938

939

dr = _mm_mullo_epi16(dr, isa);

940

dr = _mm_add_epi16(dr, sr);

941

dr = _mm_srli_epi16(dr, 5);

942

943

dg = _mm_mullo_epi16(dg, isa);

944

dg = _mm_add_epi16(dg, sg);

945

dg = _mm_srli_epi16(dg, 5);

946

947

db = _mm_mullo_epi16(db, isa);

948

db = _mm_add_epi16(db, sb);

949

db = _mm_srli_epi16(db, 5);

950

951

// Package and store dst pixel.

commit-bot@chromium.org

2014-04-09 15:43:46 +0000

[diff] [blame]

952

__m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);

commit-bot@chromium.org