Blame - src/opts/SkBitmapProcState_opts_SSE2.cpp - platform/external/skia

2009-11-30 20:00:29 +0000

[diff] [blame]

1

/*

epoger@google.com

ec3ed6a

2011-07-28 14:26:00 +0000

[diff] [blame]

2

3

*

4

* Use of this source code is governed by a BSD-style license that can be

5

* found in the LICENSE file.

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

6

*/

7

8

#include <emmintrin.h>

9

#include "SkBitmapProcState_opts_SSE2.h"

commit-bot@chromium.org

4b9b456

2014-04-28 15:07:50 +0000

[diff] [blame]

10

#include "SkColorPriv.h"

reed@google.com

9cfc83c

2013-07-22 17:18:18 +0000

[diff] [blame]

11

#include "SkPaint.h"

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

12

#include "SkUtils.h"

13

14

void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,

15

const uint32_t* xy,

16

int count, uint32_t* colors) {

halcanary

96fcdcc

2015-08-27 07:41:13 -0700

[diff] [blame]

17

SkASSERT(count > 0 && colors != nullptr);

reed

05a5647

2016-03-02 09:49:02 -0800

[diff] [blame]

18

SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);

reed

2015-06-04 14:12:25 -0700

[diff] [blame]

19

SkASSERT(kN32_SkColorType == s.fPixmap.colorType());

senorblanco@chromium.org

aa4f0c6

2009-12-01 13:36:19 +0000

[diff] [blame]

20

SkASSERT(s.fAlphaScale == 256);

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

21

reed

2015-06-04 14:12:25 -0700

[diff] [blame]

22

const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());

23

size_t rb = s.fPixmap.rowBytes();

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

24

uint32_t XY = *xy++;

25

unsigned y0 = XY >> 14;

26

const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);

27

const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);

28

unsigned subY = y0 & 0xF;

29

30

// ( 0, 0, 0, 0, 0, 0, 0, 16)

31

__m128i sixteen = _mm_cvtsi32_si128(16);

32

33

// ( 0, 0, 0, 0, 16, 16, 16, 16)

34

sixteen = _mm_shufflelo_epi16(sixteen, 0);

35

36

// ( 0, 0, 0, 0, 0, 0, 0, y)

37

__m128i allY = _mm_cvtsi32_si128(subY);

38

39

// ( 0, 0, 0, 0, y, y, y, y)

40

allY = _mm_shufflelo_epi16(allY, 0);

41

42

// ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)

43

__m128i negY = _mm_sub_epi16(sixteen, allY);

44

45

// (16-y, 16-y, 16-y, 16-y, y, y, y, y)

46

allY = _mm_unpacklo_epi64(allY, negY);

47

48

// (16, 16, 16, 16, 16, 16, 16, 16 )

49

sixteen = _mm_shuffle_epi32(sixteen, 0);

50

51

// ( 0, 0, 0, 0, 0, 0, 0, 0)

52

__m128i zero = _mm_setzero_si128();

53

do {

54

uint32_t XX = *xy++; // x0:14 | 4 | x1:14

55

unsigned x0 = XX >> 18;

56

unsigned x1 = XX & 0x3FFF;

57

58

// (0, 0, 0, 0, 0, 0, 0, x)

59

__m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

60

senorblanco@chromium.org

2009-11-30 20:00:29 +0000

[diff] [blame]

61

// (0, 0, 0, 0, x, x, x, x)

62

allX = _mm_shufflelo_epi16(allX, 0);

63

64

// (x, x, x, x, x, x, x, x)

65

allX = _mm_shuffle_epi32(allX, 0);

66

67

// (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)

68

__m128i negX = _mm_sub_epi16(sixteen, allX);

69

70

// Load 4 samples (pixels).

71

__m128i a00 = _mm_cvtsi32_si128(row0[x0]);

72

__m128i a01 = _mm_cvtsi32_si128(row0[x1]);

73

__m128i a10 = _mm_cvtsi32_si128(row1[x0]);

74

__m128i a11 = _mm_cvtsi32_si128(row1[x1]);

75

76

// (0, 0, a00, a10)

77

__m128i a00a10 = _mm_unpacklo_epi32(a10, a00);

78

79

// Expand to 16 bits per component.

80

a00a10 = _mm_unpacklo_epi8(a00a10, zero);

81

82

// ((a00 * (16-y)), (a10 * y)).

83

a00a10 = _mm_mullo_epi16(a00a10, allY);

84

85

// (a00 * (16-y) * (16-x), a10 * y * (16-x)).

86

a00a10 = _mm_mullo_epi16(a00a10, negX);

87

88

// (0, 0, a01, a10)

89

__m128i a01a11 = _mm_unpacklo_epi32(a11, a01);

90

91

// Expand to 16 bits per component.

92

a01a11 = _mm_unpacklo_epi8(a01a11, zero);

93

94

// (a01 * (16-y)), (a11 * y)

95

a01a11 = _mm_mullo_epi16(a01a11, allY);

96

97

// (a01 * (16-y) * x), (a11 * y * x)

98

a01a11 = _mm_mullo_epi16(a01a11, allX);

99

100

// (a00*w00 + a01*w01, a10*w10 + a11*w11)

101

__m128i sum = _mm_add_epi16(a00a10, a01a11);

102

103

// (DC, a00*w00 + a01*w01)

104

__m128i shifted = _mm_shuffle_epi32(sum, 0xEE);

105

106

// (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)

107

sum = _mm_add_epi16(sum, shifted);

108

109

// Divide each 16 bit component by 256.

110

sum = _mm_srli_epi16(sum, 8);

111

112

// Pack lower 4 16 bit values of sum into lower 4 bytes.

113

sum = _mm_packus_epi16(sum, zero);

114

115

// Extract low int and store.

116

*colors++ = _mm_cvtsi128_si32(sum);

117

} while (--count > 0);

118

}

senorblanco@chromium.org

2009-12-10 22:46:31 +0000

[diff] [blame]

119

120

void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,

121

const uint32_t* xy,

122

int count, uint32_t* colors) {

halcanary

96fcdcc

2015-08-27 07:41:13 -0700

[diff] [blame]

123

SkASSERT(count > 0 && colors != nullptr);

reed

05a5647

2016-03-02 09:49:02 -0800

[diff] [blame]

124

SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);

reed

2015-06-04 14:12:25 -0700

[diff] [blame]

125

SkASSERT(kN32_SkColorType == s.fPixmap.colorType());

senorblanco@chromium.org

2009-12-10 22:46:31 +0000

[diff] [blame]

126

SkASSERT(s.fAlphaScale < 256);

127

reed

2015-06-04 14:12:25 -0700

[diff] [blame]

128

const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());

129

size_t rb = s.fPixmap.rowBytes();

senorblanco@chromium.org

2009-12-10 22:46:31 +0000

[diff] [blame]

130

uint32_t XY = *xy++;

131

unsigned y0 = XY >> 14;

132

const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);

133

const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);

134

unsigned subY = y0 & 0xF;

135

136

// ( 0, 0, 0, 0, 0, 0, 0, 16)

137

__m128i sixteen = _mm_cvtsi32_si128(16);

138

139

// ( 0, 0, 0, 0, 16, 16, 16, 16)

140

sixteen = _mm_shufflelo_epi16(sixteen, 0);

141

142

// ( 0, 0, 0, 0, 0, 0, 0, y)

143

__m128i allY = _mm_cvtsi32_si128(subY);

144

145

// ( 0, 0, 0, 0, y, y, y, y)

146

allY = _mm_shufflelo_epi16(allY, 0);

147

148

// ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)

149

__m128i negY = _mm_sub_epi16(sixteen, allY);

150

151

// (16-y, 16-y, 16-y, 16-y, y, y, y, y)

152

allY = _mm_unpacklo_epi64(allY, negY);

153

154

// (16, 16, 16, 16, 16, 16, 16, 16 )

155

sixteen = _mm_shuffle_epi32(sixteen, 0);

156

157

// ( 0, 0, 0, 0, 0, 0, 0, 0)

158

__m128i zero = _mm_setzero_si128();

159

160

// ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )

161

__m128i alpha = _mm_set1_epi16(s.fAlphaScale);

162

163

do {

164

uint32_t XX = *xy++; // x0:14 | 4 | x1:14

165

unsigned x0 = XX >> 18;

166

unsigned x1 = XX & 0x3FFF;

167

168

// (0, 0, 0, 0, 0, 0, 0, x)

169

__m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

170

senorblanco@chromium.org

2009-12-10 22:46:31 +0000

[diff] [blame]

171

// (0, 0, 0, 0, x, x, x, x)

172

allX = _mm_shufflelo_epi16(allX, 0);

173

174

// (x, x, x, x, x, x, x, x)

175

allX = _mm_shuffle_epi32(allX, 0);

176

177

// (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)

178

__m128i negX = _mm_sub_epi16(sixteen, allX);

179

180

// Load 4 samples (pixels).

181

__m128i a00 = _mm_cvtsi32_si128(row0[x0]);

182

__m128i a01 = _mm_cvtsi32_si128(row0[x1]);

183

__m128i a10 = _mm_cvtsi32_si128(row1[x0]);

184

__m128i a11 = _mm_cvtsi32_si128(row1[x1]);

185

186

// (0, 0, a00, a10)

187

__m128i a00a10 = _mm_unpacklo_epi32(a10, a00);

188

189

// Expand to 16 bits per component.

190

a00a10 = _mm_unpacklo_epi8(a00a10, zero);

191

192

// ((a00 * (16-y)), (a10 * y)).

193

a00a10 = _mm_mullo_epi16(a00a10, allY);

194

195

// (a00 * (16-y) * (16-x), a10 * y * (16-x)).

196

a00a10 = _mm_mullo_epi16(a00a10, negX);

197

198

// (0, 0, a01, a10)

199

__m128i a01a11 = _mm_unpacklo_epi32(a11, a01);

200

201

// Expand to 16 bits per component.

202

a01a11 = _mm_unpacklo_epi8(a01a11, zero);

203

204

// (a01 * (16-y)), (a11 * y)

205

a01a11 = _mm_mullo_epi16(a01a11, allY);

206

207

// (a01 * (16-y) * x), (a11 * y * x)

208

a01a11 = _mm_mullo_epi16(a01a11, allX);

209

210

// (a00*w00 + a01*w01, a10*w10 + a11*w11)

211

__m128i sum = _mm_add_epi16(a00a10, a01a11);

212

213

// (DC, a00*w00 + a01*w01)

214

__m128i shifted = _mm_shuffle_epi32(sum, 0xEE);

215

216

// (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)

217

sum = _mm_add_epi16(sum, shifted);

218

219

// Divide each 16 bit component by 256.

220

sum = _mm_srli_epi16(sum, 8);

221

222

// Multiply by alpha.

223

sum = _mm_mullo_epi16(sum, alpha);

224

225

// Divide each 16 bit component by 256.

226

sum = _mm_srli_epi16(sum, 8);

227

228

// Pack lower 4 16 bit values of sum into lower 4 bytes.

229

sum = _mm_packus_epi16(sum, zero);

230

231

// Extract low int and store.

232

*colors++ = _mm_cvtsi128_si32(sum);

233

} while (--count > 0);

234

}

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

235

236

/* Packs one filtered coordinate into a single word laid out as
 *     clamp(f >> 16):14 | 4 fractional bits of f | clamp((f + one) >> 16):14
 * where both integer coordinates are clamped with SkClampMax(..., max).
 */
static inline uint32_t ClampX_ClampY_pack_filter(SkFixed f, unsigned max,
                                                 SkFixed one) {
    const unsigned first = SkClampMax(f >> 16, max);
    // Bits 12..15 of the fixed-point value are the top four fractional bits.
    const unsigned frac = (f >> 12) & 0xF;
    const unsigned firstAndFrac = (first << 4) | frac;
    return (firstAndFrac << 14) | SkClampMax((f + one) >> 16, max);
}

242

243

/* SSE version of ClampX_ClampY_filter_scale()

244

* portable version is in core/SkBitmapProcState_matrix.h

245

*/

246

void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],

247

int count, int x, int y) {

248

SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |

249

SkMatrix::kScale_Mask)) == 0);

250

SkASSERT(s.fInvKy == 0);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

251

reed

2015-06-04 14:12:25 -0700

[diff] [blame]

252

const unsigned maxX = s.fPixmap.width() - 1;

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

253

const SkFixed one = s.fFilterOneX;

254

const SkFixed dx = s.fInvSx;

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

255

fmalita

2404f03

2016-02-03 05:44:21 -0800

[diff] [blame]

256

const SkBitmapProcStateAutoMapper mapper(s, x, y);

fmalita

2016-02-03 10:21:33 -0800

[diff] [blame]

257

const SkFixed fy = mapper.fixedY();

reed

2015-06-04 14:12:25 -0700

[diff] [blame]

258

const unsigned maxY = s.fPixmap.height() - 1;

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

259

// compute our two Y values up front

260

*xy++ = ClampX_ClampY_pack_filter(fy, maxY, s.fFilterOneY);

261

// now initialize fx

fmalita

2016-02-03 10:21:33 -0800

[diff] [blame]

262

SkFixed fx = mapper.fixedX();

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

263

264

// test if we don't need to apply the tile proc

265

if (dx > 0 && (unsigned)(fx >> 16) <= maxX &&

266

(unsigned)((fx + dx * (count - 1)) >> 16) < maxX) {

267

if (count >= 4) {

268

// SSE version of decal_filter_scale

269

while ((size_t(xy) & 0x0F) != 0) {

270

SkASSERT((fx >> (16 + 14)) == 0);

271

*xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);

fx += dx;

count--;

}

__m128i wide_1 = _mm_set1_epi32(1);

277

__m128i wide_dx4 = _mm_set1_epi32(dx * 4);

278

__m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2,

279

fx + dx, fx);

280

281

while (count >= 4) {

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

282

__m128i wide_out;

283

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

284

wide_out = _mm_slli_epi32(_mm_srai_epi32(wide_fx, 12), 14);

285

wide_out = _mm_or_si128(wide_out, _mm_add_epi32(

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

286

_mm_srai_epi32(wide_fx, 16), wide_1));

287

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

288

_mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_out);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

289

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

290

xy += 4;

291

fx += dx * 4;

292

wide_fx = _mm_add_epi32(wide_fx, wide_dx4);

293

count -= 4;

294

} // while count >= 4

295

} // if count >= 4

296

297

while (count-- > 0) {

298

SkASSERT((fx >> (16 + 14)) == 0);

299

*xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);

fx += dx;

}

} else {

// SSE2 only support 16bit interger max & min, so only process the case

304

// maxX less than the max 16bit interger. Actually maxX is the bitmap's

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

305

// height, there should be rare bitmap whose height will be greater

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

306

// than max 16bit interger in the real world.

307

if ((count >= 4) && (maxX <= 0xFFFF)) {

308

while (((size_t)xy & 0x0F) != 0) {

309

*xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);

310

fx += dx;

311

count--;

312

}

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

313

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

314

__m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2,

315

fx + dx, fx);

316

__m128i wide_dx4 = _mm_set1_epi32(dx * 4);

317

__m128i wide_one = _mm_set1_epi32(one);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

318

__m128i wide_maxX = _mm_set1_epi32(maxX);

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

319

__m128i wide_mask = _mm_set1_epi32(0xF);

while (count >= 4) {

__m128i wide_i;

__m128i wide_lo;

__m128i wide_fx1;

// i = SkClampMax(f>>16,maxX)

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

327

wide_i = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

328

_mm_setzero_si128());

329

wide_i = _mm_min_epi16(wide_i, wide_maxX);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

330

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

331

// i<<4 | TILEX_LOW_BITS(fx)

332

wide_lo = _mm_srli_epi32(wide_fx, 12);

333

wide_lo = _mm_and_si128(wide_lo, wide_mask);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

334

wide_i = _mm_slli_epi32(wide_i, 4);

335

wide_i = _mm_or_si128(wide_i, wide_lo);

336

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

337

// i<<14

338

wide_i = _mm_slli_epi32(wide_i, 14);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

339

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

340

// SkClampMax(((f+one))>>16,max)

341

wide_fx1 = _mm_add_epi32(wide_fx, wide_one);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

342

wide_fx1 = _mm_max_epi16(_mm_srli_epi32(wide_fx1, 16),

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

343

_mm_setzero_si128());

344

wide_fx1 = _mm_min_epi16(wide_fx1, wide_maxX);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

345

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

346

// final combination

347

wide_i = _mm_or_si128(wide_i, wide_fx1);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

348

_mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);

349

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

350

wide_fx = _mm_add_epi32(wide_fx, wide_dx4);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

351

fx += dx * 4;

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

352

xy += 4;

353

count -= 4;

354

} // while count >= 4

355

} // if count >= 4

356

357

while (count-- > 0) {

358

*xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);

fx += dx;

}

}

}

/* SSE version of ClampX_ClampY_nofilter_scale()

365

* portable version is in core/SkBitmapProcState_matrix.h

366

*/

367

void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,

368

uint32_t xy[], int count, int x, int y) {

369

SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |

370

SkMatrix::kScale_Mask)) == 0);

371

372

// we store y, x, x, x, x, x

reed

2015-06-04 14:12:25 -0700

[diff] [blame]

373

const unsigned maxX = s.fPixmap.width() - 1;

fmalita

eb54307

2016-02-02 10:17:24 -0800

[diff] [blame]

374

const SkBitmapProcStateAutoMapper mapper(s, x, y);

reed

2015-06-04 14:12:25 -0700

[diff] [blame]

375

const unsigned maxY = s.fPixmap.height() - 1;

fmalita

2016-02-03 10:21:33 -0800

[diff] [blame]

376

*xy++ = SkClampMax(mapper.intY(), maxY);

377

SkFixed fx = mapper.fixedX();

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

378

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

379

if (0 == maxX) {

380

// all of the following X values must be 0

381

memset(xy, 0, count * sizeof(uint16_t));

return;

}

const SkFixed dx = s.fInvSx;

386

387

// test if we don't need to apply the tile proc

388

if ((unsigned)(fx >> 16) <= maxX &&

389

(unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {

390

// SSE version of decal_nofilter_scale

391

if (count >= 8) {

392

while (((size_t)xy & 0x0F) != 0) {

393

*xy++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);

fx += 2 * dx;

count -= 2;

}

__m128i wide_dx4 = _mm_set1_epi32(dx * 4);

399

__m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);

400

401

__m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,

402

fx + dx, fx);

403

__m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);

404

405

while (count >= 8) {

406

__m128i wide_out_low = _mm_srli_epi32(wide_low, 16);

407

__m128i wide_out_high = _mm_srli_epi32(wide_high, 16);

408

409

__m128i wide_result = _mm_packs_epi32(wide_out_low,

410

wide_out_high);

411

_mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

412

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

413

wide_low = _mm_add_epi32(wide_low, wide_dx8);

414

wide_high = _mm_add_epi32(wide_high, wide_dx8);

xy += 4;

fx += dx * 8;

count -= 8;

}

} // if count >= 8

uint16_t* xx = reinterpret_cast<uint16_t*>(xy);

423

while (count-- > 0) {

424

*xx++ = SkToU16(fx >> 16);

fx += dx;

}

} else {

// SSE2 only support 16bit interger max & min, so only process the case

429

// maxX less than the max 16bit interger. Actually maxX is the bitmap's

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

430

// height, there should be rare bitmap whose height will be greater

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

431

// than max 16bit interger in the real world.

432

if ((count >= 8) && (maxX <= 0xFFFF)) {

433

while (((size_t)xy & 0x0F) != 0) {

mike@reedtribe.org

602f227

2012-03-14 02:04:40 +0000

[diff] [blame]

434

*xy++ = pack_two_shorts(SkClampMax((fx + dx) >> 16, maxX),

435

SkClampMax(fx >> 16, maxX));

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

fx += 2 * dx;

count -= 2;

}

__m128i wide_dx4 = _mm_set1_epi32(dx * 4);

441

__m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);

442

443

__m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,

444

fx + dx, fx);

445

__m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);

446

__m128i wide_maxX = _mm_set1_epi32(maxX);

447

448

while (count >= 8) {

449

__m128i wide_out_low = _mm_srli_epi32(wide_low, 16);

450

__m128i wide_out_high = _mm_srli_epi32(wide_high, 16);

451

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

452

wide_out_low = _mm_max_epi16(wide_out_low,

tomhudson@google.com

2012-02-22 18:30:43 +0000

[diff] [blame]

453

_mm_setzero_si128());

454

wide_out_low = _mm_min_epi16(wide_out_low, wide_maxX);

455

wide_out_high = _mm_max_epi16(wide_out_high,

456

_mm_setzero_si128());

457

wide_out_high = _mm_min_epi16(wide_out_high, wide_maxX);

458

459

__m128i wide_result = _mm_packs_epi32(wide_out_low,

460

wide_out_high);

461

_mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);

462

463

wide_low = _mm_add_epi32(wide_low, wide_dx8);

464

wide_high = _mm_add_epi32(wide_high, wide_dx8);

xy += 4;

fx += dx * 8;

count -= 8;

}

} // if count >= 8

uint16_t* xx = reinterpret_cast<uint16_t*>(xy);

473

while (count-- > 0) {

474

*xx++ = SkClampMax(fx >> 16, maxX);

fx += dx;

}

}

}

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

479

480

/* SSE version of ClampX_ClampY_filter_affine()

481

* portable version is in core/SkBitmapProcState_matrix.h

482

*/

483

void ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState& s,

484

uint32_t xy[], int count, int x, int y) {

fmalita

2404f03

2016-02-03 05:44:21 -0800

[diff] [blame]

485

const SkBitmapProcStateAutoMapper mapper(s, x, y);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

486

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

487

SkFixed oneX = s.fFilterOneX;

488

SkFixed oneY = s.fFilterOneY;

fmalita

2016-02-03 10:21:33 -0800

[diff] [blame]

489

SkFixed fx = mapper.fixedX();

490

SkFixed fy = mapper.fixedY();

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

491

SkFixed dx = s.fInvSx;

492

SkFixed dy = s.fInvKy;

reed

2015-06-04 14:12:25 -0700

[diff] [blame]

493

unsigned maxX = s.fPixmap.width() - 1;

494

unsigned maxY = s.fPixmap.height() - 1;

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

495

496

if (count >= 2 && (maxX <= 0xFFFF)) {

497

SkFixed dx2 = dx + dx;

498

SkFixed dy2 = dy + dy;

499

500

__m128i wide_f = _mm_set_epi32(fx + dx, fy + dy, fx, fy);

501

__m128i wide_d2 = _mm_set_epi32(dx2, dy2, dx2, dy2);

502

__m128i wide_one = _mm_set_epi32(oneX, oneY, oneX, oneY);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

503

__m128i wide_max = _mm_set_epi32(maxX, maxY, maxX, maxY);

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

504

__m128i wide_mask = _mm_set1_epi32(0xF);

505

506

while (count >= 2) {

507

// i = SkClampMax(f>>16,maxX)

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

508

__m128i wide_i = _mm_max_epi16(_mm_srli_epi32(wide_f, 16),

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

509

_mm_setzero_si128());

510

wide_i = _mm_min_epi16(wide_i, wide_max);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

511

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

512

// i<<4 | TILEX_LOW_BITS(f)

513

__m128i wide_lo = _mm_srli_epi32(wide_f, 12);

514

wide_lo = _mm_and_si128(wide_lo, wide_mask);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

515

wide_i = _mm_slli_epi32(wide_i, 4);

516

wide_i = _mm_or_si128(wide_i, wide_lo);

517

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

518

// i<<14

519

wide_i = _mm_slli_epi32(wide_i, 14);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

520

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

521

// SkClampMax(((f+one))>>16,max)

522

__m128i wide_f1 = _mm_add_epi32(wide_f, wide_one);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

523

wide_f1 = _mm_max_epi16(_mm_srli_epi32(wide_f1, 16),

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

524

_mm_setzero_si128());

525

wide_f1 = _mm_min_epi16(wide_f1, wide_max);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

526

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

527

// final combination

528

wide_i = _mm_or_si128(wide_i, wide_f1);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

529

_mm_storeu_si128(reinterpret_cast<__m128i*>(xy), wide_i);

530

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

531

wide_f = _mm_add_epi32(wide_f, wide_d2);

532

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

533

fx += dx2;

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

fy += dy2;

xy += 4;

count -= 2;

} // while count >= 2

538

} // if count >= 2

539

540

while (count-- > 0) {

541

*xy++ = ClampX_ClampY_pack_filter(fy, maxY, oneY);

542

fy += dy;

543

*xy++ = ClampX_ClampY_pack_filter(fx, maxX, oneX);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

544

fx += dx;

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

}

}

/* SSE version of ClampX_ClampY_nofilter_affine()

549

* portable version is in core/SkBitmapProcState_matrix.h

550

*/

551

void ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s,

552

uint32_t xy[], int count, int x, int y) {

553

SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);

554

SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |

555

SkMatrix::kScale_Mask |

556

SkMatrix::kAffine_Mask)) == 0);

557

fmalita

eb54307

2016-02-02 10:17:24 -0800

[diff] [blame]

558

const SkBitmapProcStateAutoMapper mapper(s, x, y);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

559

fmalita

2016-02-03 10:21:33 -0800

[diff] [blame]

560

SkFixed fx = mapper.fixedX();

561

SkFixed fy = mapper.fixedY();

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

562

SkFixed dx = s.fInvSx;

563

SkFixed dy = s.fInvKy;

reed

2015-06-04 14:12:25 -0700

[diff] [blame]

564

int maxX = s.fPixmap.width() - 1;

565

int maxY = s.fPixmap.height() - 1;

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

566

567

if (count >= 4 && (maxX <= 0xFFFF)) {

568

while (((size_t)xy & 0x0F) != 0) {

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

569

*xy++ = (SkClampMax(fy >> 16, maxY) << 16) |

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

570

SkClampMax(fx >> 16, maxX);

fx += dx;

fy += dy;

count--;

}

SkFixed dx4 = dx * 4;

577

SkFixed dy4 = dy * 4;

578

579

__m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2,

580

fx + dx, fx);

581

__m128i wide_fy = _mm_set_epi32(fy + dy * 3, fy + dy * 2,

582

fy + dy, fy);

583

__m128i wide_dx4 = _mm_set1_epi32(dx4);

584

__m128i wide_dy4 = _mm_set1_epi32(dy4);

585

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

586

__m128i wide_maxX = _mm_set1_epi32(maxX);

587

__m128i wide_maxY = _mm_set1_epi32(maxY);

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

588

589

while (count >= 4) {

590

// SkClampMax(fx>>16,maxX)

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

591

__m128i wide_lo = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

592

_mm_setzero_si128());

593

wide_lo = _mm_min_epi16(wide_lo, wide_maxX);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

594

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

595

// SkClampMax(fy>>16,maxY)

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

596

__m128i wide_hi = _mm_max_epi16(_mm_srli_epi32(wide_fy, 16),

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

597

_mm_setzero_si128());

598

wide_hi = _mm_min_epi16(wide_hi, wide_maxY);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

599

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

600

// final combination

601

__m128i wide_i = _mm_or_si128(_mm_slli_epi32(wide_hi, 16),

602

wide_lo);

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

603

_mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);

604

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

605

wide_fx = _mm_add_epi32(wide_fx, wide_dx4);

606

wide_fy = _mm_add_epi32(wide_fy, wide_dy4);

607

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

608

fx += dx4;

tomhudson@google.com

2012-02-28 15:41:49 +0000

[diff] [blame]

fy += dy4;

xy += 4;

count -= 4;

} // while count >= 4

613

} // if count >= 4

614

615

while (count-- > 0) {

616

*xy++ = (SkClampMax(fy >> 16, maxY) << 16) |

617

SkClampMax(fx >> 16, maxX);

618

fx += dx;

rmistry@google.com

2012-08-23 18:09:54 +0000

[diff] [blame]

619

fy += dy;

tomhudson@google.com