Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 1 | /* |
| 2 | * Copyright 2018 Google Inc. |
| 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license that can be |
| 5 | * found in the LICENSE file. |
| 6 | */ |
| 7 | |
| 8 | #ifndef SkBitmapProcState_opts_DEFINED |
| 9 | #define SkBitmapProcState_opts_DEFINED |
| 10 | |
| 11 | #include "SkBitmapProcState.h" |
| 12 | |
| 13 | // SkBitmapProcState optimized Shader, Sample, or Matrix procs. |
| 14 | // |
| 15 | // Only S32_alpha_D32_filter_DX exploits instructions beyond |
| 16 | // our common baseline SSE2/NEON instruction sets, so that's |
| 17 | // all that lives here. |
| 18 | // |
| 19 | // The rest are scattershot at the moment but I want to get them |
| 20 | // all migrated to be normal code inside SkBitmapProcState.cpp. |
| 21 | |
| 22 | #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| 23 | #include <immintrin.h> |
| 24 | #elif defined(SK_ARM_HAS_NEON) |
| 25 | #include <arm_neon.h> |
| 26 | #endif |
| 27 | |
| 28 | namespace SK_OPTS_NS { |
| 29 | |
// This same basic packing scheme is used throughout the file.
// Each packed entry is laid out high-to-low as [ v0:14 | w:4 | v1:14 ]:
// two integer coordinates and a 4-bit interpolation weight applied to v1.
static void decode_packed_coordinates_and_weight(uint32_t packed, int* v0, int* v1, int* w) {
    const uint32_t kCoordMask  = (1u << 14) - 1;   // 14-bit coordinate field
    const uint32_t kWeightMask = (1u <<  4) - 1;   //  4-bit weight field

    *v1 = (int)( packed        & kCoordMask );     // bottom 14 bits: x1 or y1
    *w  = (int)((packed >> 14) & kWeightMask);     // middle 4 bits: weight for v1
    *v0 = (int)( packed >> 18               );     // top 14 bits: x0 or y0
}
| 41 | |
| 42 | #if 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 43 | |
// As above, but decoding four packed entries at once.
// The coordinates spill to memory (they're used as scalar array indices below);
// the four weights stay in a vector, one per 32-bit lane of *w.
static void decode_packed_coordinates_and_weight(__m128i packed,
                                                 int v0[4], int v1[4], __m128i* w) {
    _mm_storeu_si128((__m128i*)v0, _mm_srli_epi32(packed, 18));                      // top 14 bits
    _mm_storeu_si128((__m128i*)v1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff)));  // bottom 14 bits
    *w = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf));             // middle 4 bits
}
| 51 | |
// This is the crux of the SSSE3 implementation,
// interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16().
// Returns eight 16-bit lanes: pixel A's four channels in the low half, B's in the
// high half, each channel scaled up by its [0,16] x-weights (sum of the pair).
static inline __m128i interpolate_in_x(uint32_t A0, uint32_t A1,
                                       uint32_t B0, uint32_t B1,
                                       const __m128i& interlaced_x_weights) {
    // _mm_maddubs_epi16() is a little idiosyncratic, but very helpful as the core of a lerp.
    //
    // It takes two arguments interlaced byte-wise:
    //    - first  arg: [ x,y, ... 7 more pairs of 8-bit values ...]
    //    - second arg: [ z,w, ... 7 more pairs of 8-bit values ...]
    // and returns 8 16-bit values: [ x*z + y*w, ... 7 more 16-bit values ... ].
    //
    // That's why we go to all this trouble to make interlaced_x_weights,
    // and here we're interlacing A0 with A1, B0 with B1 to match.

    // Interlace the two pixels of each pair channel-by-channel: [A0.b,A1.b, A0.g,A1.g, ...].
    __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),
            interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));

    return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),
                             interlaced_x_weights);
}
| 73 | |
// Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
// Each quad is a 2x2 grid: {0,1} from the top row (y0), {2,3} from the bottom row (y1).
// Returns two pixels, with each channel in a 16-bit lane of the __m128i,
// already divided back down to [0,255] range.
static inline __m128i interpolate_in_x_and_y(uint32_t A0, uint32_t A1,
                                             uint32_t A2, uint32_t A3,
                                             uint32_t B0, uint32_t B1,
                                             uint32_t B2, uint32_t B3,
                                             const __m128i& interlaced_x_weights,
                                             int wy) {
    // The stored Y weight wy is for y1, and y0 gets a weight 16-wy.
    const __m128i wy1 = _mm_set1_epi16(wy),
                  wy0 = _mm_sub_epi16(_mm_set1_epi16(16), wy1);

    // First interpolate in X,
    // leaving the values in 16-bit lanes scaled up by those [0,16] interlaced_x_weights.
    __m128i row0 = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
            row1 = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);

    // Interpolate in Y across the two rows,
    // then scale everything down by the maximum total weight 16x16 = 256.
    return _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(row0, wy0),
                                        _mm_mullo_epi16(row1, wy1)), 8);
}
| 96 | |
// Bilinearly filter `count` N32 pixels along a row (DX mode: the Y pair/weight is
// constant for the whole span, taken from the first entry of xy), scale each result
// by s.fAlphaScale, and write them to colors.  SSSE3 path, four pixels per iteration.
/*not static*/ inline
void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                             const uint32_t* xy, int count, uint32_t* colors) {
    SkASSERT(count > 0 && colors != nullptr);
    SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
    SkASSERT(kN32_SkColorType == s.fPixmap.colorType());

    int alpha = s.fAlphaScale;

    // Return (px * s.fAlphaScale) / 256.   (s.fAlphaScale is in [0,256].)
    // When alpha == 256 this is an exact no-op, so skip the multiply entirely.
    auto scale_by_alpha = [alpha](const __m128i& px) {
        return alpha == 256 ? px
                            : _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(alpha)), 8);
    };

    // We're in _DX_ mode here, so we're only varying in X.
    // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
    // All the other entries in xy will be pairs of X coordinates and the X weight.
    int y0, y1, wy;
    decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

    auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
         row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());

    while (count >= 4) {
        // We can really get going, loading 4 X pairs at a time to produce 4 output pixels.
        const __m128i xx = _mm_loadu_si128((const __m128i*)xy);

        int x0[4],
            x1[4];
        __m128i wx;
        decode_packed_coordinates_and_weight(xx, x0, x1, &wx);

        // Splat out each x weight wx four times (one for each pixel channel) as wx1,
        // and sixteen minus that as the weight for x0, wx0.
        __m128i wx1 = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
                wx0 = _mm_sub_epi8(_mm_set1_epi8(16), wx1);

        // We need to interlace wx0 and wx1 for _mm_maddubs_epi16().
        __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wx0,wx1),
                interlaced_x_weights_CD = _mm_unpackhi_epi8(wx0,wx1);

        // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time
        // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each.
        __m128i AB = interpolate_in_x_and_y(row0[x0[0]], row0[x1[0]],
                                            row1[x0[0]], row1[x1[0]],
                                            row0[x0[1]], row0[x1[1]],
                                            row1[x0[1]], row1[x1[1]],
                                            interlaced_x_weights_AB, wy);

        // Once more with the other half of the x-weights for two more pixels C,D.
        __m128i CD = interpolate_in_x_and_y(row0[x0[2]], row0[x1[2]],
                                            row1[x0[2]], row1[x1[2]],
                                            row0[x0[3]], row0[x1[3]],
                                            row1[x0[3]], row1[x1[3]],
                                            interlaced_x_weights_CD, wy);

        // Scale by alpha, pack back together to 8-bit lanes, and write out four pixels!
        _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(scale_by_alpha(AB),
                                                            scale_by_alpha(CD)));
        xy     += 4;
        colors += 4;
        count  -= 4;
    }

    while (count --> 0) {
        // This is exactly the same flow as the count >= 4 loop above, but writing one pixel.
        int x0, x1, wx;
        decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

        // As above, splat out wx four times as wx1, and sixteen minus that as wx0.
        __m128i wx1 = _mm_set1_epi8(wx),    // This splats it out 16 times, but that's fine.
                wx0 = _mm_sub_epi8(_mm_set1_epi8(16), wx1);

        __m128i interlaced_x_weights_A = _mm_unpacklo_epi8(wx0, wx1);

        // The B pixel slots are fed zeros; only the low half (pixel A) is kept below.
        __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
                                           row1[x0], row1[x1],
                                           0, 0,
                                           0, 0,
                                           interlaced_x_weights_A, wy);

        *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(scale_by_alpha(A), _mm_setzero_si128()));
    }
}
| 182 | |
| 183 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 184 | #elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 185 | |
// TODO(mtklein): clean up this code.
// (It already shares decode_packed_coordinates_and_weight() with the SSSE3 path above.)

// Same contract as the SSSE3 version: bilinearly filter `count` N32 pixels in DX mode
// (constant Y pair/weight from the first xy entry), scale by s.fAlphaScale, store to
// colors.  SSE2 path, one pixel per iteration.
/*not static*/ inline
void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                             const uint32_t* xy, int count, uint32_t* colors) {
    SkASSERT(count > 0 && colors != nullptr);
    SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
    SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
    SkASSERT(s.fAlphaScale <= 256);

    int y0, y1, wy;
    decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

    auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),
         row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );

    // We'll put one pixel in the low 4 16-bit lanes to line up with wy,
    // and another in the upper 4 16-bit lanes to line up with 16 - wy.
    const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16(   wy),
                                            _mm_set1_epi16(16-wy));

    while (count --> 0) {
        int x0, x1, wx;
        decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

        // Load the 4 pixels we're interpolating.
        const __m128i a00 = _mm_cvtsi32_si128(row0[x0]),
                      a01 = _mm_cvtsi32_si128(row0[x1]),
                      a10 = _mm_cvtsi32_si128(row1[x0]),
                      a11 = _mm_cvtsi32_si128(row1[x1]);

        // Line up low-x pixels a00 and a10 with allY:
        // a10 lands in the low lanes (weight wy), a00 in the high lanes (weight 16-wy).
        __m128i a00a10 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(a10, a00),
                                           _mm_setzero_si128());

        // Scale by allY and 16-wx.
        a00a10 = _mm_mullo_epi16(a00a10, allY);
        a00a10 = _mm_mullo_epi16(a00a10, _mm_set1_epi16(16-wx));


        // Line up high-x pixels a01 and a11 with allY (same low/high layout as above).
        __m128i a01a11 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(a11, a01),
                                           _mm_setzero_si128());

        // Scale by allY and wx.
        a01a11 = _mm_mullo_epi16(a01a11, allY);
        a01a11 = _mm_mullo_epi16(a01a11, _mm_set1_epi16(wx));


        // Add the two intermediates, summing across in one direction.
        __m128i halves = _mm_add_epi16(a00a10, a01a11);

        // Add the two halves to each other to sum in the other direction.
        __m128i sum = _mm_add_epi16(halves, _mm_srli_si128(halves, 8));

        // Get back to [0,255] by dividing by maximum weight 16x16 = 256.
        sum = _mm_srli_epi16(sum, 8);

        if (s.fAlphaScale < 256) {
            // Scale by alpha, which is in [0,256].
            sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale));
            sum = _mm_srli_epi16(sum, 8);
        }

        // Pack back into 8-bit values and store.
        *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128()));
    }
}
| 254 | |
| 255 | #else |
| 256 | |
| 257 | // The NEON code only actually differs from the portable code in the |
| 258 | // filtering step after we've loaded all four pixels we want to bilerp. |
| 259 | |
| 260 | #if defined(SK_ARM_HAS_NEON) |
// Bilinearly blend four pixels with 4-bit weights: x weights the a01/a11 column,
// y weights the a10/a11 row (each corner gets weight totaling 16x16 = 256), then
// scale by alpha `scale` in [0,256] and store one premul pixel to *dst.
static void filter_and_scale_by_alpha(unsigned x, unsigned y,
                                      SkPMColor a00, SkPMColor a01,
                                      SkPMColor a10, SkPMColor a11,
                                      SkPMColor *dst,
                                      uint16_t scale) {
    uint8x8_t vy, vconst16_8, v16_y, vres;
    uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
    uint32x2_t va0, va1;
    uint16x8_t tmp1, tmp2;

    vy = vdup_n_u8(y);                // duplicate y into vy
    vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
    v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y

    va0 = vdup_n_u32(a00);            // duplicate a00
    va1 = vdup_n_u32(a10);            // duplicate a10
    va0 = vset_lane_u32(a01, va0, 1); // set top to a01
    va1 = vset_lane_u32(a11, va1, 1); // set top to a11

    // Widening multiplies interpolate in Y across both pixels of each row pair at once.
    tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
    tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y

    vx = vdup_n_u16(x);                // duplicate x into vx
    vconst16_16 = vmov_n_u16(16);      // set up constant in vconst16_16
    v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x

    // Now interpolate in X: accumulate the x1 column with weight x
    // and the x0 column with weight 16-x.
    tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp  = a01 * x
    tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11 * x
    tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)
    tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)

    if (scale < 256) {
        // Scale by alpha: divide by 256 first to keep headroom for the multiply.
        vscale = vdup_n_u16(scale);          // duplicate scale
        tmp = vshr_n_u16(tmp, 8);            // shift down result by 8
        tmp = vmul_u16(tmp, vscale);         // multiply result by scale
    }

    // Divide by the remaining weight/scale factor of 256 while narrowing to 8 bits.
    vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down result by 8
    vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);         // store result
}
| 301 | #else |
| 302 | static void filter_and_scale_by_alpha(unsigned x, unsigned y, |
| 303 | SkPMColor a00, SkPMColor a01, |
| 304 | SkPMColor a10, SkPMColor a11, |
| 305 | SkPMColor* dstColor, |
| 306 | unsigned alphaScale) { |
| 307 | SkASSERT((unsigned)x <= 0xF); |
| 308 | SkASSERT((unsigned)y <= 0xF); |
| 309 | SkASSERT(alphaScale <= 256); |
| 310 | |
| 311 | int xy = x * y; |
| 312 | const uint32_t mask = 0xFF00FF; |
| 313 | |
| 314 | int scale = 256 - 16*y - 16*x + xy; |
| 315 | uint32_t lo = (a00 & mask) * scale; |
| 316 | uint32_t hi = ((a00 >> 8) & mask) * scale; |
| 317 | |
| 318 | scale = 16*x - xy; |
| 319 | lo += (a01 & mask) * scale; |
| 320 | hi += ((a01 >> 8) & mask) * scale; |
| 321 | |
| 322 | scale = 16*y - xy; |
| 323 | lo += (a10 & mask) * scale; |
| 324 | hi += ((a10 >> 8) & mask) * scale; |
| 325 | |
| 326 | lo += (a11 & mask) * xy; |
| 327 | hi += ((a11 >> 8) & mask) * xy; |
| 328 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 329 | if (alphaScale < 256) { |
| 330 | lo = ((lo >> 8) & mask) * alphaScale; |
| 331 | hi = ((hi >> 8) & mask) * alphaScale; |
| 332 | } |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 333 | |
| 334 | *dstColor = ((lo >> 8) & mask) | (hi & ~mask); |
| 335 | } |
| 336 | #endif |
| 337 | |
| 338 | |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 339 | /*not static*/ inline |
| 340 | void S32_alpha_D32_filter_DX(const SkBitmapProcState& s, |
| 341 | const uint32_t* xy, int count, SkPMColor* colors) { |
| 342 | SkASSERT(count > 0 && colors != nullptr); |
| 343 | SkASSERT(s.fFilterQuality != kNone_SkFilterQuality); |
| 344 | SkASSERT(4 == s.fPixmap.info().bytesPerPixel()); |
| 345 | SkASSERT(s.fAlphaScale <= 256); |
| 346 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 347 | int y0, y1, wy; |
| 348 | decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 349 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 350 | auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ), |
| 351 | row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() ); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 352 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 353 | while (count --> 0) { |
| 354 | int x0, x1, wx; |
| 355 | decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx); |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 356 | |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 357 | filter_and_scale_by_alpha(wx, wy, |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 358 | row0[x0], row0[x1], |
| 359 | row1[x0], row1[x1], |
Mike Klein | 2c8e2bc | 2018-11-16 16:44:10 -0500 | [diff] [blame] | 360 | colors++, |
| 361 | s.fAlphaScale); |
| 362 | } |
Mike Klein | a2187bf | 2018-11-16 12:22:05 -0500 | [diff] [blame] | 363 | } |
| 364 | |
| 365 | #endif |
| 366 | |
| 367 | } // namespace SK_OPTS_NS |
| 368 | |
| 369 | #endif |