Blame - src/opts/SkBitmapProcState_opts.h - platform/external/skia

2018-11-16 12:22:05 -0500

[diff] [blame]

/*

*

* Use of this source code is governed by a BSD-style license that can be

5

* found in the LICENSE file.

6

*/

7

8

#ifndef SkBitmapProcState_opts_DEFINED

9

#define SkBitmapProcState_opts_DEFINED

10

Mike Klein

2019-09-26 15:34:34 -0500

[diff] [blame]

11

#include "include/private/SkVx.h"

Mike Klein

c0bd9f9

2019-04-23 12:05:21 -0500

[diff] [blame]

12

#include "src/core/SkBitmapProcState.h"

Mike Klein

2019-09-26 15:34:34 -0500

[diff] [blame]

13

#include "src/core/SkMSAN.h"

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

14

15

// SkBitmapProcState optimized Shader, Sample, or Matrix procs.

16

//

17

// Only S32_alpha_D32_filter_DX exploits instructions beyond

18

// our common baseline SSE2/NEON instruction sets, so that's

19

// all that lives here.

20

//

21

// The rest are scattershot at the moment but I want to get them

22

// all migrated to be normal code inside SkBitmapProcState.cpp.

23

24

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

25

#include <immintrin.h>

26

#elif defined(SK_ARM_HAS_NEON)

27

#include <arm_neon.h>

28

#endif

29

30

namespace SK_OPTS_NS {

31

Mike Klein

2018-11-16 16:44:10 -0500

[diff] [blame]

32

// This same basic packing scheme is used throughout the file.

Mike Klein

2019-09-26 15:34:34 -0500

[diff] [blame]

33

// Splits one packed 32-bit filter word into its three fields:
//
//     bits 31..18  ->  *v0   first integer coordinate  (x0 or y0)
//     bits 17..14  ->  *w    4-bit lerp weight for v1  (weight for v0 is 16-w)
//     bits 13..0   ->  *v1   second integer coordinate (x1 or y1)
//
// U32 is whatever holds the packed word(s); in this file it is either a single
// uint32_t or an skvx::Vec<8,uint32_t>, in which case all lanes are decoded at once.
// Out is the type the three results are written through.
template <typename U32, typename Out>
static void decode_packed_coordinates_and_weight(U32 packed, Out* v0, Out* v1, Out* w) {
    *v0 = (packed >> 18);       // Integer coordinate x0 or y0.
    *v1 = (packed & 0x3fff);    // Integer coordinate x1 or y1.
    *w  = (packed >> 14) & 0xf; // Lerp weight for v1; weight for v0 is 16-w.
}

39

Mike Klein

2019-09-26 15:34:34 -0500

[diff] [blame]

40

#if 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2

41

/*not static*/ inline

42

void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,

43

const uint32_t* xy, int count, uint32_t* colors) {

44

SkASSERT(count > 0 && colors != nullptr);

Mike Reed

2020-11-24 16:04:05 -0500

[diff] [blame]

45

SkASSERT(s.fBilerp);

Mike Klein

2019-09-26 15:34:34 -0500

[diff] [blame]

46

SkASSERT(kN32_SkColorType == s.fPixmap.colorType());

47

SkASSERT(s.fAlphaScale <= 256);

48

49

// In a _DX variant only X varies; all samples share y0/y1 coordinates and wy weight.

50

int y0, y1, wy;

51

decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

52

Mike Klein

666e4da

2019-09-27 11:21:20 -0500

[diff] [blame]

53

const uint32_t* row0 = s.fPixmap.addr32(0,y0);

54

const uint32_t* row1 = s.fPixmap.addr32(0,y1);

Mike Klein

2019-09-26 15:34:34 -0500

[diff] [blame]

55

56

auto bilerp = [&](skvx::Vec<8,uint32_t> packed_x_coordinates) -> skvx::Vec<8,uint32_t> {

57

// Decode up to 8 output pixels' x-coordinates and weights.

58

skvx::Vec<8,uint32_t> x0,x1,wx;

59

decode_packed_coordinates_and_weight(packed_x_coordinates, &x0, &x1, &wx);

60

61

// Splat wx to each color channel.

wx = (wx << 0)

| (wx << 8)

| (wx << 16)

| (wx << 24);

Mike Klein

2019-09-27 11:21:20 -0500

[diff] [blame]

67

auto gather = [](const uint32_t* ptr, skvx::Vec<8,uint32_t> ix) {

68

#if 1

69

// Drop into AVX2 intrinsics for vpgatherdd.

70

return skvx::bit_pun<skvx::Vec<8,uint32_t>>(

71

_mm256_i32gather_epi32((const int*)ptr, skvx::bit_pun<__m256i>(ix), 4));

72

#else

73

// Portable version... sometimes I don't trust vpgatherdd.

74

return skvx::Vec<8,uint32_t>{

75

ptr[ix[0]], ptr[ix[1]], ptr[ix[2]], ptr[ix[3]],

76

ptr[ix[4]], ptr[ix[5]], ptr[ix[6]], ptr[ix[7]],

};

#endif

};

Mike Klein

2019-09-26 15:34:34 -0500

[diff] [blame]

81

// Gather the 32 32-bit pixels that we'll bilerp into our 8 output pixels.

Mike Klein

666e4da

2019-09-27 11:21:20 -0500

[diff] [blame]

82

skvx::Vec<8,uint32_t> tl = gather(row0, x0), tr = gather(row0, x1),

83

bl = gather(row1, x0), br = gather(row1, x1);

Mike Klein

2019-09-26 15:34:34 -0500

[diff] [blame]

84

Mike Klein

ac353cb

2019-09-27 12:58:01 -0500

[diff] [blame]

85

#if 1

86

// We'll use _mm256_maddubs_epi16() to lerp much like in the SSSE3 code.

87

auto lerp_x = [&](skvx::Vec<8,uint32_t> L, skvx::Vec<8,uint32_t> R) {

88

__m256i l = skvx::bit_pun<__m256i>(L),

89

r = skvx::bit_pun<__m256i>(R),

90

wr = skvx::bit_pun<__m256i>(wx),

91

wl = _mm256_sub_epi8(_mm256_set1_epi8(16), wr);

92

93

// Interlace l,r bytewise and line them up with their weights, then lerp.

94

__m256i lo = _mm256_maddubs_epi16(_mm256_unpacklo_epi8( l, r),

95

_mm256_unpacklo_epi8(wl,wr));

96

__m256i hi = _mm256_maddubs_epi16(_mm256_unpackhi_epi8( l, r),

97

_mm256_unpackhi_epi8(wl,wr));

98

99

// Those _mm256_unpack??_epi8() calls left us in a bit of an odd order:

100

//

101

// if l = a b c d | e f g h

102

// and r = A B C D | E F G H

103

//

104

// then lo = a A b B | e E f F (low half of each input)

105

// and hi = c C d D | g G h H (high half of each input)

106

//

107

// To get everything back in original order we need to transpose that.

108

__m256i abcd = _mm256_permute2x128_si256(lo, hi, 0x20),

109

efgh = _mm256_permute2x128_si256(lo, hi, 0x31);

110

111

return skvx::join(skvx::bit_pun<skvx::Vec<16,uint16_t>>(abcd),

112

skvx::bit_pun<skvx::Vec<16,uint16_t>>(efgh));

113

};

114

115

skvx::Vec<32, uint16_t> top = lerp_x(tl, tr),

116

bot = lerp_x(bl, br),

117

sum = 16*top + (bot-top)*wy;

118

#else

Mike Klein

2019-09-26 15:34:34 -0500

[diff] [blame]

119

// Treat 32-bit pixels as 4 8-bit values, and expand to 16-bit for room to multiply.

120

auto to_16x4 = [](auto v) -> skvx::Vec<32, uint16_t> {

121

return skvx::cast<uint16_t>(skvx::bit_pun<skvx::Vec<32, uint8_t>>(v));

122

};

123

124

// Sum up weighted sample pixels. The naive, redundant math would be,

125

//

126

// sum = tl * (16-wy) * (16-wx)

127

// + bl * ( wy) * (16-wx)

128

// + tr * (16-wy) * ( wx)

129

// + br * ( wy) * ( wx)

130

//

131

// But we refactor to eliminate a bunch of those common factors.

132

auto lerp = [](auto lo, auto hi, auto w) {

133

return 16*lo + (hi-lo)*w;

134

};

135

skvx::Vec<32, uint16_t> sum = lerp(lerp(to_16x4(tl), to_16x4(bl), wy),

136

lerp(to_16x4(tr), to_16x4(br), wy), to_16x4(wx));

Mike Klein

ac353cb

2019-09-27 12:58:01 -0500

[diff] [blame]

137

#endif

Mike Klein

2019-09-26 15:34:34 -0500

[diff] [blame]

138

139

// Get back to [0,255] by dividing by maximum weight 16x16 = 256.

140

sum >>= 8;

141

jiepan

e947efb

2020-05-07 14:09:05 +0800

[diff] [blame]

142

// Scale by alpha if needed.

143

if(s.fAlphaScale < 256) {

144

sum *= s.fAlphaScale;

145

sum >>= 8;

146

}

Mike Klein

2019-09-26 15:34:34 -0500

[diff] [blame]

147

148

// Pack back to 8-bit channels, undoing to_16x4().

149

return skvx::bit_pun<skvx::Vec<8,uint32_t>>(skvx::cast<uint8_t>(sum));

};

while (count >= 8) {

bilerp(skvx::Vec<8,uint32_t>::Load(xy)).store(colors);

xy += 8;

colors += 8;

count -= 8;

}

if (count > 0) {

__m256i active = skvx::bit_pun<__m256i>( count > skvx::Vec<8,int>{0,1,2,3, 4,5,6,7} ),

160

coords = _mm256_maskload_epi32((const int*)xy, active),

161

pixels;

162

163

bilerp(skvx::bit_pun<skvx::Vec<8,uint32_t>>(coords)).store(&pixels);

164

_mm256_maskstore_epi32((int*)colors, active, pixels);

165

166

sk_msan_mark_initialized(colors, colors+count,

167

"MSAN still doesn't understand AVX2 mask loads and stores.");

}

}

#elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

172

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

173

/*not static*/ inline

174

void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,

175

const uint32_t* xy, int count, uint32_t* colors) {

176

SkASSERT(count > 0 && colors != nullptr);

Mike Reed

2020-11-24 16:04:05 -0500

[diff] [blame]

177

SkASSERT(s.fBilerp);

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

178

SkASSERT(kN32_SkColorType == s.fPixmap.colorType());

Mike Klein

2019-09-25 13:56:44 -0500

[diff] [blame]

179

SkASSERT(s.fAlphaScale <= 256);

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

180

Mike Klein

2019-09-25 13:56:44 -0500

[diff] [blame]

181

// interpolate_in_x() is the crux of the SSSE3 implementation,

182

// interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16().

183

auto interpolate_in_x = [](uint32_t A0, uint32_t A1,

184

uint32_t B0, uint32_t B1,

185

__m128i interlaced_x_weights) {

186

// _mm_maddubs_epi16() is a little idiosyncratic, but great as the core of a lerp.

187

//

188

// It takes two arguments interlaced byte-wise:

189

// - first arg: [ l,r, ... 7 more pairs of unsigned 8-bit values ...]

190

// - second arg: [ w,W, ... 7 more pairs of signed 8-bit values ...]

191

// and returns 8 signed 16-bit values: [ l*w + r*W, ... 7 more ... ].

192

//

193

// That's why we go to all this trouble to make interlaced_x_weights,

194

// and here we're about to interlace A0 with A1 and B0 with B1 to match.

195

//

196

// Our interlaced_x_weights are all in [0,16], and so we need not worry about

197

// the signedness of that input nor about the signedness of the output.

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

198

Mike Klein

2019-09-25 13:56:44 -0500

[diff] [blame]

199

__m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),

200

interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));

201

202

return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),

203

interlaced_x_weights);

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

204

};

205

Mike Klein

2019-09-25 13:56:44 -0500

[diff] [blame]

206

// Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.

207

// Returns two pixels, with each color channel in a 16-bit lane of the __m128i.

208

auto interpolate_in_x_and_y = [&](uint32_t A0, uint32_t A1,

209

uint32_t A2, uint32_t A3,

210

uint32_t B0, uint32_t B1,

211

uint32_t B2, uint32_t B3,

212

__m128i interlaced_x_weights,

213

int wy) {

214

// Interpolate each row in X, leaving 16-bit lanes scaled by interlaced_x_weights.

215

__m128i top = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),

216

bot = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);

217

218

// Interpolate in Y. As in the SSE2 code, we calculate top*(16-wy) + bot*wy

219

// as 16*top + (bot-top)*wy to save a multiply.

220

__m128i px = _mm_add_epi16(_mm_slli_epi16(top, 4),

221

_mm_mullo_epi16(_mm_sub_epi16(bot, top),

222

_mm_set1_epi16(wy)));

223

224

// Scale down by total max weight 16x16 = 256.

225

px = _mm_srli_epi16(px, 8);

226

227

// Scale by alpha if needed.

228

if (s.fAlphaScale < 256) {

229

px = _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(s.fAlphaScale)), 8);

}

return px;

};

// We're in _DX mode here, so we're only varying in X.

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

235

// That means the first entry of xy is our constant pair of Y coordinates and weight in Y.

236

// All the other entries in xy will be pairs of X coordinates and the X weight.

237

int y0, y1, wy;

238

decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

239

240

auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),

241

row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());

242

243

while (count >= 4) {

Mike Klein

2019-09-25 13:56:44 -0500

[diff] [blame]

244

// We can really get going, loading 4 X-pairs at a time to produce 4 output pixels.

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

245

int x0[4],

246

x1[4];

247

__m128i wx;

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

248

Mike Klein

2019-09-25 13:56:44 -0500

[diff] [blame]

249

// decode_packed_coordinates_and_weight(), 4x.

250

__m128i packed = _mm_loadu_si128((const __m128i*)xy);

251

_mm_storeu_si128((__m128i*)x0, _mm_srli_epi32(packed, 18));

252

_mm_storeu_si128((__m128i*)x1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff)));

253

wx = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf)); // [0,15]

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

254

Mike Klein

2019-09-25 13:56:44 -0500

[diff] [blame]

255

// Splat each x weight 4x (for each color channel) as wr for pixels on the right at x1,

256

// and sixteen minus that as wl for pixels on the left at x0.

257

__m128i wr = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),

258

wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);

259

260

// We need to interlace wl and wr for _mm_maddubs_epi16().

261

__m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wl,wr),

262

interlaced_x_weights_CD = _mm_unpackhi_epi8(wl,wr);

263

264

enum { A,B,C,D };

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

265

266

// interpolate_in_x_and_y() can produce two output pixels (A and B) at a time

267

// from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each.

Mike Klein

2019-09-25 13:56:44 -0500

[diff] [blame]

268

__m128i AB = interpolate_in_x_and_y(row0[x0[A]], row0[x1[A]],

269

row1[x0[A]], row1[x1[A]],

270

row0[x0[B]], row0[x1[B]],

271

row1[x0[B]], row1[x1[B]],

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

272

interlaced_x_weights_AB, wy);

273

274

// Once more with the other half of the x-weights for two more pixels C,D.

Mike Klein

2019-09-25 13:56:44 -0500

[diff] [blame]

275

__m128i CD = interpolate_in_x_and_y(row0[x0[C]], row0[x1[C]],

276

row1[x0[C]], row1[x1[C]],

277

row0[x0[D]], row0[x1[D]],

278

row1[x0[D]], row1[x1[D]],

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

279

interlaced_x_weights_CD, wy);

280

281

// Scale by alpha, pack back together to 8-bit lanes, and write out four pixels!

Mike Klein

2019-09-25 13:56:44 -0500

[diff] [blame]

282

_mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(AB, CD));

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

xy += 4;

colors += 4;

count -= 4;

}

while (count --> 0) {

289

// This is exactly the same flow as the count >= 4 loop above, but writing one pixel.

290

int x0, x1, wx;

291

decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

292

Mike Klein

2019-09-25 13:56:44 -0500

[diff] [blame]

293

// As above, splat out wx four times as wr, and sixteen minus that as wl.

294

__m128i wr = _mm_set1_epi8(wx), // This splats it out 16 times, but that's fine.

295

wl = _mm_sub_epi8(_mm_set1_epi8(16), wr);

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

296

Mike Klein

2019-09-25 13:56:44 -0500

[diff] [blame]

297

__m128i interlaced_x_weights = _mm_unpacklo_epi8(wl, wr);

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

298

299

__m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],

300

row1[x0], row1[x1],

301

0, 0,

302

0, 0,

Mike Klein

2019-09-25 13:56:44 -0500

[diff] [blame]

303

interlaced_x_weights, wy);

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

304

Mike Klein

2019-09-25 13:56:44 -0500

[diff] [blame]

305

*colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(A, _mm_setzero_si128()));

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

}

}

Mike Klein

2018-11-16 16:44:10 -0500

[diff] [blame]

310

#elif 1 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

311

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

312

/*not static*/ inline

313

void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,

314

const uint32_t* xy, int count, uint32_t* colors) {

315

SkASSERT(count > 0 && colors != nullptr);

Mike Reed

2020-11-24 16:04:05 -0500

[diff] [blame]

316

SkASSERT(s.fBilerp);

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

317

SkASSERT(kN32_SkColorType == s.fPixmap.colorType());

318

SkASSERT(s.fAlphaScale <= 256);

319

Mike Klein

2018-11-16 16:44:10 -0500

[diff] [blame]

320

int y0, y1, wy;

321

decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

322

Mike Klein

2018-11-16 16:44:10 -0500

[diff] [blame]

323

auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),

324

row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

325

Mike Klein

2018-11-16 16:44:10 -0500

[diff] [blame]

326

// We'll put one pixel in the low 4 16-bit lanes to line up with wy,

327

// and another in the upper 4 16-bit lanes to line up with 16 - wy.

Mike Klein

0ec56fc

2019-09-25 11:38:46 -0500

[diff] [blame]

328

const __m128i allY = _mm_unpacklo_epi64(_mm_set1_epi16( wy), // Bottom pixel goes here.

329

_mm_set1_epi16(16-wy)); // Top pixel goes here.

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

330

Mike Klein

2018-11-16 16:44:10 -0500

[diff] [blame]

331

while (count --> 0) {

332

int x0, x1, wx;

333

decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

334

Mike Klein

0ec56fc

2019-09-25 11:38:46 -0500

[diff] [blame]

335

// Load the 4 pixels we're interpolating, in this grid:

336

// | tl tr |

337

// | bl br |

338

const __m128i tl = _mm_cvtsi32_si128(row0[x0]), tr = _mm_cvtsi32_si128(row0[x1]),

339

bl = _mm_cvtsi32_si128(row1[x0]), br = _mm_cvtsi32_si128(row1[x1]);

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

340

Mike Klein

2019-09-25 12:14:28 -0500

[diff] [blame]

341

// We want to calculate a sum of 4 pixels weighted in two directions:

342

//

343

// sum = tl * (16-wy) * (16-wx)

344

// + bl * ( wy) * (16-wx)

345

// + tr * (16-wy) * ( wx)

346

// + br * ( wy) * ( wx)

347

//

348

// (Notice top --> 16-wy, bottom --> wy, left --> 16-wx, right --> wx.)

349

//

350

// We've already prepared allY as a vector containing [wy, 16-wy] as a way

351

// to apply those y-direction weights. So we'll start on the x-direction

352

// first, grouping into left and right halves, lined up with allY:

//

// L = [bl, tl]

// R = [br, tr]

//

// sum = horizontalSum( allY * (L*(16-wx) + R*wx) )

358

//

359

// Rewriting that one more step, we can replace a multiply with a shift:

360

//

361

// sum = horizontalSum( allY * (16*L + (R-L)*wx) )

362

//

363

// That's how we'll actually do this math.

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

364

Mike Klein

2019-09-25 12:14:28 -0500

[diff] [blame]

365

__m128i L = _mm_unpacklo_epi8(_mm_unpacklo_epi32(bl, tl), _mm_setzero_si128()),

366

R = _mm_unpacklo_epi8(_mm_unpacklo_epi32(br, tr), _mm_setzero_si128());

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

367

Mike Klein

2019-09-25 12:14:28 -0500

[diff] [blame]

368

__m128i inner = _mm_add_epi16(_mm_slli_epi16(L, 4),

369

_mm_mullo_epi16(_mm_sub_epi16(R,L), _mm_set1_epi16(wx)));

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

370

Mike Klein

2019-09-25 12:14:28 -0500

[diff] [blame]

371

__m128i sum_in_x = _mm_mullo_epi16(inner, allY);

372

373

// sum = horizontalSum( ... )

Mike Klein

0ec56fc

2019-09-25 11:38:46 -0500

[diff] [blame]

374

__m128i sum = _mm_add_epi16(sum_in_x, _mm_srli_si128(sum_in_x, 8));

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

375

Mike Klein

2018-11-16 16:44:10 -0500

[diff] [blame]

376

// Get back to [0,255] by dividing by maximum weight 16x16 = 256.

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

377

sum = _mm_srli_epi16(sum, 8);

378

Mike Klein

2018-11-16 16:44:10 -0500

[diff] [blame]

379

if (s.fAlphaScale < 256) {

380

// Scale by alpha, which is in [0,256].

381

sum = _mm_mullo_epi16(sum, _mm_set1_epi16(s.fAlphaScale));

382

sum = _mm_srli_epi16(sum, 8);

383

}

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

384

Mike Klein

2018-11-16 16:44:10 -0500

[diff] [blame]

385

// Pack back into 8-bit values and store.

386

*colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(sum, _mm_setzero_si128()));

387

}

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

}

#else

// The NEON code only actually differs from the portable code in the

393

// filtering step after we've loaded all four pixels we want to bilerp.

394

395

#if defined(SK_ARM_HAS_NEON)

396

static void filter_and_scale_by_alpha(unsigned x, unsigned y,

397

SkPMColor a00, SkPMColor a01,

398

SkPMColor a10, SkPMColor a11,

399

SkPMColor *dst,

400

uint16_t scale) {

401

uint8x8_t vy, vconst16_8, v16_y, vres;

402

uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;

403

uint32x2_t va0, va1;

404

uint16x8_t tmp1, tmp2;

405

406

vy = vdup_n_u8(y); // duplicate y into vy

407

vconst16_8 = vmov_n_u8(16); // set up constant in vconst16_8

408

v16_y = vsub_u8(vconst16_8, vy); // v16_y = 16-y

409

410

va0 = vdup_n_u32(a00); // duplicate a00

411

va1 = vdup_n_u32(a10); // duplicate a10

412

va0 = vset_lane_u32(a01, va0, 1); // set top to a01

413

va1 = vset_lane_u32(a11, va1, 1); // set top to a11

414

415

tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)

416

tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy); // tmp2 = [a11|a10] * y

417

418

vx = vdup_n_u16(x); // duplicate x into vx

419

vconst16_16 = vmov_n_u16(16); // set up constant in vconst16_16

420

v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x

421

422

tmp = vmul_u16(vget_high_u16(tmp1), vx); // tmp = a01 * x

423

tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx); // tmp += a11 * x

424

tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)

425

tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)

426

Mike Klein

2018-11-16 16:44:10 -0500

[diff] [blame]

427

if (scale < 256) {

428

vscale = vdup_n_u16(scale); // duplicate scale

429

tmp = vshr_n_u16(tmp, 8); // shift down result by 8

430

tmp = vmul_u16(tmp, vscale); // multiply result by scale

431

}

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

432

Jian Cai

9e0afb7

2019-12-18 14:42:49 -0800

[diff] [blame]

433

vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16((uint64_t)0)), 8); // shift down result by 8

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

434

vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result

435

}

436

#else

437

static void filter_and_scale_by_alpha(unsigned x, unsigned y,

438

SkPMColor a00, SkPMColor a01,

439

SkPMColor a10, SkPMColor a11,

440

SkPMColor* dstColor,

441

unsigned alphaScale) {

442

SkASSERT((unsigned)x <= 0xF);

443

SkASSERT((unsigned)y <= 0xF);

444

SkASSERT(alphaScale <= 256);

445

446

int xy = x * y;

447

const uint32_t mask = 0xFF00FF;

448

449

int scale = 256 - 16*y - 16*x + xy;

450

uint32_t lo = (a00 & mask) * scale;

451

uint32_t hi = ((a00 >> 8) & mask) * scale;

452

453

scale = 16*x - xy;

454

lo += (a01 & mask) * scale;

455

hi += ((a01 >> 8) & mask) * scale;

456

457

scale = 16*y - xy;

458

lo += (a10 & mask) * scale;

459

hi += ((a10 >> 8) & mask) * scale;

460

461

lo += (a11 & mask) * xy;

462

hi += ((a11 >> 8) & mask) * xy;

463

Mike Klein

2018-11-16 16:44:10 -0500

[diff] [blame]

464

if (alphaScale < 256) {

465

lo = ((lo >> 8) & mask) * alphaScale;

466

hi = ((hi >> 8) & mask) * alphaScale;

467

}

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

468

469

*dstColor = ((lo >> 8) & mask) | (hi & ~mask);

}

#endif

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

474

/*not static*/ inline

475

void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,

476

const uint32_t* xy, int count, SkPMColor* colors) {

477

SkASSERT(count > 0 && colors != nullptr);

Mike Reed

2020-11-24 16:04:05 -0500

[diff] [blame]

478

SkASSERT(s.fBilerp);

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

479

SkASSERT(4 == s.fPixmap.info().bytesPerPixel());

480

SkASSERT(s.fAlphaScale <= 256);

481

Mike Klein

2018-11-16 16:44:10 -0500

[diff] [blame]

482

int y0, y1, wy;

483

decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

484

Mike Klein

2018-11-16 16:44:10 -0500

[diff] [blame]

485

auto row0 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes() ),

486

row1 = (const uint32_t*)( (const char*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes() );

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

487

Mike Klein

2018-11-16 16:44:10 -0500

[diff] [blame]

488

while (count --> 0) {

489

int x0, x1, wx;

490

decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

491

Mike Klein

2018-11-16 16:44:10 -0500

[diff] [blame]

492

filter_and_scale_by_alpha(wx, wy,

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

493

row0[x0], row0[x1],

494

row1[x0], row1[x1],

Mike Klein

2018-11-16 16:44:10 -0500

[diff] [blame]

495

colors++,

496

s.fAlphaScale);

497

}

Mike Klein

2018-11-16 12:22:05 -0500

[diff] [blame]

}

#endif

Mike Klein

2019-10-21 13:10:07 -0500

[diff] [blame]

502

#if defined(SK_ARM_HAS_NEON)

503

/*not static*/ inline

504

void S32_alpha_D32_filter_DXDY(const SkBitmapProcState& s,

505

const uint32_t* xy, int count, SkPMColor* colors) {

506

SkASSERT(count > 0 && colors != nullptr);

Mike Reed

2020-11-24 16:04:05 -0500

[diff] [blame]

507

SkASSERT(s.fBilerp);

Mike Klein

37bc8f9

2019-10-21 13:10:07 -0500

[diff] [blame]

508

SkASSERT(4 == s.fPixmap.info().bytesPerPixel());

509

SkASSERT(s.fAlphaScale <= 256);

510

511

auto src = (const char*)s.fPixmap.addr();

512

size_t rb = s.fPixmap.rowBytes();

513

514

while (count --> 0) {

515

int y0, y1, wy,

516

x0, x1, wx;

517

decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

518

decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

519

520

auto row0 = (const uint32_t*)(src + y0*rb),

521

row1 = (const uint32_t*)(src + y1*rb);

522

523

filter_and_scale_by_alpha(wx, wy,

row0[x0], row0[x1],

row1[x0], row1[x1],

colors++,

s.fAlphaScale);

}

}

#else

// It's not yet clear whether it's worthwhile specializing for SSE2/SSSE3/AVX2.

532

constexpr static void (*S32_alpha_D32_filter_DXDY)(const SkBitmapProcState&,

533

const uint32_t*, int, SkPMColor*) = nullptr;

534

#endif

535

Mike Klein