/*
 * Copyright 2018 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
7
8#ifndef SkBitmapProcState_opts_DEFINED
9#define SkBitmapProcState_opts_DEFINED
10
11#include "SkBitmapProcState.h"
12
13// SkBitmapProcState optimized Shader, Sample, or Matrix procs.
14//
15// Only S32_alpha_D32_filter_DX exploits instructions beyond
16// our common baseline SSE2/NEON instruction sets, so that's
17// all that lives here.
18//
19// The rest are scattershot at the moment but I want to get them
20// all migrated to be normal code inside SkBitmapProcState.cpp.
21
22#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
23 #include <immintrin.h>
24#elif defined(SK_ARM_HAS_NEON)
25 #include <arm_neon.h>
26#endif
27
28namespace SK_OPTS_NS {
29
30#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
31 // This same basic packing scheme is used throughout the file.
32 static void decode_packed_coordinates_and_weight(uint32_t packed, int* v0, int* v1, int* w) {
33 // The top 14 bits are the integer coordinate x0 or y0.
34 *v0 = packed >> 18;
35
36 // The bottom 14 bits are the integer coordinate x1 or y1.
37 *v1 = packed & 0x3fff;
38
39 // The middle 4 bits are the interpolating factor between the two, i.e. the weight for v1.
40 *w = (packed >> 14) & 0xf;
41 }
42
    // As above, 4x: decode four packed [v0:14|w:4|v1:14] words at once.
    // v0[] and v1[] receive the four integer coordinates; *w holds the four
    // 4-bit weights, each still sitting in its own 32-bit lane.
    static void decode_packed_coordinates_and_weight(__m128i packed,
                                                     int v0[4], int v1[4], __m128i* w) {
        _mm_storeu_si128((__m128i*)v0, _mm_srli_epi32(packed, 18));                      // top 14 bits
        _mm_storeu_si128((__m128i*)v1, _mm_and_si128 (packed, _mm_set1_epi32(0x3fff)));  // bottom 14 bits
        *w = _mm_and_si128(_mm_srli_epi32(packed, 14), _mm_set1_epi32(0xf));             // middle 4 bits
    }
50
    // This is the crux of the SSSE3 implementation,
    // interpolating in X for up to two output pixels (A and B) using _mm_maddubs_epi16().
    // The result's eight 16-bit lanes hold pixel A's four channels then pixel B's,
    // each scaled up by the [0,16] x-weights (i.e. not yet divided back down).
    static inline __m128i interpolate_in_x(uint32_t A0, uint32_t A1,
                                           uint32_t B0, uint32_t B1,
                                           const __m128i& interlaced_x_weights) {
        // _mm_maddubs_epi16() is a little idiosyncratic, but very helpful as the core of a lerp.
        //
        // It takes two arguments interlaced byte-wise:
        //    - first  arg: [ x,y, ... 7 more pairs of 8-bit values ...]
        //    - second arg: [ z,w, ... 7 more pairs of 8-bit values ...]
        // and returns 8 16-bit values: [ x*z + y*w, ... 7 more 16-bit values ... ].
        //
        // That's why we go to all this trouble to make interlaced_x_weights,
        // and here we're interlacing A0 with A1, B0 with B1 to match.

        __m128i interlaced_A = _mm_unpacklo_epi8(_mm_cvtsi32_si128(A0), _mm_cvtsi32_si128(A1)),
                interlaced_B = _mm_unpacklo_epi8(_mm_cvtsi32_si128(B0), _mm_cvtsi32_si128(B1));

        // Pack A's interlaced bytes in the low 64 bits, B's in the high 64, then
        // one madd does both horizontal lerps at once.
        return _mm_maddubs_epi16(_mm_unpacklo_epi64(interlaced_A, interlaced_B),
                                 interlaced_x_weights);
    }
72
    // Interpolate {A0..A3} --> output pixel A, and {B0..B3} --> output pixel B.
    // {A0,A1} are pixel A's top (row0) neighbors and {A2,A3} its bottom (row1)
    // neighbors; likewise for B. Returns two pixels, with each channel in a
    // 16-bit lane of the __m128i (A in the low four lanes, B in the high four).
    static inline __m128i interpolate_in_x_and_y(uint32_t A0, uint32_t A1,
                                                 uint32_t A2, uint32_t A3,
                                                 uint32_t B0, uint32_t B1,
                                                 uint32_t B2, uint32_t B3,
                                                 const __m128i& interlaced_x_weights,
                                                 int wy) {
        // The stored Y weight wy is for y1, and y0 gets a weight 16-wy.
        const __m128i wy1 = _mm_set1_epi16(wy),
                      wy0 = _mm_sub_epi16(_mm_set1_epi16(16), wy1);

        // First interpolate in X,
        // leaving the values in 16-bit lanes scaled up by those [0,16] interlaced_x_weights.
        __m128i row0 = interpolate_in_x(A0,A1, B0,B1, interlaced_x_weights),
                row1 = interpolate_in_x(A2,A3, B2,B3, interlaced_x_weights);

        // Interpolate in Y across the two rows,
        // then scale everything down by the maximum total weight 16x16 = 256 (>> 8).
        return _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(row0, wy0),
                                            _mm_mullo_epi16(row1, wy1)), 8);
    }
95
    // Bilinear-filter a run of N32 pixels along one scanline (fixed Y, varying X),
    // then scale each result by s.fAlphaScale. SSSE3 version: produces four output
    // pixels per loop iteration via _mm_maddubs_epi16().
    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, uint32_t* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());

        int alpha = s.fAlphaScale;

        // Return (px * s.fAlphaScale) / 256. (s.fAlphaScale is in [0,256].)
        auto scale_by_alpha = [alpha](const __m128i& px) {
            return alpha == 256 ? px
                                : _mm_srli_epi16(_mm_mullo_epi16(px, _mm_set1_epi16(alpha)), 8);
        };

        // We're in _DX_ mode here, so we're only varying in X.
        // That means the first entry of xy is our constant pair of Y coordinates and weight in Y.
        // All the other entries in xy will be pairs of X coordinates and the X weight.
        int y0, y1, wy;
        decode_packed_coordinates_and_weight(*xy++, &y0, &y1, &wy);

        auto row0 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y0 * s.fPixmap.rowBytes()),
             row1 = (const uint32_t*)((const uint8_t*)s.fPixmap.addr() + y1 * s.fPixmap.rowBytes());

        while (count >= 4) {
            // We can really get going, loading 4 X pairs at a time to produce 4 output pixels.
            const __m128i xx = _mm_loadu_si128((const __m128i*)xy);

            int x0[4],
                x1[4];
            __m128i wx;
            decode_packed_coordinates_and_weight(xx, x0, x1, &wx);

            // Splat out each x weight wx four times (one for each pixel channel) as wx1,
            // and sixteen minus that as the weight for x0, wx0.
            __m128i wx1 = _mm_shuffle_epi8(wx, _mm_setr_epi8(0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12)),
                    wx0 = _mm_sub_epi8(_mm_set1_epi8(16), wx1);

            // We need to interlace wx0 and wx1 for _mm_maddubs_epi16().
            __m128i interlaced_x_weights_AB = _mm_unpacklo_epi8(wx0,wx1),
                    interlaced_x_weights_CD = _mm_unpackhi_epi8(wx0,wx1);

            // interpolate_in_x_and_y() can produce two output pixels (A and B) at a time
            // from eight input pixels {A0..A3} and {B0..B3}, arranged in a 2x2 grid for each.
            __m128i AB = interpolate_in_x_and_y(row0[x0[0]], row0[x1[0]],
                                                row1[x0[0]], row1[x1[0]],
                                                row0[x0[1]], row0[x1[1]],
                                                row1[x0[1]], row1[x1[1]],
                                                interlaced_x_weights_AB, wy);

            // Once more with the other half of the x-weights for two more pixels C,D.
            __m128i CD = interpolate_in_x_and_y(row0[x0[2]], row0[x1[2]],
                                                row1[x0[2]], row1[x1[2]],
                                                row0[x0[3]], row0[x1[3]],
                                                row1[x0[3]], row1[x1[3]],
                                                interlaced_x_weights_CD, wy);

            // Scale by alpha, pack back together to 8-bit lanes, and write out four pixels!
            _mm_storeu_si128((__m128i*)colors, _mm_packus_epi16(scale_by_alpha(AB),
                                                                scale_by_alpha(CD)));
            xy += 4;
            colors += 4;
            count -= 4;
        }

        while (count --> 0) {
            // This is exactly the same flow as the count >= 4 loop above, but writing one pixel.
            int x0, x1, wx;
            decode_packed_coordinates_and_weight(*xy++, &x0, &x1, &wx);

            // As above, splat out wx four times as wx1, and sixteen minus that as wx0.
            __m128i wx1 = _mm_set1_epi8(wx),   // This splats it out 16 times, but that's fine.
                    wx0 = _mm_sub_epi8(_mm_set1_epi8(16), wx1);

            __m128i interlaced_x_weights_A = _mm_unpacklo_epi8(wx0, wx1);

            // Interpolate pixel A only; the B slot is fed zeroes and its result ignored.
            __m128i A = interpolate_in_x_and_y(row0[x0], row0[x1],
                                               row1[x0], row1[x1],
                                               0, 0,
                                               0, 0,
                                               interlaced_x_weights_A, wy);

            // Scale by alpha, pack down to 8-bit, and store just the low 32-bit pixel.
            *colors++ = _mm_cvtsi128_si32(_mm_packus_epi16(scale_by_alpha(A), _mm_setzero_si128()));
        }
    }
181
182
183#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
184
185 // TODO(mtklein): clean up this code, use decode_packed_coordinates_and_weight(), etc.
186
    // Bilinear filter + alpha scale for a fixed-Y scanline, SSE2 version: one output
    // pixel per iteration. The first *xy entry packs [y0:14 | subY:4 | y1:14]; every
    // following entry packs [x0:14 | subX:4 | x1:14]. Lane diagrams below are written
    // high-to-low across the eight 16-bit lanes.
    /*not static*/ inline
    void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                                 const uint32_t* xy, int count, uint32_t* colors) {
        SkASSERT(count > 0 && colors != nullptr);
        SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
        SkASSERT(kN32_SkColorType == s.fPixmap.colorType());
        SkASSERT(s.fAlphaScale <= 256);

        const char* srcAddr = static_cast<const char*>(s.fPixmap.addr());
        size_t rb = s.fPixmap.rowBytes();
        uint32_t XY = *xy++;
        unsigned y0 = XY >> 14;   // y0 still carries subY in its low 4 bits.
        const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
        const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
        unsigned subY = y0 & 0xF;

        // ( 0, 0, 0, 0, 0, 0, 0, 16)
        __m128i sixteen = _mm_cvtsi32_si128(16);

        // ( 0, 0, 0, 0, 16, 16, 16, 16)
        sixteen = _mm_shufflelo_epi16(sixteen, 0);

        // ( 0, 0, 0, 0, 0, 0, 0, y)
        __m128i allY = _mm_cvtsi32_si128(subY);

        // ( 0, 0, 0, 0, y, y, y, y)
        allY = _mm_shufflelo_epi16(allY, 0);

        // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)
        __m128i negY = _mm_sub_epi16(sixteen, allY);

        // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
        allY = _mm_unpacklo_epi64(allY, negY);

        // (16, 16, 16, 16, 16, 16, 16, 16 )
        sixteen = _mm_shuffle_epi32(sixteen, 0);

        // ( 0, 0, 0, 0, 0, 0, 0, 0)
        __m128i zero = _mm_setzero_si128();

        // ( alpha, alpha, alpha, alpha, alpha, alpha, alpha, alpha )
        __m128i alpha = _mm_set1_epi16(s.fAlphaScale);

        do {
            uint32_t XX = *xy++;    // x0:14 | 4 | x1:14
            unsigned x0 = XX >> 18;
            unsigned x1 = XX & 0x3FFF;

            // (0, 0, 0, 0, 0, 0, 0, x)
            __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);

            // (0, 0, 0, 0, x, x, x, x)
            allX = _mm_shufflelo_epi16(allX, 0);

            // (x, x, x, x, x, x, x, x)
            allX = _mm_shuffle_epi32(allX, 0);

            // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
            __m128i negX = _mm_sub_epi16(sixteen, allX);

            // Load 4 samples (pixels): the 2x2 neighborhood being blended.
            __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
            __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
            __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
            __m128i a11 = _mm_cvtsi32_si128(row1[x1]);

            // (0, 0, a00, a10)
            __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);

            // Expand to 16 bits per component.
            a00a10 = _mm_unpacklo_epi8(a00a10, zero);

            // ((a00 * (16-y)), (a10 * y)).
            a00a10 = _mm_mullo_epi16(a00a10, allY);

            // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
            a00a10 = _mm_mullo_epi16(a00a10, negX);

            // (0, 0, a01, a11)
            __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);

            // Expand to 16 bits per component.
            a01a11 = _mm_unpacklo_epi8(a01a11, zero);

            // (a01 * (16-y)), (a11 * y)
            a01a11 = _mm_mullo_epi16(a01a11, allY);

            // (a01 * (16-y) * x), (a11 * y * x)
            a01a11 = _mm_mullo_epi16(a01a11, allX);

            // (a00*w00 + a01*w01, a10*w10 + a11*w11)
            __m128i sum = _mm_add_epi16(a00a10, a01a11);

            // Move the high 64 bits down so both halves can be summed:
            // (DC, a00*w00 + a01*w01)
            __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);

            // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
            sum = _mm_add_epi16(sum, shifted);

            // Divide each 16 bit component by 256 (the max total weight 16*16).
            sum = _mm_srli_epi16(sum, 8);

            // Multiply by alpha.
            sum = _mm_mullo_epi16(sum, alpha);

            // Divide each 16 bit component by 256.
            sum = _mm_srli_epi16(sum, 8);

            // Pack lower 4 16 bit values of sum into lower 4 bytes.
            sum = _mm_packus_epi16(sum, zero);

            // Extract low int and store.
            *colors++ = _mm_cvtsi128_si32(sum);
        } while (--count > 0);
    }
302
303#else
304
305 // The NEON code only actually differs from the portable code in the
306 // filtering step after we've loaded all four pixels we want to bilerp.
307
308 #if defined(SK_ARM_HAS_NEON)
        // NEON bilinear filter for one pixel: blend the 2x2 neighborhood
        // {a00,a01;a10,a11} using 4-bit fractional weights x,y, then scale the
        // result by `scale` (in [0,256]) and store the packed pixel to *dst.
        static void filter_and_scale_by_alpha(unsigned x, unsigned y,
                                              SkPMColor a00, SkPMColor a01,
                                              SkPMColor a10, SkPMColor a11,
                                              SkPMColor *dst,
                                              uint16_t scale) {
            uint8x8_t vy, vconst16_8, v16_y, vres;
            uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
            uint32x2_t va0, va1;
            uint16x8_t tmp1, tmp2;

            vy = vdup_n_u8(y);                // duplicate y into vy
            vconst16_8 = vmov_n_u8(16);       // set up constant in vconst16_8
            v16_y = vsub_u8(vconst16_8, vy);  // v16_y = 16-y

            va0 = vdup_n_u32(a00);            // duplicate a00
            va1 = vdup_n_u32(a10);            // duplicate a10
            va0 = vset_lane_u32(a01, va0, 1); // set top to a01
            va1 = vset_lane_u32(a11, va1, 1); // set top to a11

            // Widening multiplies do the Y lerp for both row pixels at once.
            tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
            tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy);    // tmp2 = [a11|a10] * y

            vx = vdup_n_u16(x);                // duplicate x into vx
            vconst16_16 = vmov_n_u16(16);      // set up constant in vconst16_16
            v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x

            // Accumulate the X lerp across the four weighted samples.
            tmp = vmul_u16(vget_high_u16(tmp1), vx);        // tmp = a01 * x
            tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx);   // tmp += a11 * x
            tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)
            tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)

            vscale = vdup_n_u16(scale);        // duplicate scale
            tmp = vshr_n_u16(tmp, 8);          // shift down result by 8 (max weight 16*16 = 256)
            tmp = vmul_u16(tmp, vscale);       // multiply result by scale

            // Narrowing shift packs each channel back to 8 bits.
            vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down result by 8
            vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);         // store result
        }
347 #else
348 static void filter_and_scale_by_alpha(unsigned x, unsigned y,
349 SkPMColor a00, SkPMColor a01,
350 SkPMColor a10, SkPMColor a11,
351 SkPMColor* dstColor,
352 unsigned alphaScale) {
353 SkASSERT((unsigned)x <= 0xF);
354 SkASSERT((unsigned)y <= 0xF);
355 SkASSERT(alphaScale <= 256);
356
357 int xy = x * y;
358 const uint32_t mask = 0xFF00FF;
359
360 int scale = 256 - 16*y - 16*x + xy;
361 uint32_t lo = (a00 & mask) * scale;
362 uint32_t hi = ((a00 >> 8) & mask) * scale;
363
364 scale = 16*x - xy;
365 lo += (a01 & mask) * scale;
366 hi += ((a01 >> 8) & mask) * scale;
367
368 scale = 16*y - xy;
369 lo += (a10 & mask) * scale;
370 hi += ((a10 >> 8) & mask) * scale;
371
372 lo += (a11 & mask) * xy;
373 hi += ((a11 >> 8) & mask) * xy;
374
375 // TODO: if (alphaScale < 256) ...
376 lo = ((lo >> 8) & mask) * alphaScale;
377 hi = ((hi >> 8) & mask) * alphaScale;
378
379 *dstColor = ((lo >> 8) & mask) | (hi & ~mask);
380 }
381 #endif
382
383
384 // TODO(mtklein): clean up this code, use decode_packed_coordinates_and_weight(), etc.
385
386 /*not static*/ inline
387 void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
388 const uint32_t* xy, int count, SkPMColor* colors) {
389 SkASSERT(count > 0 && colors != nullptr);
390 SkASSERT(s.fFilterQuality != kNone_SkFilterQuality);
391 SkASSERT(4 == s.fPixmap.info().bytesPerPixel());
392 SkASSERT(s.fAlphaScale <= 256);
393
394 unsigned alphaScale = s.fAlphaScale;
395
396 const char* srcAddr = (const char*)s.fPixmap.addr();
397 size_t rb = s.fPixmap.rowBytes();
398 unsigned subY;
399 const SkPMColor* row0;
400 const SkPMColor* row1;
401
402 // setup row ptrs and update proc_table
403 {
404 uint32_t XY = *xy++;
405 unsigned y0 = XY >> 14;
406 row0 = (const SkPMColor*)(srcAddr + (y0 >> 4) * rb);
407 row1 = (const SkPMColor*)(srcAddr + (XY & 0x3FFF) * rb);
408 subY = y0 & 0xF;
409 }
410
411 do {
412 uint32_t XX = *xy++; // x0:14 | 4 | x1:14
413 unsigned x0 = XX >> 14;
414 unsigned x1 = XX & 0x3FFF;
415 unsigned subX = x0 & 0xF;
416 x0 >>= 4;
417
418 filter_and_scale_by_alpha(subX, subY,
419 row0[x0], row0[x1],
420 row1[x0], row1[x1],
421 colors,
422 alphaScale);
423 colors += 1;
424
425 } while (--count != 0);
426 }
427
428#endif
429
430} // namespace SK_OPTS_NS
431
432#endif