/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

// This file is very similar to SkSplicer_stages.cpp, and you will want to read through that file
// first before trying to understand this one. We'll note only key differences here.

#include "SkSplicer_shared.h"
#include <string.h>

#if !defined(__clang__)
    #error This file is not like the rest of Skia. It must be compiled with clang.
#endif

#if defined(__aarch64__)
    #include <arm_neon.h>

    // In this file, F is a vector of SkFixed15.
    // See SkFixed15.h for notes on its various operations.
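    // (Recall SkFixed15 maps [0.0, 1.0] onto the 16-bit range [0, 32768], so 0x8000 is 1.0;
    //  + and - below saturate, and * is a rounded (x*y)>>15.)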
    struct F {
        using V = uint16_t __attribute__((ext_vector_type(8)));

        V vec;

        F(uint16x8_t v) : vec(v) {}
        operator V() const { return vec; }

        F() = default;
        F(uint16_t v) : vec(v) {}

        F operator+(F o) const { return vqaddq_u16(vec, o.vec); }
        F operator-(F o) const { return vqsubq_u16(vec, o.vec); }
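        // vqrdmulhq_s16() is a rounding doubling multiply-high, i.e. a rounded (x*y)>>15 on
        // signed lanes. vabsq_s16() repairs the sign when exactly one input is 0x8000 (1.0),
        // and the vsra adds (x&y)>>15: +1 only when both inputs are 0x8000, whose product the
        // signed multiply saturated to 0x7fff.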
        F operator*(F o) const {
            return vsraq_n_u16(vabsq_s16(vqrdmulhq_s16(vec, o.vec)),
                               vandq_s16(vec, o.vec), 15);
        }
        F operator>>(int k) const { return vec >> k; }
        F operator<<(int k) const { return vec << k; }
    };
    static F min(F a, F b) { return vminq_u16(a,b); }
    static F max(F a, F b) { return vmaxq_u16(a,b); }

#elif defined(__ARM_NEON__)
    #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
        #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfpv4, without -mthumb.
    #endif
    #include <arm_neon.h>

    struct F {
        using V = uint16_t __attribute__((ext_vector_type(4)));

        V vec;

        F(uint16x4_t v) : vec(v) {}
        operator V() const { return vec; }

        F() = default;
        F(uint16_t v) : vec(v) {}

        F operator+(F o) const { return vqadd_u16(vec, o.vec); }
        F operator-(F o) const { return vqsub_u16(vec, o.vec); }
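        // Same rounded-multiply trick as the aarch64 operator* above, just on 4 lanes.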
        F operator*(F o) const {
            return vsra_n_u16(vabs_s16(vqrdmulh_s16(vec, o.vec)),
                              vand_s16(vec, o.vec), 15);
        }
        F operator>>(int k) const { return vec >> k; }
        F operator<<(int k) const { return vec << k; }
    };
    static F min(F a, F b) { return vmin_u16(a,b); }
    static F max(F a, F b) { return vmax_u16(a,b); }

#else
    #if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
        #error On x86, compile with -mavx2 -mfma -mf16c.
    #endif
    #include <immintrin.h>

    struct F {
        using V = uint16_t __attribute__((ext_vector_type(16)));

        V vec;

        F(__m256i v) : vec(v) {}
        operator V() const { return vec; }

        F() = default;
        F(uint16_t v) : vec(v) {}

        F operator+(F o) const { return _mm256_adds_epu16(vec, o.vec); }
        F operator-(F o) const { return _mm256_subs_epu16(vec, o.vec); }
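        // _mm256_mulhrs_epi16() is a rounded (x*y)>>15 on signed lanes. The only input that
        // reads as negative is 0x8000 (1.0), so the abs repairs the sign of those products;
        // 0x8000*0x8000 comes out as -32768, which the abs leaves as 0x8000, exactly 1.0.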
        F operator*(F o) const { return _mm256_abs_epi16(_mm256_mulhrs_epi16(vec, o.vec)); }
        F operator>>(int k) const { return vec >> k; }
        F operator<<(int k) const { return vec << k; }
    };
    static F min(F a, F b) { return _mm256_min_epu16(a,b); }
    static F max(F a, F b) { return _mm256_max_epu16(a,b); }
#endif

// No platform actually supports FMA for SkFixed15.
// This fma() just makes it easier to port stages to lowp.
static F fma(F f, F m, F a) { return f*m+a; }

#if defined(__ARM_NEON__)
    #define C extern "C" __attribute__((pcs("aapcs-vfp")))
#else
    #define C extern "C"
#endif

// We use a set of constants suitable for SkFixed15 math.
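// As in SkSplicer_stages.cpp, stages read all constants through k instead of writing literals,
// so the compiled code shouldn't need its own constant pool and stays safe to splice elsewhere.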
using K = const SkSplicer_constants_lowp;
using Stage = void(size_t x, size_t limit, void* ctx, K* k, F,F,F,F, F,F,F,F);

// The armv7 aapcs-vfp calling convention makes us pass F::V instead of F if we want them in
// registers. This shouldn't affect performance or how you write STAGEs in any way.
C void done(size_t, size_t, void*, K*, F::V,F::V,F::V,F::V, F::V,F::V,F::V,F::V);

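// STAGE(name) defines the extern "C" entry point name##_lowp, which unpacks the F::V arguments
// into F, runs the stage body name##_k, then hands everything off to done().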
#define STAGE(name)                                                           \
    static void name##_k(size_t& x, size_t limit, void* ctx, K* k,            \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
    C void name##_lowp(size_t x, size_t limit, void* ctx, K* k,               \
                       F::V R, F::V G, F::V B, F::V A,                        \
                       F::V DR, F::V DG, F::V DB, F::V DA) {                  \
        F r = R, g = G, b = B, a = A, dr = DR, dg = DG, db = DB, da = DA;     \
        name##_k(x,limit,ctx,k, r,g,b,a, dr,dg,db,da);                        \
        done    (x,limit,ctx,k, r,g,b,a, dr,dg,db,da);                        \
    }                                                                         \
    static void name##_k(size_t& x, size_t limit, void* ctx, K* k,            \
                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)

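// inc_x steps x by one vector of pixels: sizeof(F)/sizeof(uint16_t) is 16 with AVX2,
// 8 on aarch64, and 4 on ARMv7 NEON.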
STAGE(inc_x) {
    x += sizeof(F) / sizeof(uint16_t);
}

STAGE(clear) {
    r = g = b = a = 0;
}

STAGE(plus_) {
    r = r + dr;
    g = g + dg;
    b = b + db;
    a = a + da;
}

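// srcover adds the destination scaled by what's left of the source's alpha: e.g. r += dr*(1-a).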
STAGE(srcover) {
    auto A = F(k->_1) - a;
    r = fma(dr, A, r);
    g = fma(dg, A, g);
    b = fma(db, A, b);
    a = fma(da, A, a);
}
STAGE(dstover) { srcover_k(x,limit,ctx,k, dr,dg,db,da, r,g,b,a); }

STAGE(clamp_1) {
    r = min(r, k->_1);
    g = min(g, k->_1);
    b = min(b, k->_1);
    a = min(a, k->_1);
}

STAGE(clamp_a) {
    a = min(a, k->_1);
    r = min(r, a);
    g = min(g, a);
    b = min(b, a);
}

STAGE(swap) {
    auto swap = [](F& v, F& dv) {
        auto tmp = v;
        v = dv;
        dv = tmp;
    };
    swap(r, dr);
    swap(g, dg);
    swap(b, db);
    swap(a, da);
}
STAGE(move_src_dst) {
    dr = r;
    dg = g;
    db = b;
    da = a;
}
STAGE(move_dst_src) {
    r = dr;
    g = dg;
    b = db;
    a = da;
}

STAGE(premul) {
    r = r * a;
    g = g * a;
    b = b * a;
}

STAGE(load_8888) {
    auto ptr = *(const uint32_t**)ctx + x;

#if defined(__aarch64__)
    auto to_fixed15 = [](uint8x8_t u8) {
        // u8 * (32768/255) == u8 * 128.50196... == u8*128 + u8/2 + (u8+1)>>8  (see SkFixed15.h)
        //
        // Here we do (u8*128 <rounding +> u8/2), which is the same as our canonical math for 0
        // and 255, and never off by more than 1 in between. Thanks to NEON, it's 2 instructions!
        auto u16 = vshll_n_u8(u8, 7);         // u16 = u8*128
        return vrsraq_n_u16(u16, u16, 8);     // u16 + u16/256, with rounding
    };

    uint8x8x4_t rgba = vld4_u8((const uint8_t*)ptr);
    r = to_fixed15(rgba.val[0]);
    g = to_fixed15(rgba.val[1]);
    b = to_fixed15(rgba.val[2]);
    a = to_fixed15(rgba.val[3]);

#elif defined(__ARM_NEON__)
    auto to_fixed15 = [](uint8x8_t u8) {
        // Same as aarch64, but only keeping the bottom 4 lanes.
        auto u16 = vshll_n_u8(u8, 7);
        return vget_low_u16(vrsraq_n_u16(u16, u16, 8));
    };

    // I can't get quite the code generation I want using vld4_lane_u8(),
    // so we're going to drop into assembly to do the loads. :/

    uint8x8_t R,G,B,A;
    asm("vld4.8 {%1[0],%2[0],%3[0],%4[0]}, [%0]!\n"
        "vld4.8 {%1[1],%2[1],%3[1],%4[1]}, [%0]!\n"
        "vld4.8 {%1[2],%2[2],%3[2],%4[2]}, [%0]!\n"
        "vld4.8 {%1[3],%2[3],%3[3],%4[3]}, [%0]!\n"
        : "+r"(ptr), "=w"(R), "=w"(G), "=w"(B), "=w"(A));
    r = to_fixed15(R);
    g = to_fixed15(G);
    b = to_fixed15(B);
    a = to_fixed15(A);

#else
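    // This is the canonical conversion noted above: u8*128 + u8/2 + (u8+1)>>8,
    // with u8 first zero-extended to 16-bit lanes.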
    auto to_fixed15 = [k](__m128i u8) {
        F u16 = _mm256_cvtepu8_epi16(u8);
        return (u16 << 7) + (u16 >> 1) + ((u16+k->_0x0001)>>8);
    };

    // TODO: shorter, more confusing, faster with 256-bit loads and shuffles

    // Load 16 interleaved pixels.
    auto _0123 = _mm_loadu_si128((const __m128i*)ptr + 0),
         _4567 = _mm_loadu_si128((const __m128i*)ptr + 1),
         _89AB = _mm_loadu_si128((const __m128i*)ptr + 2),
         _CDEF = _mm_loadu_si128((const __m128i*)ptr + 3);

    // We've got an awful lot of unpacking to do to transpose this...
    auto _0415 = _mm_unpacklo_epi8(_0123, _4567),   // r04 g04 b04 a04 r15 g15 b15 a15
         _2637 = _mm_unpackhi_epi8(_0123, _4567),   // r26 g26 b26 a26 r37 g37 b37 a37
         _8C9D = _mm_unpacklo_epi8(_89AB, _CDEF),
         _AEBF = _mm_unpackhi_epi8(_89AB, _CDEF);

    auto _0246 = _mm_unpacklo_epi8(_0415, _2637),   // r0246 g0246 b0246 a0246
         _1357 = _mm_unpackhi_epi8(_0415, _2637),   // r1357 g1357 b1357 a1357
         _8ACE = _mm_unpacklo_epi8(_8C9D, _AEBF),
         _9BDF = _mm_unpackhi_epi8(_8C9D, _AEBF);

    auto rg_01234567 = _mm_unpacklo_epi8(_0246, _1357),   // r01234567 g01234567
         ba_01234567 = _mm_unpackhi_epi8(_0246, _1357),   // b01234567 a01234567
         rg_89ABCDEF = _mm_unpacklo_epi8(_8ACE, _9BDF),   // r89ABCDEF g89ABCDEF
         ba_89ABCDEF = _mm_unpackhi_epi8(_8ACE, _9BDF);   // b89ABCDEF a89ABCDEF

    r = to_fixed15(_mm_unpacklo_epi64(rg_01234567, rg_89ABCDEF));
    g = to_fixed15(_mm_unpackhi_epi64(rg_01234567, rg_89ABCDEF));
    b = to_fixed15(_mm_unpacklo_epi64(ba_01234567, ba_89ABCDEF));
    a = to_fixed15(_mm_unpackhi_epi64(ba_01234567, ba_89ABCDEF));
#endif
}

STAGE(store_8888) {
    auto ptr = *(uint32_t**)ctx + x;

#if defined(__aarch64__)
    auto from_fixed15 = [](F v) {
        // The canonical math for this from SkFixed15.h is (v - (v>>8)) >> 7.
        // But what's really most important is that all bytes round trip.

        // We can do this in NEON in one instruction, a saturating narrowing right shift:
        return vqshrn_n_u16(v, 7);
    };

    uint8x8x4_t rgba = {{
        from_fixed15(r),
        from_fixed15(g),
        from_fixed15(b),
        from_fixed15(a),
    }};
    vst4_u8((uint8_t*)ptr, rgba);
#elif defined(__ARM_NEON__)
    auto from_fixed15 = [](F v) {
        // Same as aarch64, but first we need to pad our vectors from 8 to 16 bytes.
        F whatever;
        return vqshrn_n_u16(vcombine_u8(v, whatever), 7);
    };

    // As in load_8888, I can't get quite the ideal code generation using vst4_lane_u8().
    asm("vst4.8 {%1[0],%2[0],%3[0],%4[0]}, [%0]!\n"
        "vst4.8 {%1[1],%2[1],%3[1],%4[1]}, [%0]!\n"
        "vst4.8 {%1[2],%2[2],%3[2],%4[2]}, [%0]!\n"
        "vst4.8 {%1[3],%2[3],%3[3],%4[3]}, [%0]!\n"
        : "+r"(ptr)
        : "w"(from_fixed15(r)), "w"(from_fixed15(g)), "w"(from_fixed15(b)), "w"(from_fixed15(a))
        : "memory");

#else
    auto from_fixed15 = [](F v) {
        // See the note in aarch64's from_fixed15(). The same roundtrip goal applies here.
        // Here we take a different approach: (v saturated+ v) >> 8.
        v = (v+v) >> 8;
        return _mm_packus_epi16(_mm256_extracti128_si256(v, 0),
                                _mm256_extracti128_si256(v, 1));
    };

    auto R = from_fixed15(r),
         G = from_fixed15(g),
         B = from_fixed15(b),
         A = from_fixed15(a);

    auto rg_01234567 = _mm_unpacklo_epi8(R,G),   // rg0 rg1 rg2 ... rg7
         rg_89ABCDEF = _mm_unpackhi_epi8(R,G),   // rg8 rg9 rgA ... rgF
         ba_01234567 = _mm_unpacklo_epi8(B,A),
         ba_89ABCDEF = _mm_unpackhi_epi8(B,A);
    _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi16(rg_01234567, ba_01234567));
    _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi16(rg_01234567, ba_01234567));
    _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi16(rg_89ABCDEF, ba_89ABCDEF));
    _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi16(rg_89ABCDEF, ba_89ABCDEF));
#endif
}