/*
 * Copyright 2015 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkNx_neon_DEFINED
#define SkNx_neon_DEFINED

#include <arm_neon.h>

namespace { // NOLINT(google-build-namespaces)

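// AI marks these methods always-inline; it comes from SkNx.h, the header that includes this one.
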
// ARMv8 has vrndm(q)_f32 to floor floats. Here we emulate it:
//   - roundtrip through integers via truncation
//   - subtract 1 if that's too big (possible for negative values).
// This restricts the domain of our inputs to a maximum somewhere around 2^31. Seems plenty big.
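// For example, -2.5 truncates to -2.0; since -2.0 > -2.5, we subtract 1 to get floor(-2.5) = -3.0.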
AI static float32x4_t emulate_vrndmq_f32(float32x4_t v) {
    auto roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
    auto too_big   = vcgtq_f32(roundtrip, v);
    return vsubq_f32(roundtrip, (float32x4_t)vandq_u32(too_big, (uint32x4_t)vdupq_n_f32(1)));
}
AI static float32x2_t emulate_vrndm_f32(float32x2_t v) {
    auto roundtrip = vcvt_f32_s32(vcvt_s32_f32(v));
    auto too_big   = vcgt_f32(roundtrip, v);
    return vsub_f32(roundtrip, (float32x2_t)vand_u32(too_big, (uint32x2_t)vdup_n_f32(1)));
}

template <>
class SkNx<2, float> {
public:
    AI SkNx(float32x2_t vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(float val) : fVec(vdup_n_f32(val)) {}
    AI SkNx(float a, float b) { fVec = (float32x2_t) { a, b }; }

    AI static SkNx Load(const void* ptr) { return vld1_f32((const float*)ptr); }
    AI void store(void* ptr) const { vst1_f32((float*)ptr, fVec); }

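    // vld2_f32 reads four floats and de-interleaves on the way in:
    // {x0,y0,x1,y1} in memory becomes {x0,x1} and {y0,y1}.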
    AI static void Load2(const void* ptr, SkNx* x, SkNx* y) {
        float32x2x2_t xy = vld2_f32((const float*) ptr);
        *x = xy.val[0];
        *y = xy.val[1];
    }

    AI static void Store2(void* dst, const SkNx& a, const SkNx& b) {
        float32x2x2_t ab = {{
            a.fVec,
            b.fVec,
        }};
        vst2_f32((float*) dst, ab);
    }

    AI static void Store3(void* dst, const SkNx& a, const SkNx& b, const SkNx& c) {
        float32x2x3_t abc = {{
            a.fVec,
            b.fVec,
            c.fVec,
        }};
        vst3_f32((float*) dst, abc);
    }

    AI static void Store4(void* dst, const SkNx& a, const SkNx& b, const SkNx& c, const SkNx& d) {
        float32x2x4_t abcd = {{
            a.fVec,
            b.fVec,
            c.fVec,
            d.fVec,
        }};
        vst4_f32((float*) dst, abcd);
    }

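    // vrecpe_f32 gives a rough estimate of 1/x, and vrecps_f32(e, x) computes 2 - e*x,
    // so est1 = est0 * (2 - est0*x) is one Newton-Raphson refinement of the estimate.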
    AI SkNx invert() const {
        float32x2_t est0 = vrecpe_f32(fVec),
                    est1 = vmul_f32(vrecps_f32(est0, fVec), est0);
        return est1;
    }

    AI SkNx operator - () const { return vneg_f32(fVec); }

    AI SkNx operator + (const SkNx& o) const { return vadd_f32(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return vsub_f32(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return vmul_f32(fVec, o.fVec); }
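    // ARMv7 NEON has no float divide, so below we refine a reciprocal estimate of the
    // divisor twice, then multiply.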
    AI SkNx operator / (const SkNx& o) const {
    #if defined(SK_CPU_ARM64)
        return vdiv_f32(fVec, o.fVec);
    #else
        float32x2_t est0 = vrecpe_f32(o.fVec),
                    est1 = vmul_f32(vrecps_f32(est0, o.fVec), est0),
                    est2 = vmul_f32(vrecps_f32(est1, o.fVec), est1);
        return vmul_f32(fVec, est2);
    #endif
    }

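    // Comparisons set each lane to all 1s (true) or all 0s (false); the mask is
    // reinterpreted as floats so it can feed thenElse() below.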
    AI SkNx operator==(const SkNx& o) const { return vreinterpret_f32_u32(vceq_f32(fVec, o.fVec)); }
    AI SkNx operator <(const SkNx& o) const { return vreinterpret_f32_u32(vclt_f32(fVec, o.fVec)); }
    AI SkNx operator >(const SkNx& o) const { return vreinterpret_f32_u32(vcgt_f32(fVec, o.fVec)); }
    AI SkNx operator<=(const SkNx& o) const { return vreinterpret_f32_u32(vcle_f32(fVec, o.fVec)); }
    AI SkNx operator>=(const SkNx& o) const { return vreinterpret_f32_u32(vcge_f32(fVec, o.fVec)); }
    AI SkNx operator!=(const SkNx& o) const {
        return vreinterpret_f32_u32(vmvn_u32(vceq_f32(fVec, o.fVec)));
    }

    AI static SkNx Min(const SkNx& l, const SkNx& r) { return vmin_f32(l.fVec, r.fVec); }
    AI static SkNx Max(const SkNx& l, const SkNx& r) { return vmax_f32(l.fVec, r.fVec); }

    AI SkNx abs() const { return vabs_f32(fVec); }
    AI SkNx floor() const {
    #if defined(SK_CPU_ARM64)
        return vrndm_f32(fVec);
    #else
        return emulate_vrndm_f32(fVec);
    #endif
    }

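    // vrsqrte_f32 estimates 1/sqrt(x), and vrsqrts_f32(x, e*e) computes (3 - x*e*e)/2,
    // so multiplying by the estimate is one Newton-Raphson step; sqrt() below then
    // uses sqrt(x) = x * (1/sqrt(x)).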
    AI SkNx rsqrt() const {
        float32x2_t est0 = vrsqrte_f32(fVec);
        return vmul_f32(vrsqrts_f32(fVec, vmul_f32(est0, est0)), est0);
    }

    AI SkNx sqrt() const {
    #if defined(SK_CPU_ARM64)
        return vsqrt_f32(fVec);
    #else
        float32x2_t est0 = vrsqrte_f32(fVec),
                    est1 = vmul_f32(vrsqrts_f32(fVec, vmul_f32(est0, est0)), est0),
                    est2 = vmul_f32(vrsqrts_f32(fVec, vmul_f32(est1, est1)), est1);
        return vmul_f32(fVec, est2);
    #endif
    }

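    // Lane reads go through a union type pun (well-defined on GCC/Clang); k&1 keeps
    // the index in range.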
    AI float operator[](int k) const {
        SkASSERT(0 <= k && k < 2);
        union { float32x2_t v; float fs[2]; } pun = {fVec};
        return pun.fs[k&1];
    }

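    // On ARM64, vminv/vmaxv reduce across lanes: all lanes are true iff the minimum
    // is non-zero; any lane is true iff the maximum is non-zero.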
    AI bool allTrue() const {
    #if defined(SK_CPU_ARM64)
        return 0 != vminv_u32(vreinterpret_u32_f32(fVec));
    #else
        auto v = vreinterpret_u32_f32(fVec);
        return vget_lane_u32(v,0) && vget_lane_u32(v,1);
    #endif
    }
    AI bool anyTrue() const {
    #if defined(SK_CPU_ARM64)
        return 0 != vmaxv_u32(vreinterpret_u32_f32(fVec));
    #else
        auto v = vreinterpret_u32_f32(fVec);
        return vget_lane_u32(v,0) || vget_lane_u32(v,1);
    #endif
    }

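    // vbsl (bitwise select) computes (mask & t) | (~mask & e), so each lane of the
    // comparison mask picks the corresponding lane of t or e.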
    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return vbsl_f32(vreinterpret_u32_f32(fVec), t.fVec, e.fVec);
    }

    float32x2_t fVec;
};

template <>
class SkNx<4, float> {
public:
    AI SkNx(float32x4_t vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(float val) : fVec(vdupq_n_f32(val)) {}
    AI SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; }

    AI static SkNx Load(const void* ptr) { return vld1q_f32((const float*)ptr); }
    AI void store(void* ptr) const { vst1q_f32((float*)ptr, fVec); }

    AI static void Load2(const void* ptr, SkNx* x, SkNx* y) {
        float32x4x2_t xy = vld2q_f32((const float*) ptr);
        *x = xy.val[0];
        *y = xy.val[1];
    }

    AI static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
        float32x4x4_t rgba = vld4q_f32((const float*) ptr);
        *r = rgba.val[0];
        *g = rgba.val[1];
        *b = rgba.val[2];
        *a = rgba.val[3];
    }
    AI static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
        float32x4x4_t rgba = {{
            r.fVec,
            g.fVec,
            b.fVec,
            a.fVec,
        }};
        vst4q_f32((float*) dst, rgba);
    }

    AI SkNx invert() const {
        float32x4_t est0 = vrecpeq_f32(fVec),
                    est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0);
        return est1;
    }

    AI SkNx operator - () const { return vnegq_f32(fVec); }

    AI SkNx operator + (const SkNx& o) const { return vaddq_f32(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return vsubq_f32(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return vmulq_f32(fVec, o.fVec); }
    AI SkNx operator / (const SkNx& o) const {
    #if defined(SK_CPU_ARM64)
        return vdivq_f32(fVec, o.fVec);
    #else
        float32x4_t est0 = vrecpeq_f32(o.fVec),
                    est1 = vmulq_f32(vrecpsq_f32(est0, o.fVec), est0),
                    est2 = vmulq_f32(vrecpsq_f32(est1, o.fVec), est1);
        return vmulq_f32(fVec, est2);
    #endif
    }

    AI SkNx operator==(const SkNx& o) const {return vreinterpretq_f32_u32(vceqq_f32(fVec, o.fVec));}
    AI SkNx operator <(const SkNx& o) const {return vreinterpretq_f32_u32(vcltq_f32(fVec, o.fVec));}
    AI SkNx operator >(const SkNx& o) const {return vreinterpretq_f32_u32(vcgtq_f32(fVec, o.fVec));}
    AI SkNx operator<=(const SkNx& o) const {return vreinterpretq_f32_u32(vcleq_f32(fVec, o.fVec));}
    AI SkNx operator>=(const SkNx& o) const {return vreinterpretq_f32_u32(vcgeq_f32(fVec, o.fVec));}
    AI SkNx operator!=(const SkNx& o) const {
        return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(fVec, o.fVec)));
    }

    AI static SkNx Min(const SkNx& l, const SkNx& r) { return vminq_f32(l.fVec, r.fVec); }
    AI static SkNx Max(const SkNx& l, const SkNx& r) { return vmaxq_f32(l.fVec, r.fVec); }

    AI SkNx abs() const { return vabsq_f32(fVec); }
    AI SkNx floor() const {
    #if defined(SK_CPU_ARM64)
        return vrndmq_f32(fVec);
    #else
        return emulate_vrndmq_f32(fVec);
    #endif
    }

    AI SkNx rsqrt() const {
        float32x4_t est0 = vrsqrteq_f32(fVec);
        return vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0);
    }

    AI SkNx sqrt() const {
    #if defined(SK_CPU_ARM64)
        return vsqrtq_f32(fVec);
    #else
        float32x4_t est0 = vrsqrteq_f32(fVec),
                    est1 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0),
                    est2 = vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1);
        return vmulq_f32(fVec, est2);
    #endif
    }
258
Mike Klein7c78f3a2016-10-19 09:21:11 -0400259 AI float operator[](int k) const {
mtklein7c0db752016-07-30 14:18:49 -0700260 SkASSERT(0 <= k && k < 4);
261 union { float32x4_t v; float fs[4]; } pun = {fVec};
262 return pun.fs[k&3];
mtkleinc9adb052015-03-30 10:50:27 -0700263 }
264
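    // Without ARM64's horizontal vminvq/vmaxvq, vrev64q_f32 swaps lanes {0,1} and {2,3};
    // Min/Max against the swapped vector leaves each pairwise result in lanes 0 and 2.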
    AI float min() const {
    #if defined(SK_CPU_ARM64)
        return vminvq_f32(fVec);
    #else
        SkNx min = Min(*this, vrev64q_f32(fVec));
        return SkTMin(min[0], min[2]);
    #endif
    }

    AI float max() const {
    #if defined(SK_CPU_ARM64)
        return vmaxvq_f32(fVec);
    #else
        SkNx max = Max(*this, vrev64q_f32(fVec));
        return SkTMax(max[0], max[2]);
    #endif
    }

    AI bool allTrue() const {
    #if defined(SK_CPU_ARM64)
        return 0 != vminvq_u32(vreinterpretq_u32_f32(fVec));
    #else
        auto v = vreinterpretq_u32_f32(fVec);
        return vgetq_lane_u32(v,0) && vgetq_lane_u32(v,1)
            && vgetq_lane_u32(v,2) && vgetq_lane_u32(v,3);
    #endif
    }
    AI bool anyTrue() const {
    #if defined(SK_CPU_ARM64)
        return 0 != vmaxvq_u32(vreinterpretq_u32_f32(fVec));
    #else
        auto v = vreinterpretq_u32_f32(fVec);
        return vgetq_lane_u32(v,0) || vgetq_lane_u32(v,1)
            || vgetq_lane_u32(v,2) || vgetq_lane_u32(v,3);
    #endif
    }

    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return vbslq_f32(vreinterpretq_u32_f32(fVec), t.fVec, e.fVec);
    }

    float32x4_t fVec;
};

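// vfmaq_f32 fuses the multiply and add into one instruction with a single rounding.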
#if defined(SK_CPU_ARM64)
    AI static Sk4f SkNx_fma(const Sk4f& f, const Sk4f& m, const Sk4f& a) {
        return vfmaq_f32(a.fVec, f.fVec, m.fVec);
    }
#endif

// It's possible that for our current use cases, representing this as
// half a uint16x8_t might be better than representing it as a uint16x4_t.
// It'd make conversion to Sk4b one step simpler.
template <>
class SkNx<4, uint16_t> {
public:
    AI SkNx(const uint16x4_t& vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(uint16_t val) : fVec(vdup_n_u16(val)) {}
    AI SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
        fVec = (uint16x4_t) { a,b,c,d };
    }

    AI static SkNx Load(const void* ptr) { return vld1_u16((const uint16_t*)ptr); }
    AI void store(void* ptr) const { vst1_u16((uint16_t*)ptr, fVec); }

    AI static void Load4(const void* ptr, SkNx* r, SkNx* g, SkNx* b, SkNx* a) {
        uint16x4x4_t rgba = vld4_u16((const uint16_t*)ptr);
        *r = rgba.val[0];
        *g = rgba.val[1];
        *b = rgba.val[2];
        *a = rgba.val[3];
    }
    AI static void Load3(const void* ptr, SkNx* r, SkNx* g, SkNx* b) {
        uint16x4x3_t rgb = vld3_u16((const uint16_t*)ptr);
        *r = rgb.val[0];
        *g = rgb.val[1];
        *b = rgb.val[2];
    }
    AI static void Store4(void* dst, const SkNx& r, const SkNx& g, const SkNx& b, const SkNx& a) {
        uint16x4x4_t rgba = {{
            r.fVec,
            g.fVec,
            b.fVec,
            a.fVec,
        }};
        vst4_u16((uint16_t*) dst, rgba);
    }

    AI SkNx operator + (const SkNx& o) const { return vadd_u16(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return vsub_u16(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return vmul_u16(fVec, o.fVec); }
    AI SkNx operator & (const SkNx& o) const { return vand_u16(fVec, o.fVec); }
    AI SkNx operator | (const SkNx& o) const { return vorr_u16(fVec, o.fVec); }

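    // Shifts use the GCC/Clang vector extension: the scalar count is splatted to a
    // vector, then each lane shifts by the matching lane.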
    AI SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; }
    AI SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; }

    AI static SkNx Min(const SkNx& a, const SkNx& b) { return vmin_u16(a.fVec, b.fVec); }

    AI uint16_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { uint16x4_t v; uint16_t us[4]; } pun = {fVec};
        return pun.us[k&3];
    }

    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return vbsl_u16(fVec, t.fVec, e.fVec);
    }

    uint16x4_t fVec;
};

template <>
class SkNx<8, uint16_t> {
public:
    AI SkNx(const uint16x8_t& vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(uint16_t val) : fVec(vdupq_n_u16(val)) {}
    AI static SkNx Load(const void* ptr) { return vld1q_u16((const uint16_t*)ptr); }

    AI SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
            uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
        fVec = (uint16x8_t) { a,b,c,d, e,f,g,h };
    }

    AI void store(void* ptr) const { vst1q_u16((uint16_t*)ptr, fVec); }

    AI SkNx operator + (const SkNx& o) const { return vaddq_u16(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return vsubq_u16(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return vmulq_u16(fVec, o.fVec); }
    AI SkNx operator & (const SkNx& o) const { return vandq_u16(fVec, o.fVec); }
    AI SkNx operator | (const SkNx& o) const { return vorrq_u16(fVec, o.fVec); }

    AI SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; }
    AI SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; }

    AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.fVec); }

    AI uint16_t operator[](int k) const {
        SkASSERT(0 <= k && k < 8);
        union { uint16x8_t v; uint16_t us[8]; } pun = {fVec};
        return pun.us[k&7];
    }

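    // mulHi: widen each half to 32-bit products with vmull_u16, then vshrn_n_u32(..,16)
    // narrows back, keeping only the high 16 bits of each product.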
    AI SkNx mulHi(const SkNx& m) const {
        uint32x4_t hi = vmull_u16(vget_high_u16(fVec), vget_high_u16(m.fVec));
        uint32x4_t lo = vmull_u16( vget_low_u16(fVec),  vget_low_u16(m.fVec));

        return { vcombine_u16(vshrn_n_u32(lo,16), vshrn_n_u32(hi,16)) };
    }

    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return vbslq_u16(fVec, t.fVec, e.fVec);
    }

    uint16x8_t fVec;
};

template <>
class SkNx<4, uint8_t> {
public:
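    // aligned(1) tells the compiler this uint32_t may sit at any address, so the
    // 4-byte Load() and store() below are safe on unaligned pointers.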
    typedef uint32_t __attribute__((aligned(1))) unaligned_uint32_t;

    AI SkNx(const uint8x8_t& vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
        fVec = (uint8x8_t){a,b,c,d, 0,0,0,0};
    }
    AI static SkNx Load(const void* ptr) {
        return (uint8x8_t)vld1_dup_u32((const unaligned_uint32_t*)ptr);
    }
    AI void store(void* ptr) const {
        return vst1_lane_u32((unaligned_uint32_t*)ptr, (uint32x2_t)fVec, 0);
    }
    AI uint8_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { uint8x8_t v; uint8_t us[8]; } pun = {fVec};
        return pun.us[k&3];
    }

    // TODO as needed

    uint8x8_t fVec;
};

template <>
class SkNx<8, uint8_t> {
public:
    AI SkNx(const uint8x8_t& vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(uint8_t val) : fVec(vdup_n_u8(val)) {}
    AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
            uint8_t e, uint8_t f, uint8_t g, uint8_t h) {
        fVec = (uint8x8_t) { a,b,c,d, e,f,g,h };
    }

    AI static SkNx Load(const void* ptr) { return vld1_u8((const uint8_t*)ptr); }
    AI void store(void* ptr) const { vst1_u8((uint8_t*)ptr, fVec); }

    AI uint8_t operator[](int k) const {
        SkASSERT(0 <= k && k < 8);
        union { uint8x8_t v; uint8_t us[8]; } pun = {fVec};
        return pun.us[k&7];
    }

    uint8x8_t fVec;
};

template <>
class SkNx<16, uint8_t> {
public:
    AI SkNx(const uint8x16_t& vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(uint8_t val) : fVec(vdupq_n_u8(val)) {}
    AI SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
            uint8_t e, uint8_t f, uint8_t g, uint8_t h,
            uint8_t i, uint8_t j, uint8_t k, uint8_t l,
            uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
        fVec = (uint8x16_t) { a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p };
    }

    AI static SkNx Load(const void* ptr) { return vld1q_u8((const uint8_t*)ptr); }
    AI void store(void* ptr) const { vst1q_u8((uint8_t*)ptr, fVec); }

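    // vqaddq_u8 saturates: lane sums clamp at 255 instead of wrapping.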
    AI SkNx saturatedAdd(const SkNx& o) const { return vqaddq_u8(fVec, o.fVec); }

    AI SkNx operator + (const SkNx& o) const { return vaddq_u8(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return vsubq_u8(fVec, o.fVec); }
    AI SkNx operator & (const SkNx& o) const { return vandq_u8(fVec, o.fVec); }

    AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u8(a.fVec, b.fVec); }
    AI SkNx operator < (const SkNx& o) const { return vcltq_u8(fVec, o.fVec); }

    AI uint8_t operator[](int k) const {
        SkASSERT(0 <= k && k < 16);
        union { uint8x16_t v; uint8_t us[16]; } pun = {fVec};
        return pun.us[k&15];
    }

    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return vbslq_u8(fVec, t.fVec, e.fVec);
    }

    uint8x16_t fVec;
};

template <>
class SkNx<4, int32_t> {
public:
    AI SkNx(const int32x4_t& vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(int32_t v) {
        fVec = vdupq_n_s32(v);
    }
    AI SkNx(int32_t a, int32_t b, int32_t c, int32_t d) {
        fVec = (int32x4_t){a,b,c,d};
    }
    AI static SkNx Load(const void* ptr) {
        return vld1q_s32((const int32_t*)ptr);
    }
    AI void store(void* ptr) const {
        return vst1q_s32((int32_t*)ptr, fVec);
    }
    AI int32_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { int32x4_t v; int32_t is[4]; } pun = {fVec};
        return pun.is[k&3];
    }

    AI SkNx operator + (const SkNx& o) const { return vaddq_s32(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return vsubq_s32(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return vmulq_s32(fVec, o.fVec); }

    AI SkNx operator & (const SkNx& o) const { return vandq_s32(fVec, o.fVec); }
    AI SkNx operator | (const SkNx& o) const { return vorrq_s32(fVec, o.fVec); }
    AI SkNx operator ^ (const SkNx& o) const { return veorq_s32(fVec, o.fVec); }

    AI SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; }
    AI SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; }

    AI SkNx operator == (const SkNx& o) const {
        return vreinterpretq_s32_u32(vceqq_s32(fVec, o.fVec));
    }
    AI SkNx operator < (const SkNx& o) const {
        return vreinterpretq_s32_u32(vcltq_s32(fVec, o.fVec));
    }
    AI SkNx operator > (const SkNx& o) const {
        return vreinterpretq_s32_u32(vcgtq_s32(fVec, o.fVec));
    }

    AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_s32(a.fVec, b.fVec); }
    AI static SkNx Max(const SkNx& a, const SkNx& b) { return vmaxq_s32(a.fVec, b.fVec); }
    // TODO as needed

    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return vbslq_s32(vreinterpretq_u32_s32(fVec), t.fVec, e.fVec);
    }

    AI SkNx abs() const { return vabsq_s32(fVec); }

    int32x4_t fVec;
};

template <>
class SkNx<4, uint32_t> {
public:
    AI SkNx(const uint32x4_t& vec) : fVec(vec) {}

    AI SkNx() {}
    AI SkNx(uint32_t v) {
        fVec = vdupq_n_u32(v);
    }
    AI SkNx(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
        fVec = (uint32x4_t){a,b,c,d};
    }
    AI static SkNx Load(const void* ptr) {
        return vld1q_u32((const uint32_t*)ptr);
    }
    AI void store(void* ptr) const {
        return vst1q_u32((uint32_t*)ptr, fVec);
    }
    AI uint32_t operator[](int k) const {
        SkASSERT(0 <= k && k < 4);
        union { uint32x4_t v; uint32_t us[4]; } pun = {fVec};
        return pun.us[k&3];
    }

    AI SkNx operator + (const SkNx& o) const { return vaddq_u32(fVec, o.fVec); }
    AI SkNx operator - (const SkNx& o) const { return vsubq_u32(fVec, o.fVec); }
    AI SkNx operator * (const SkNx& o) const { return vmulq_u32(fVec, o.fVec); }

    AI SkNx operator & (const SkNx& o) const { return vandq_u32(fVec, o.fVec); }
    AI SkNx operator | (const SkNx& o) const { return vorrq_u32(fVec, o.fVec); }
    AI SkNx operator ^ (const SkNx& o) const { return veorq_u32(fVec, o.fVec); }

    AI SkNx operator << (int bits) const { return fVec << SkNx(bits).fVec; }
    AI SkNx operator >> (int bits) const { return fVec >> SkNx(bits).fVec; }

    AI SkNx operator == (const SkNx& o) const { return vceqq_u32(fVec, o.fVec); }
    AI SkNx operator < (const SkNx& o) const { return vcltq_u32(fVec, o.fVec); }
    AI SkNx operator > (const SkNx& o) const { return vcgtq_u32(fVec, o.fVec); }

    AI static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u32(a.fVec, b.fVec); }
    // TODO as needed

    AI SkNx mulHi(const SkNx& m) const {
        uint64x2_t hi = vmull_u32(vget_high_u32(fVec), vget_high_u32(m.fVec));
        uint64x2_t lo = vmull_u32( vget_low_u32(fVec),  vget_low_u32(m.fVec));

        return { vcombine_u32(vshrn_n_u64(lo,32), vshrn_n_u64(hi,32)) };
    }

    AI SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return vbslq_u32(fVec, t.fVec, e.fVec);
    }

    uint32x4_t fVec;
};

template<> AI /*static*/ Sk4i SkNx_cast<int32_t, float>(const Sk4f& src) {
    return vcvtq_s32_f32(src.fVec);
}
template<> AI /*static*/ Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) {
    return vcvtq_f32_s32(src.fVec);
}
template<> AI /*static*/ Sk4f SkNx_cast<float, uint32_t>(const Sk4u& src) {
    return SkNx_cast<float>(Sk4i::Load(&src));
}

template<> AI /*static*/ Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {
    return vqmovn_u32(vcvtq_u32_f32(src.fVec));
}

template<> AI /*static*/ Sk4f SkNx_cast<float, uint16_t>(const Sk4h& src) {
    return vcvtq_f32_u32(vmovl_u16(src.fVec));
}

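// Float -> byte narrows in two saturating steps: 32 -> 16 bits, then 16 -> 8 bits.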
template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {
    uint32x4_t _32 = vcvtq_u32_f32(src.fVec);
    uint16x4_t _16 = vqmovn_u32(_32);
    return vqmovn_u16(vcombine_u16(_16, _16));
}

template<> AI /*static*/ Sk4u SkNx_cast<uint32_t, uint8_t>(const Sk4b& src) {
    uint16x8_t _16 = vmovl_u8(src.fVec);
    return vmovl_u16(vget_low_u16(_16));
}

template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint8_t>(const Sk4b& src) {
    return vreinterpretq_s32_u32(SkNx_cast<uint32_t>(src).fVec);
}

template<> AI /*static*/ Sk4f SkNx_cast<float, uint8_t>(const Sk4b& src) {
    return vcvtq_f32_s32(SkNx_cast<int32_t>(src).fVec);
}

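// Each converted 32-bit lane holds its value in the low byte, so two rounds of
// vuzpq_u8 (keeping .val[0], the even bytes) collapse 16 converted ints into 16 bytes.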
template<> AI /*static*/ Sk16b SkNx_cast<uint8_t, float>(const Sk16f& src) {
    Sk8f ab, cd;
    SkNx_split(src, &ab, &cd);

    Sk4f a,b,c,d;
    SkNx_split(ab, &a, &b);
    SkNx_split(cd, &c, &d);
    return vuzpq_u8(vuzpq_u8((uint8x16_t)vcvtq_u32_f32(a.fVec),
                             (uint8x16_t)vcvtq_u32_f32(b.fVec)).val[0],
                    vuzpq_u8((uint8x16_t)vcvtq_u32_f32(c.fVec),
                             (uint8x16_t)vcvtq_u32_f32(d.fVec)).val[0]).val[0];
}

template<> AI /*static*/ Sk8b SkNx_cast<uint8_t, int32_t>(const Sk8i& src) {
    Sk4i a, b;
    SkNx_split(src, &a, &b);
    uint16x4_t a16 = vqmovun_s32(a.fVec);
    uint16x4_t b16 = vqmovun_s32(b.fVec);

    return vqmovn_u16(vcombine_u16(a16, b16));
}

template<> AI /*static*/ Sk4h SkNx_cast<uint16_t, uint8_t>(const Sk4b& src) {
    return vget_low_u16(vmovl_u8(src.fVec));
}

template<> AI /*static*/ Sk8h SkNx_cast<uint16_t, uint8_t>(const Sk8b& src) {
    return vmovl_u8(src.fVec);
}

template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {
    return vmovn_u16(vcombine_u16(src.fVec, src.fVec));
}

template<> AI /*static*/ Sk8b SkNx_cast<uint8_t, uint16_t>(const Sk8h& src) {
    return vqmovn_u16(src.fVec);
}

template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, int32_t>(const Sk4i& src) {
    uint16x4_t _16 = vqmovun_s32(src.fVec);
    return vqmovn_u16(vcombine_u16(_16, _16));
}

template<> AI /*static*/ Sk4b SkNx_cast<uint8_t, uint32_t>(const Sk4u& src) {
    uint16x4_t _16 = vqmovn_u32(src.fVec);
    return vqmovn_u16(vcombine_u16(_16, _16));
}

template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint16_t>(const Sk4h& src) {
    return vreinterpretq_s32_u32(vmovl_u16(src.fVec));
}

template<> AI /*static*/ Sk4h SkNx_cast<uint16_t, int32_t>(const Sk4i& src) {
    return vmovn_u32(vreinterpretq_u32_s32(src.fVec));
}

template<> AI /*static*/ Sk4i SkNx_cast<int32_t, uint32_t>(const Sk4u& src) {
    return vreinterpretq_s32_u32(src.fVec);
}

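// Adds 0.5 and truncates toward zero, i.e. round-half-up; note this is only a correct
// round() for non-negative inputs.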
AI static Sk4i Sk4f_round(const Sk4f& x) {
    return vcvtq_s32_f32((x + 0.5f).fVec);
}

} // namespace

#endif//SkNx_neon_DEFINED