blob: 5cf0ac46224c6b90aa782e8e77c86ca2d52ebdf2 [file] [log] [blame]
digit@google.com3ada0ef2012-08-13 14:06:34 +00001/*
2 * Copyright 2012 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
commit-bot@chromium.org2db4eab2013-09-13 12:39:09 +00008#include <arm_neon.h>
digit@google.com3ada0ef2012-08-13 14:06:34 +00009#include "SkColorPriv.h"
10
/*
 * Filter_32_opaque
 *
 * There is no hard-n-fast rule that the filtering must produce
 * exact results for the color components, but if the 4 incoming colors are
 * all opaque, then the output color must also be opaque. Subsequent parts of
 * the drawing pipeline may rely on this (e.g. which blitrow proc to use).
 *
 */
// Chrome on Android uses -Os so we need to force these inline. Otherwise
// calling the function in the inner loops will cause significant overhead on
// some platforms.
23static SK_ALWAYS_INLINE void Filter_32_opaque_neon(unsigned x, unsigned y,
24 SkPMColor a00, SkPMColor a01,
25 SkPMColor a10, SkPMColor a11,
26 SkPMColor *dst) {
commit-bot@chromium.org2db4eab2013-09-13 12:39:09 +000027 uint8x8_t vy, vconst16_8, v16_y, vres;
28 uint16x4_t vx, vconst16_16, v16_x, tmp;
29 uint32x2_t va0, va1;
30 uint16x8_t tmp1, tmp2;
rmistry@google.comfbfcd562012-08-23 18:09:54 +000031
commit-bot@chromium.org2db4eab2013-09-13 12:39:09 +000032 vy = vdup_n_u8(y); // duplicate y into vy
33 vconst16_8 = vmov_n_u8(16); // set up constant in vconst16_8
34 v16_y = vsub_u8(vconst16_8, vy); // v16_y = 16-y
rmistry@google.comfbfcd562012-08-23 18:09:54 +000035
commit-bot@chromium.org2db4eab2013-09-13 12:39:09 +000036 va0 = vdup_n_u32(a00); // duplicate a00
37 va1 = vdup_n_u32(a10); // duplicate a10
38 va0 = vset_lane_u32(a01, va0, 1); // set top to a01
39 va1 = vset_lane_u32(a11, va1, 1); // set top to a11
rmistry@google.comfbfcd562012-08-23 18:09:54 +000040
commit-bot@chromium.org2db4eab2013-09-13 12:39:09 +000041 tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
42 tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy); // tmp2 = [a11|a10] * y
rmistry@google.comfbfcd562012-08-23 18:09:54 +000043
commit-bot@chromium.org2db4eab2013-09-13 12:39:09 +000044 vx = vdup_n_u16(x); // duplicate x into vx
45 vconst16_16 = vmov_n_u16(16); // set up constant in vconst16_16
46 v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x
47
48 tmp = vmul_u16(vget_high_u16(tmp1), vx); // tmp = a01 * x
49 tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx); // tmp += a11 * x
50 tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)
51 tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)
52
53 vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down result by 8
54 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result
digit@google.com3ada0ef2012-08-13 14:06:34 +000055}
56
commit-bot@chromium.orgf71be962014-05-13 14:47:11 +000057static SK_ALWAYS_INLINE void Filter_32_alpha_neon(unsigned x, unsigned y,
58 SkPMColor a00, SkPMColor a01,
59 SkPMColor a10, SkPMColor a11,
60 SkPMColor *dst,
61 uint16_t scale) {
commit-bot@chromium.org2db4eab2013-09-13 12:39:09 +000062 uint8x8_t vy, vconst16_8, v16_y, vres;
63 uint16x4_t vx, vconst16_16, v16_x, tmp, vscale;
64 uint32x2_t va0, va1;
65 uint16x8_t tmp1, tmp2;
rmistry@google.comfbfcd562012-08-23 18:09:54 +000066
commit-bot@chromium.org2db4eab2013-09-13 12:39:09 +000067 vy = vdup_n_u8(y); // duplicate y into vy
68 vconst16_8 = vmov_n_u8(16); // set up constant in vconst16_8
69 v16_y = vsub_u8(vconst16_8, vy); // v16_y = 16-y
rmistry@google.comfbfcd562012-08-23 18:09:54 +000070
commit-bot@chromium.org2db4eab2013-09-13 12:39:09 +000071 va0 = vdup_n_u32(a00); // duplicate a00
72 va1 = vdup_n_u32(a10); // duplicate a10
73 va0 = vset_lane_u32(a01, va0, 1); // set top to a01
74 va1 = vset_lane_u32(a11, va1, 1); // set top to a11
rmistry@google.comfbfcd562012-08-23 18:09:54 +000075
commit-bot@chromium.org2db4eab2013-09-13 12:39:09 +000076 tmp1 = vmull_u8(vreinterpret_u8_u32(va0), v16_y); // tmp1 = [a01|a00] * (16-y)
77 tmp2 = vmull_u8(vreinterpret_u8_u32(va1), vy); // tmp2 = [a11|a10] * y
rmistry@google.comfbfcd562012-08-23 18:09:54 +000078
commit-bot@chromium.org2db4eab2013-09-13 12:39:09 +000079 vx = vdup_n_u16(x); // duplicate x into vx
80 vconst16_16 = vmov_n_u16(16); // set up constant in vconst16_16
81 v16_x = vsub_u16(vconst16_16, vx); // v16_x = 16-x
82
83 tmp = vmul_u16(vget_high_u16(tmp1), vx); // tmp = a01 * x
84 tmp = vmla_u16(tmp, vget_high_u16(tmp2), vx); // tmp += a11 * x
85 tmp = vmla_u16(tmp, vget_low_u16(tmp1), v16_x); // tmp += a00 * (16-x)
86 tmp = vmla_u16(tmp, vget_low_u16(tmp2), v16_x); // tmp += a10 * (16-x)
87
88 vscale = vdup_n_u16(scale); // duplicate scale
89 tmp = vshr_n_u16(tmp, 8); // shift down result by 8
90 tmp = vmul_u16(tmp, vscale); // multiply result by scale
91
92 vres = vshrn_n_u16(vcombine_u16(tmp, vcreate_u16(0)), 8); // shift down result by 8
93 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); // store result
digit@google.com3ada0ef2012-08-13 14:06:34 +000094}