digit@google.com | 3ada0ef | 2012-08-13 14:06:34 +0000 | [diff] [blame] | 1 | /* |
| 2 | * Copyright 2012 The Android Open Source Project |
| 3 | * |
| 4 | * Use of this source code is governed by a BSD-style license that can be |
| 5 | * found in the LICENSE file. |
| 6 | */ |
| 7 | |
commit-bot@chromium.org | 2db4eab | 2013-09-13 12:39:09 +0000 | [diff] [blame] | 8 | #include <arm_neon.h> |
digit@google.com | 3ada0ef | 2012-08-13 14:06:34 +0000 | [diff] [blame] | 9 | #include "SkColorPriv.h" |
| 10 | |
/*
 * Filter_32_opaque
 *
 * There is no hard-and-fast rule that the filtering must produce
 * exact results for the color components, but if the 4 incoming colors are
 * all opaque, then the output color must also be opaque. Subsequent parts of
 * the drawing pipeline may rely on this (e.g. when choosing which blitrow
 * proc to use).
 */
commit-bot@chromium.org | f71be96 | 2014-05-13 14:47:11 +0000 | [diff] [blame] | 20 | // Chrome on Android uses -Os so we need to force these inline. Otherwise |
| 21 | // calling the function in the inner loops will cause significant overhead on |
| 22 | // some platforms. |
static SK_ALWAYS_INLINE void Filter_32_opaque_neon(unsigned x, unsigned y,
                                                   SkPMColor a00, SkPMColor a01,
                                                   SkPMColor a10, SkPMColor a11,
                                                   SkPMColor *dst) {
    /*
     * Blends four packed 32-bit pixels, treating each 8-bit channel
     * independently:
     *
     *   *dst = (a00*(16-x)*(16-y) + a01*x*(16-y)
     *         + a10*(16-x)*y      + a11*x*y) >> 8
     *
     * All intermediate sums are kept in 16-bit lanes.
     * NOTE(review): presumably x and y are sub-pixel weights no larger than
     * 16, so that the 16-bit accumulator cannot wrap -- confirm with callers.
     */

    // Vertical weights replicated across all eight 8-bit lanes.
    const uint8x8_t wy = vdup_n_u8(y);
    const uint8x8_t wy_inv = vsub_u8(vdup_n_u8(16), wy);       // 16 - y

    // One 64-bit vector per source row: lane 0 holds aX0, lane 1 holds aX1.
    const uint32x2_t row0 = vset_lane_u32(a01, vdup_n_u32(a00), 1);
    const uint32x2_t row1 = vset_lane_u32(a11, vdup_n_u32(a10), 1);

    // Widening 8x8 -> 16 bit multiplies apply the vertical weights.
    const uint16x8_t row0_y = vmull_u8(vreinterpret_u8_u32(row0), wy_inv);  // [a01|a00] * (16-y)
    const uint16x8_t row1_y = vmull_u8(vreinterpret_u8_u32(row1), wy);      // [a11|a10] * y

    // Horizontal weights replicated across the four 16-bit lanes.
    const uint16x4_t wx = vdup_n_u16(x);
    const uint16x4_t wx_inv = vsub_u16(vdup_n_u16(16), wx);    // 16 - x

    // High halves carry the a01/a11 terms (weighted by x); low halves carry
    // the a00/a10 terms (weighted by 16-x).
    uint16x4_t acc = vmul_u16(vget_high_u16(row0_y), wx);      // a01 * x
    acc = vmla_u16(acc, vget_high_u16(row1_y), wx);            // + a11 * x
    acc = vmla_u16(acc, vget_low_u16(row0_y), wx_inv);         // + a00 * (16-x)
    acc = vmla_u16(acc, vget_low_u16(row1_y), wx_inv);         // + a10 * (16-x)

    // Divide by 256 while narrowing back to 8 bits per channel; the upper
    // half of the combined vector is zero padding and is never stored.
    const uint8x8_t packed = vshrn_n_u16(vcombine_u16(acc, vcreate_u16(0)), 8);

    // Only lane 0 (one 32-bit pixel) is written out.
    vst1_lane_u32(dst, vreinterpret_u32_u8(packed), 0);
}
| 56 | |
/*
 * Filter_32_alpha
 *
 * Same four-pixel blend as Filter_32_opaque_neon, followed by a multiply of
 * every channel by 'scale':
 *
 *   blended = (a00*(16-x)*(16-y) + a01*x*(16-y)
 *            + a10*(16-x)*y      + a11*x*y) >> 8
 *   *dst    = (blended * scale) >> 8
 *
 * All intermediate values are kept in 16-bit lanes.
 * NOTE(review): presumably x and y are no larger than 16 and scale is no
 * larger than 256, so that none of the 16-bit products wrap -- confirm with
 * callers.
 */
static SK_ALWAYS_INLINE void Filter_32_alpha_neon(unsigned x, unsigned y,
                                                  SkPMColor a00, SkPMColor a01,
                                                  SkPMColor a10, SkPMColor a11,
                                                  SkPMColor *dst,
                                                  uint16_t scale) {
    // Vertical weights replicated across all eight 8-bit lanes.
    const uint8x8_t wy = vdup_n_u8(y);
    const uint8x8_t wy_inv = vsub_u8(vdup_n_u8(16), wy);       // 16 - y

    // One 64-bit vector per source row: lane 0 holds aX0, lane 1 holds aX1.
    const uint32x2_t row0 = vset_lane_u32(a01, vdup_n_u32(a00), 1);
    const uint32x2_t row1 = vset_lane_u32(a11, vdup_n_u32(a10), 1);

    // Widening 8x8 -> 16 bit multiplies apply the vertical weights.
    const uint16x8_t row0_y = vmull_u8(vreinterpret_u8_u32(row0), wy_inv);  // [a01|a00] * (16-y)
    const uint16x8_t row1_y = vmull_u8(vreinterpret_u8_u32(row1), wy);      // [a11|a10] * y

    // Horizontal weights replicated across the four 16-bit lanes.
    const uint16x4_t wx = vdup_n_u16(x);
    const uint16x4_t wx_inv = vsub_u16(vdup_n_u16(16), wx);    // 16 - x

    // High halves carry the a01/a11 terms (weighted by x); low halves carry
    // the a00/a10 terms (weighted by 16-x).
    uint16x4_t acc = vmul_u16(vget_high_u16(row0_y), wx);      // a01 * x
    acc = vmla_u16(acc, vget_high_u16(row1_y), wx);            // + a11 * x
    acc = vmla_u16(acc, vget_low_u16(row0_y), wx_inv);         // + a00 * (16-x)
    acc = vmla_u16(acc, vget_low_u16(row1_y), wx_inv);         // + a10 * (16-x)

    // Bring the blend back to 8-bit range first, so the multiply by scale
    // stays within the 16-bit lanes.
    acc = vshr_n_u16(acc, 8);
    acc = vmul_u16(acc, vdup_n_u16(scale));

    // Divide by 256 while narrowing back to 8 bits per channel; the upper
    // half of the combined vector is zero padding and is never stored.
    const uint8x8_t packed = vshrn_n_u16(vcombine_u16(acc, vcreate_u16(0)), 8);

    // Only lane 0 (one 32-bit pixel) is written out.
    vst1_lane_u32(dst, vreinterpret_u32_u8(packed), 0);
}