| /* |
| * Copyright 2014 Google Inc. |
| * |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE file. |
| */ |
| |
| #include <arm_neon.h> |
| |
// Names of the four matrix procs emitted by this file. MAKENAME is not
// defined here; it must already be defined when this file is processed.
#define SCALE_NOFILTER_NAME     MAKENAME(_nofilter_scale)
#define SCALE_FILTER_NAME       MAKENAME(_filter_scale)
#define AFFINE_NOFILTER_NAME    MAKENAME(_nofilter_affine)
#define AFFINE_FILTER_NAME      MAKENAME(_filter_affine)

// Names of the coordinate-packing helpers used by the filtering procs:
// scalar (one coordinate) and NEON (four coordinates at a time) variants.
#define PACK_FILTER_X_NAME      MAKENAME(_pack_filter_x)
#define PACK_FILTER_Y_NAME      MAKENAME(_pack_filter_y)
#define PACK_FILTER_X4_NAME     MAKENAME(_pack_filter_x4)
#define PACK_FILTER_Y4_NAME     MAKENAME(_pack_filter_y4)

// If no PREAMBLE was supplied, fall back to an empty one: no per-call setup
// and no extra parameters/arguments threaded through the pack helpers.
#if !defined(PREAMBLE)
    #define PREAMBLE(state)
    #define PREAMBLE_PARAM_X
    #define PREAMBLE_PARAM_Y
    #define PREAMBLE_ARG_X
    #define PREAMBLE_ARG_Y
#endif
| |
/**
 *  Matrix proc for scale/translate-only matrices without filtering.
 *
 *  Output layout in xy[]: one 32-bit tiled Y value, followed by `count`
 *  16-bit tiled X values packed directly behind it.
 *
 *  @param s      proc state supplying the inverse matrix and the pixmap size
 *  @param xy     output buffer (see layout above)
 *  @param count  number of X coordinates to generate
 *  @param x, y   device-space start position handed to the auto mapper
 */
static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,
                                uint32_t xy[], int count, int x, int y) {
    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
                             SkMatrix::kScale_Mask)) == 0);

    PREAMBLE(s);

    // we store y, x, x, x, x, x
    const unsigned maxX = s.fPixmap.width() - 1;
    SkFractionalInt fx;
    {
        const SkBitmapProcStateAutoMapper mapper(s, x, y);
        const unsigned maxY = s.fPixmap.height() - 1;
        *xy = TILEY_PROCF(mapper.fixedY(), maxY);
        ++xy;
        fx = mapper.fractionalIntX();
    }

    if (0 == maxX) {
        // Single-column pixmap: every X must be 0, so just clear the
        // 16-bit X area.
        memset(xy, 0, count * sizeof(uint16_t));
        return;
    }

    const SkFractionalInt dx = s.fInvSxFractionalInt;

#ifdef CHECK_FOR_DECAL
    // test if we don't need to apply the tile proc
    const SkFixed fixedFx = SkFractionalIntToFixed(fx);
    const SkFixed fixedDx = SkFractionalIntToFixed(dx);
    if (can_truncate_to_fixed_for_decal(fixedFx, fixedDx, count, maxX)) {
        decal_nofilter_scale_neon(xy, fixedFx, fixedDx, count);
        return;
    }
#endif

    if (count >= 8) {
        const SkFractionalInt dx2 = dx + dx;
        const SkFractionalInt dx4 = dx2 + dx2;
        const SkFractionalInt dx8 = dx4 + dx4;

        // Per-iteration advance of both vectors (8 samples), as SkFixed.
        const int32x4_t step8 = vdupq_n_s32(SkFractionalIntToFixed(dx8));

        // lo4 = { fx, fx+dx, fx+2dx, fx+3dx }, hi4 = lo4 + 4dx
        int32x4_t lo4 = vdupq_n_s32(SkFractionalIntToFixed(fx));
        lo4 = vsetq_lane_s32(SkFractionalIntToFixed(fx + dx), lo4, 1);
        lo4 = vsetq_lane_s32(SkFractionalIntToFixed(fx + dx + dx), lo4, 2);
        lo4 = vsetq_lane_s32(SkFractionalIntToFixed(fx + dx + dx + dx), lo4, 3);
        int32x4_t hi4 = vaddq_s32(lo4, vdupq_n_s32(SkFractionalIntToFixed(dx4)));

        int16_t* out16 = (int16_t*)xy;

        // Tile and store eight X values per pass.
        do {
            const int16x8_t tiled = TILEX_PROCF_NEON8(lo4, hi4, maxX);
            vst1q_s16(out16, tiled);

            lo4 = vaddq_s32(lo4, step8);
            hi4 = vaddq_s32(hi4, step8);
            out16 += 8;
            count -= 8;
            // Keep the scalar position in step so the tail loop below
            // continues from where the vector loop stopped.
            fx += dx8;
        } while (count >= 8);

        xy = (uint32_t*)out16;
    }

    // Scalar tail: at most 7 leftovers (or everything when count < 8).
    uint16_t* xx = (uint16_t*)xy;
    for (int i = 0; i < count; ++i) {
        xx[i] = TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
        fx += dx;
    }
}
| |
/**
 *  Matrix proc for affine matrices without filtering.
 *
 *  Emits `count` 32-bit entries into xy[]. The scalar path writes each entry
 *  as (tiled Y << 16) | tiled X; the NEON path stores interleaved 16-bit
 *  X/Y lanes, eight coordinate pairs per iteration.
 *
 *  @param s      proc state supplying the inverse matrix and the pixmap size
 *  @param xy     output buffer, one 32-bit entry per sample
 *  @param count  number of samples to generate
 *  @param x, y   device-space start position handed to the auto mapper
 */
static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
                                 uint32_t xy[], int count, int x, int y) {
    SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
                             SkMatrix::kScale_Mask |
                             SkMatrix::kAffine_Mask)) == 0);

    PREAMBLE(s);
    const SkBitmapProcStateAutoMapper mapper(s, x, y);

    SkFractionalInt fx = mapper.fractionalIntX();
    SkFractionalInt fy = mapper.fractionalIntY();
    SkFractionalInt dx = s.fInvSxFractionalInt;
    SkFractionalInt dy = s.fInvKyFractionalInt;
    int maxX = s.fPixmap.width() - 1;
    int maxY = s.fPixmap.height() - 1;

    if (count >= 8) {
        const SkFractionalInt dx4 = dx * 4;
        const SkFractionalInt dy4 = dy * 4;
        const SkFractionalInt dx8 = dx * 8;
        const SkFractionalInt dy8 = dy * 8;

        // Per-iteration advance (8 samples) for the X and Y vectors.
        const int32x4_t xstep8 = vdupq_n_s32(SkFractionalIntToFixed(dx8));
        const int32x4_t ystep8 = vdupq_n_s32(SkFractionalIntToFixed(dy8));

        // xlo = { fx, fx+dx, fx+2dx, fx+3dx }
        int32x4_t xlo = vdupq_n_s32(SkFractionalIntToFixed(fx));
        xlo = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), xlo, 1);
        xlo = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), xlo, 2);
        xlo = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), xlo, 3);

        // ylo = { fy, fy+dy, fy+2dy, fy+3dy }
        int32x4_t ylo = vdupq_n_s32(SkFractionalIntToFixed(fy));
        ylo = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy), ylo, 1);
        ylo = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy), ylo, 2);
        ylo = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy+dy), ylo, 3);

        // Second group of four samples: four steps further along.
        int32x4_t xhi = vaddq_s32(xlo, vdupq_n_s32(SkFractionalIntToFixed(dx4)));
        int32x4_t yhi = vaddq_s32(ylo, vdupq_n_s32(SkFractionalIntToFixed(dy4)));

        int16_t* out16 = (int16_t*)xy;

        while (count >= 8) {
            int16x8x2_t pair;
            pair.val[0] = TILEX_PROCF_NEON8(xlo, xhi, maxX);
            pair.val[1] = TILEY_PROCF_NEON8(ylo, yhi, maxY);

            // Interleaving store: X lane, Y lane, X lane, Y lane, ...
            vst2q_s16(out16, pair);

            xlo = vaddq_s32(xlo, xstep8);
            xhi = vaddq_s32(xhi, xstep8);
            ylo = vaddq_s32(ylo, ystep8);
            yhi = vaddq_s32(yhi, ystep8);

            out16 += 16;    // 8 entries of 32 bits == 16 int16_t
            count -= 8;
            // Keep the scalar position in step for the tail loop below.
            fx += dx8;
            fy += dy8;
        }
        xy = (uint32_t*)out16;
    }

    // Scalar tail for whatever the vector loop did not cover.
    for (int i = 0; i < count; ++i) {
        xy[i] = (TILEY_PROCF(SkFractionalIntToFixed(fy), maxY) << 16) |
                 TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
        fx += dx;
        fy += dy;
    }
}
| |
| static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max, |
| SkFixed one PREAMBLE_PARAM_Y) { |
| unsigned i = TILEY_PROCF(f, max); |
| i = (i << 4) | EXTRACT_LOW_BITS(f, max); |
| return (i << 14) | (TILEY_PROCF((f + one), max)); |
| } |
| |
| static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max, |
| SkFixed one PREAMBLE_PARAM_X) { |
| unsigned i = TILEX_PROCF(f, max); |
| i = (i << 4) | EXTRACT_LOW_BITS(f, max); |
| return (i << 14) | (TILEX_PROCF((f + one), max)); |
| } |
| |
/**
 *  NEON counterpart of PACK_FILTER_X_NAME: packs four X coordinates at once.
 *
 *  Each lane ends up as ((tile(f) << 4) | lowbits(f)) << 14 | tile(f + one).
 *
 *  @param f    four X positions
 *  @param max  largest valid X index
 *  @param one  offset added to reach the second sample column
 *  @return     four packed entries, one per input lane
 */
static inline int32x4_t PACK_FILTER_X4_NAME(int32x4_t f, unsigned max,
                                            SkFixed one PREAMBLE_PARAM_X) {
    const int32x4_t wide_one = vdupq_n_s32(one);

    // First sample index for each lane.
    const int32x4_t first = TILEX_PROCF_NEON4(f, max);

    // Shift-left-and-insert: the first index goes above bit 4, while the low
    // 4 bits of the extracted value are kept underneath it.
    int32x4_t packed = EXTRACT_LOW_BITS_NEON4(f, max);
    packed = vsliq_n_s32(packed, first, 4);

    // Second sample index. The intrinsic is used instead of operator+ so the
    // code does not depend on compiler vector extensions.
    const int32x4_t second = TILEX_PROCF_NEON4(vaddq_s32(f, wide_one), max);

    return vorrq_s32(vshlq_n_s32(packed, 14), second);
}
| |
/**
 *  NEON counterpart of PACK_FILTER_Y_NAME: packs four Y coordinates at once.
 *
 *  Each lane ends up as ((tile(f) << 4) | lowbits(f)) << 14 | tile(f + one).
 *
 *  The extra parameter list is PREAMBLE_PARAM_Y, matching the scalar Y helper
 *  and the PREAMBLE_ARG_Y that the caller in this file passes.
 *
 *  @param f    four Y positions
 *  @param max  largest valid Y index
 *  @param one  offset added to reach the second sample row
 *  @return     four packed entries, one per input lane
 */
static inline int32x4_t PACK_FILTER_Y4_NAME(int32x4_t f, unsigned max,
                                            SkFixed one PREAMBLE_PARAM_Y) {
    const int32x4_t wide_one = vdupq_n_s32(one);

    // First sample index for each lane.
    const int32x4_t first = TILEY_PROCF_NEON4(f, max);

    // Shift-left-and-insert: the first index goes above bit 4, while the low
    // 4 bits of the extracted value are kept underneath it.
    int32x4_t packed = EXTRACT_LOW_BITS_NEON4(f, max);
    packed = vsliq_n_s32(packed, first, 4);

    // Second sample index. The intrinsic is used instead of operator+ so the
    // code does not depend on compiler vector extensions.
    const int32x4_t second = TILEY_PROCF_NEON4(vaddq_s32(f, wide_one), max);

    return vorrq_s32(vshlq_n_s32(packed, 14), second);
}
| |
/**
 *  Matrix proc for scale/translate-only matrices with filtering.
 *
 *  Output layout in xy[]: one packed Y entry (see PACK_FILTER_Y_NAME)
 *  followed by `count` packed X entries (see PACK_FILTER_X_NAME).
 *
 *  @param s      proc state supplying the inverse matrix, the filter "one"
 *                offsets and the pixmap size
 *  @param xy     output buffer, count + 1 32-bit entries are written
 *  @param count  number of X entries to generate
 *  @param x, y   device-space start position handed to the auto mapper
 */
static void SCALE_FILTER_NAME(const SkBitmapProcState& s,
                              uint32_t xy[], int count, int x, int y) {
    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
                             SkMatrix::kScale_Mask)) == 0);
    SkASSERT(s.fInvKy == 0);

    PREAMBLE(s);

    const unsigned maxX = s.fPixmap.width() - 1;
    const SkFixed one = s.fFilterOneX;
    const SkFractionalInt dx = s.fInvSxFractionalInt;
    SkFractionalInt fx;

    {
        const SkBitmapProcStateAutoMapper mapper(s, x, y);
        const SkFixed fy = mapper.fixedY();
        const unsigned maxY = s.fPixmap.height() - 1;
        // compute our two Y values up front
        *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y);
        // now initialize fx
        fx = mapper.fractionalIntX();
    }

#ifdef CHECK_FOR_DECAL
    // test if we don't need to apply the tile proc
    const SkFixed fixedFx = SkFractionalIntToFixed(fx);
    const SkFixed fixedDx = SkFractionalIntToFixed(dx);
    if (can_truncate_to_fixed_for_decal(fixedFx, fixedDx, count, maxX)) {
        decal_filter_scale_neon(xy, fixedFx, fixedDx, count);
        return;
    }
#endif

    if (count >= 4) {
        const SkFractionalInt dx4 = dx+dx+dx+dx;
        // Per-iteration advance (4 samples) of the X vector, as SkFixed.
        const int32x4_t wide_dx4 = vdupq_n_s32(SkFractionalIntToFixed(dx4));

        // wide_fx = { fx, fx+dx, fx+2dx, fx+3dx }
        int32x4_t wide_fx = vdupq_n_s32(SkFractionalIntToFixed(fx));
        wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), wide_fx, 1);
        wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), wide_fx, 2);
        wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), wide_fx, 3);

        do {
            const int32x4_t res = PACK_FILTER_X4_NAME(wide_fx, maxX, one PREAMBLE_ARG_X);
            vst1q_u32(xy, vreinterpretq_u32_s32(res));

            // Use the intrinsic rather than operator+= so the code does not
            // depend on compiler vector extensions.
            wide_fx = vaddq_s32(wide_fx, wide_dx4);
            // Keep the scalar position in step for the tail loop below.
            fx += dx4;
            xy += 4;
            count -= 4;
        } while (count >= 4);
    }

    // Scalar tail for the remaining 0..3 entries (or all when count < 4).
    for (; count > 0; --count) {
        *xy++ = PACK_FILTER_X_NAME(SkFractionalIntToFixed(fx), maxX, one PREAMBLE_ARG_X);
        fx += dx;
    }
}
| |
/**
 *  Matrix proc for affine matrices with filtering.
 *
 *  For every sample two 32-bit entries are written to xy[]: the packed Y
 *  entry first, then the packed X entry (see PACK_FILTER_Y_NAME /
 *  PACK_FILTER_X_NAME for the packing).
 *
 *  @param s      proc state supplying the inverse matrix, the filter "one"
 *                offsets and the pixmap size
 *  @param xy     output buffer, 2 * count 32-bit entries are written
 *  @param count  number of samples to generate
 *  @param x, y   device-space start position handed to the auto mapper
 */
static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
                               uint32_t xy[], int count, int x, int y) {
    SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
                             SkMatrix::kScale_Mask |
                             SkMatrix::kAffine_Mask)) == 0);

    PREAMBLE(s);
    const SkBitmapProcStateAutoMapper mapper(s, x, y);

    SkFixed oneX = s.fFilterOneX;
    SkFixed oneY = s.fFilterOneY;
    SkFixed fx = mapper.fixedX();
    SkFixed fy = mapper.fixedY();
    SkFixed dx = s.fInvSx;
    SkFixed dy = s.fInvKy;
    unsigned maxX = s.fPixmap.width() - 1;
    unsigned maxY = s.fPixmap.height() - 1;

    if (count >= 4) {
        const SkFixed dx4 = dx+dx+dx+dx;
        const SkFixed dy4 = dy+dy+dy+dy;
        // Per-iteration advance (4 samples) of the X and Y vectors.
        const int32x4_t wide_dx4 = vdupq_n_s32(dx4);
        const int32x4_t wide_dy4 = vdupq_n_s32(dy4);

        // wide_fx = { fx, fx+dx, fx+2dx, fx+3dx }
        int32x4_t wide_fx = vdupq_n_s32(fx);
        wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
        wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
        wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);

        // wide_fy = { fy, fy+dy, fy+2dy, fy+3dy }
        int32x4_t wide_fy = vdupq_n_s32(fy);
        wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1);
        wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2);
        wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3);

        do {
            int32x4x2_t vxy;

            // Y goes in val[0] and X in val[1] so the interleaving store
            // below produces Y, X, Y, X, ... like the scalar tail.
            vxy.val[0] = PACK_FILTER_Y4_NAME(wide_fy, maxY, oneY PREAMBLE_ARG_Y);
            vxy.val[1] = PACK_FILTER_X4_NAME(wide_fx, maxX, oneX PREAMBLE_ARG_X);

            // interleave as YXYXYXYX as part of the storing
            vst2q_s32((int32_t*)xy, vxy);

            // Advance to the next four samples. Intrinsics are used rather
            // than operator+= so the code does not depend on compiler vector
            // extensions; fx/fy are kept in step for the scalar tail.
            wide_fx = vaddq_s32(wide_fx, wide_dx4);
            fx += dx4;
            wide_fy = vaddq_s32(wide_fy, wide_dy4);
            fy += dy4;
            xy += 8;    // 4 x's, 4 y's
            count -= 4;
        } while (count >= 4);
    }

    // Scalar tail for the remaining 0..3 samples (or all when count < 4).
    for (; count > 0; --count) {
        // NB: writing Y/X
        *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y);
        fy += dy;
        *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X);
        fx += dx;
    }
}
| |
| const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = { |
| SCALE_NOFILTER_NAME, |
| SCALE_FILTER_NAME, |
| AFFINE_NOFILTER_NAME, |
| AFFINE_FILTER_NAME, |
| }; |
| |
// Clean up every macro this file consumed or defined so that it can be
// processed again with a different configuration.

// NEON tiling macros (defined before this file is processed).
#undef TILEX_PROCF_NEON8
#undef TILEY_PROCF_NEON8
#undef TILEX_PROCF_NEON4
#undef TILEY_PROCF_NEON4
#undef EXTRACT_LOW_BITS_NEON4

// Scalar configuration macros (defined before this file is processed).
#undef MAKENAME
#undef TILEX_PROCF
#undef TILEY_PROCF
#ifdef CHECK_FOR_DECAL
    #undef CHECK_FOR_DECAL
#endif

// Proc names defined at the top of this file.
#undef SCALE_NOFILTER_NAME
#undef SCALE_FILTER_NAME
#undef AFFINE_NOFILTER_NAME
#undef AFFINE_FILTER_NAME

// Pack-helper names defined at the top of this file.
#undef PACK_FILTER_X_NAME
#undef PACK_FILTER_Y_NAME
#undef PACK_FILTER_X4_NAME
#undef PACK_FILTER_Y4_NAME

#undef PREAMBLE
#undef PREAMBLE_PARAM_X
#undef PREAMBLE_PARAM_Y
#undef PREAMBLE_ARG_X
#undef PREAMBLE_ARG_Y

#undef EXTRACT_LOW_BITS