// blob: fabe45d4c3a4f215d15eb5d4a20942285079f772 [file] [log] [blame]
/*
* Copyright 2014 Google Inc.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include <arm_neon.h>
// Names of the functions defined in this file. Each one is built with
// MAKENAME, which is not defined here: the including file must supply it
// (it is #undef'd again at the bottom of this file).
#define SCALE_NOFILTER_NAME MAKENAME(_nofilter_scale)
#define SCALE_FILTER_NAME MAKENAME(_filter_scale)
#define AFFINE_NOFILTER_NAME MAKENAME(_nofilter_affine)
#define AFFINE_FILTER_NAME MAKENAME(_filter_affine)
#define PACK_FILTER_X_NAME MAKENAME(_pack_filter_x)
#define PACK_FILTER_Y_NAME MAKENAME(_pack_filter_y)
#define PACK_FILTER_X4_NAME MAKENAME(_pack_filter_x4)
#define PACK_FILTER_Y4_NAME MAKENAME(_pack_filter_y4)
// Optional hooks. If the includer did not define PREAMBLE, it and its
// companions default to nothing:
//   PREAMBLE(state)     - expanded once near the top of each matrix proc
//   PREAMBLE_PARAM_X/Y  - appended to the parameter lists of the pack helpers
//   PREAMBLE_ARG_X/Y    - appended to the argument lists when calling them
#ifndef PREAMBLE
#define PREAMBLE(state)
#define PREAMBLE_PARAM_X
#define PREAMBLE_PARAM_Y
#define PREAMBLE_ARG_X
#define PREAMBLE_ARG_Y
#endif
static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,
uint32_t xy[], int count, int x, int y) {
SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
SkMatrix::kScale_Mask)) == 0);
PREAMBLE(s);
// we store y, x, x, x, x, x
const unsigned maxX = s.fPixmap.width() - 1;
SkFractionalInt fx;
{
const SkBitmapProcStateAutoMapper mapper(s, x, y);
const unsigned maxY = s.fPixmap.height() - 1;
*xy++ = TILEY_PROCF(mapper.fixedY(), maxY);
fx = mapper.fractionalIntX();
}
if (0 == maxX) {
// all of the following X values must be 0
memset(xy, 0, count * sizeof(uint16_t));
return;
}
const SkFractionalInt dx = s.fInvSxFractionalInt;
#ifdef CHECK_FOR_DECAL
// test if we don't need to apply the tile proc
const SkFixed fixedFx = SkFractionalIntToFixed(fx);
const SkFixed fixedDx = SkFractionalIntToFixed(dx);
if (can_truncate_to_fixed_for_decal(fixedFx, fixedDx, count, maxX)) {
decal_nofilter_scale_neon(xy, fixedFx, fixedDx, count);
return;
}
#endif
if (count >= 8) {
SkFractionalInt dx2 = dx+dx;
SkFractionalInt dx4 = dx2+dx2;
SkFractionalInt dx8 = dx4+dx4;
// now build fx/fx+dx/fx+2dx/fx+3dx
SkFractionalInt fx1, fx2, fx3;
int32x4_t lbase, hbase;
int16_t *dst16 = (int16_t *)xy;
fx1 = fx+dx;
fx2 = fx1+dx;
fx3 = fx2+dx;
lbase = vdupq_n_s32(SkFractionalIntToFixed(fx));
lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx1), lbase, 1);
lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx2), lbase, 2);
lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx3), lbase, 3);
hbase = vaddq_s32(lbase, vdupq_n_s32(SkFractionalIntToFixed(dx4)));
// store & bump
while (count >= 8) {
int16x8_t fx8;
fx8 = TILEX_PROCF_NEON8(lbase, hbase, maxX);
vst1q_s16(dst16, fx8);
// but preserving base & on to the next
lbase = vaddq_s32 (lbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
hbase = vaddq_s32 (hbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
dst16 += 8;
count -= 8;
fx += dx8;
};
xy = (uint32_t *) dst16;
}
uint16_t* xx = (uint16_t*)xy;
for (int i = count; i > 0; --i) {
*xx++ = TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
fx += dx;
}
}
static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
uint32_t xy[], int count, int x, int y) {
SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
SkMatrix::kScale_Mask |
SkMatrix::kAffine_Mask)) == 0);
PREAMBLE(s);
const SkBitmapProcStateAutoMapper mapper(s, x, y);
SkFractionalInt fx = mapper.fractionalIntX();
SkFractionalInt fy = mapper.fractionalIntY();
SkFractionalInt dx = s.fInvSxFractionalInt;
SkFractionalInt dy = s.fInvKyFractionalInt;
int maxX = s.fPixmap.width() - 1;
int maxY = s.fPixmap.height() - 1;
if (count >= 8) {
SkFractionalInt dx4 = dx * 4;
SkFractionalInt dy4 = dy * 4;
SkFractionalInt dx8 = dx * 8;
SkFractionalInt dy8 = dy * 8;
int32x4_t xbase, ybase;
int32x4_t x2base, y2base;
int16_t *dst16 = (int16_t *) xy;
// now build fx, fx+dx, fx+2dx, fx+3dx
xbase = vdupq_n_s32(SkFractionalIntToFixed(fx));
xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), xbase, 1);
xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), xbase, 2);
xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), xbase, 3);
// same for fy
ybase = vdupq_n_s32(SkFractionalIntToFixed(fy));
ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy), ybase, 1);
ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy), ybase, 2);
ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy+dy), ybase, 3);
x2base = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx4)));
y2base = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy4)));
// store & bump
do {
int16x8x2_t hi16;
hi16.val[0] = TILEX_PROCF_NEON8(xbase, x2base, maxX);
hi16.val[1] = TILEY_PROCF_NEON8(ybase, y2base, maxY);
vst2q_s16(dst16, hi16);
// moving base and on to the next
xbase = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
ybase = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy8)));
x2base = vaddq_s32(x2base, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
y2base = vaddq_s32(y2base, vdupq_n_s32(SkFractionalIntToFixed(dy8)));
dst16 += 16; // 8x32 aka 16x16
count -= 8;
fx += dx8;
fy += dy8;
} while (count >= 8);
xy = (uint32_t *) dst16;
}
for (int i = count; i > 0; --i) {
*xy++ = (TILEY_PROCF(SkFractionalIntToFixed(fy), maxY) << 16) |
TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
fx += dx; fy += dy;
}
}
// Packs one filtered Y lookup into a 32-bit word:
//   ((TILEY(f) << 4 | EXTRACT_LOW_BITS(f)) << 14) | TILEY(f + one)
static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max,
                                          SkFixed one PREAMBLE_PARAM_Y) {
    const unsigned coord0 = TILEY_PROCF(f, max);
    const unsigned coordAndFrac = (coord0 << 4) | EXTRACT_LOW_BITS(f, max);
    return (coordAndFrac << 14) | (TILEY_PROCF((f + one), max));
}
// Packs one filtered X lookup into a 32-bit word:
//   ((TILEX(f) << 4 | EXTRACT_LOW_BITS(f)) << 14) | TILEX(f + one)
static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max,
                                          SkFixed one PREAMBLE_PARAM_X) {
    const unsigned coord0 = TILEX_PROCF(f, max);
    const unsigned coordAndFrac = (coord0 << 4) | EXTRACT_LOW_BITS(f, max);
    return (coordAndFrac << 14) | (TILEX_PROCF((f + one), max));
}
// Four-lane version of PACK_FILTER_X_NAME.
//
// For each lane of `f` this produces
//   ((TILEX(f) << 4 | low 4 bits of EXTRACT_LOW_BITS(f)) << 14) | TILEX(f + one)
// using the NEON4 variants of the tile / low-bits macros.
static inline int32x4_t PACK_FILTER_X4_NAME(int32x4_t f, unsigned max,
                                            SkFixed one PREAMBLE_PARAM_X) {
    const int32x4_t wide_one = vdupq_n_s32(one);

    // Step 1: tiled coordinate of the first sample.
    const int32x4_t tile0 = TILEX_PROCF_NEON4(f, max);

    // Step 2: filter fraction in the low 4 bits, tiled coordinate inserted
    // above it (vsli shifts tile0 left by 4 and keeps the low 4 bits of ret).
    int32x4_t ret = EXTRACT_LOW_BITS_NEON4(f, max);
    ret = vsliq_n_s32(ret, tile0, 4);

    // Step 3: tiled coordinate of the second sample goes into the low 14 bits.
    // Use the intrinsic rather than operator+ on the vector type so this does
    // not depend on compiler vector extensions.
    const int32x4_t tile1 = TILEX_PROCF_NEON4(vaddq_s32(f, wide_one), max);
    return vorrq_s32(vshlq_n_s32(ret, 14), tile1);
}
// Four-lane version of PACK_FILTER_Y_NAME.
//
// For each lane of `f` this produces
//   ((TILEY(f) << 4 | low 4 bits of EXTRACT_LOW_BITS(f)) << 14) | TILEY(f + one)
// using the NEON4 variants of the tile / low-bits macros.
//
// The extra parameters are PREAMBLE_PARAM_Y (not _X): the body uses the Y
// tile macros and the caller passes PREAMBLE_ARG_Y.
static inline int32x4_t PACK_FILTER_Y4_NAME(int32x4_t f, unsigned max,
                                            SkFixed one PREAMBLE_PARAM_Y) {
    const int32x4_t wide_one = vdupq_n_s32(one);

    // Step 1: tiled coordinate of the first sample.
    const int32x4_t tile0 = TILEY_PROCF_NEON4(f, max);

    // Step 2: filter fraction in the low 4 bits, tiled coordinate inserted
    // above it (vsli shifts tile0 left by 4 and keeps the low 4 bits of ret).
    int32x4_t ret = EXTRACT_LOW_BITS_NEON4(f, max);
    ret = vsliq_n_s32(ret, tile0, 4);

    // Step 3: tiled coordinate of the second sample goes into the low 14 bits.
    // Use the intrinsic rather than operator+ on the vector type so this does
    // not depend on compiler vector extensions.
    const int32x4_t tile1 = TILEY_PROCF_NEON4(vaddq_s32(f, wide_one), max);
    return vorrq_s32(vshlq_n_s32(ret, 14), tile1);
}
static void SCALE_FILTER_NAME(const SkBitmapProcState& s,
uint32_t xy[], int count, int x, int y) {
SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
SkMatrix::kScale_Mask)) == 0);
SkASSERT(s.fInvKy == 0);
PREAMBLE(s);
const unsigned maxX = s.fPixmap.width() - 1;
const SkFixed one = s.fFilterOneX;
const SkFractionalInt dx = s.fInvSxFractionalInt;
SkFractionalInt fx;
{
const SkBitmapProcStateAutoMapper mapper(s, x, y);
const SkFixed fy = mapper.fixedY();
const unsigned maxY = s.fPixmap.height() - 1;
// compute our two Y values up front
*xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y);
// now initialize fx
fx = mapper.fractionalIntX();
}
#ifdef CHECK_FOR_DECAL
// test if we don't need to apply the tile proc
const SkFixed fixedFx = SkFractionalIntToFixed(fx);
const SkFixed fixedDx = SkFractionalIntToFixed(dx);
if (can_truncate_to_fixed_for_decal(fixedFx, fixedDx, count, maxX)) {
decal_filter_scale_neon(xy, fixedFx, fixedDx, count);
return;
}
#endif
{
if (count >= 4) {
int32x4_t wide_fx;
wide_fx = vdupq_n_s32(SkFractionalIntToFixed(fx));
wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), wide_fx, 1);
wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), wide_fx, 2);
wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), wide_fx, 3);
while (count >= 4) {
int32x4_t res;
res = PACK_FILTER_X4_NAME(wide_fx, maxX, one PREAMBLE_ARG_X);
vst1q_u32(xy, vreinterpretq_u32_s32(res));
wide_fx += vdupq_n_s32(SkFractionalIntToFixed(dx+dx+dx+dx));
fx += dx+dx+dx+dx;
xy += 4;
count -= 4;
}
}
while (--count >= 0) {
*xy++ = PACK_FILTER_X_NAME(SkFractionalIntToFixed(fx), maxX, one PREAMBLE_ARG_X);
fx += dx;
}
}
}
static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
uint32_t xy[], int count, int x, int y) {
SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
SkMatrix::kScale_Mask |
SkMatrix::kAffine_Mask)) == 0);
PREAMBLE(s);
const SkBitmapProcStateAutoMapper mapper(s, x, y);
SkFixed oneX = s.fFilterOneX;
SkFixed oneY = s.fFilterOneY;
SkFixed fx = mapper.fixedX();
SkFixed fy = mapper.fixedY();
SkFixed dx = s.fInvSx;
SkFixed dy = s.fInvKy;
unsigned maxX = s.fPixmap.width() - 1;
unsigned maxY = s.fPixmap.height() - 1;
if (count >= 4) {
int32x4_t wide_fy, wide_fx;
wide_fx = vdupq_n_s32(fx);
wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
wide_fy = vdupq_n_s32(fy);
wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1);
wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2);
wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3);
while (count >= 4) {
int32x4x2_t vxy;
// do the X side, then the Y side, then interleave them
vxy.val[0] = PACK_FILTER_Y4_NAME(wide_fy, maxY, oneY PREAMBLE_ARG_Y);
vxy.val[1] = PACK_FILTER_X4_NAME(wide_fx, maxX, oneX PREAMBLE_ARG_X);
// interleave as YXYXYXYX as part of the storing
vst2q_s32((int32_t*)xy, vxy);
// prepare next iteration
wide_fx += vdupq_n_s32(dx+dx+dx+dx);
fx += dx + dx + dx + dx;
wide_fy += vdupq_n_s32(dy+dy+dy+dy);
fy += dy+dy+dy+dy;
xy += 8; // 4 x's, 4 y's
count -= 4;
}
}
while (--count >= 0) {
// NB: writing Y/X
*xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y);
fy += dy;
*xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X);
fx += dx;
}
}
// Table of the four matrix procs defined above for this instantiation.
// Entry order: [0] scale/no-filter, [1] scale/filter,
//              [2] affine/no-filter, [3] affine/filter.
const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = {
SCALE_NOFILTER_NAME,
SCALE_FILTER_NAME,
AFFINE_NOFILTER_NAME,
AFFINE_FILTER_NAME,
};
// Clean up every macro this file consumed or defined, so that it can be
// included again with a different set of definitions.

// Supplied by the including file.
#undef TILEX_PROCF_NEON8
#undef TILEY_PROCF_NEON8
#undef TILEX_PROCF_NEON4
#undef TILEY_PROCF_NEON4
#undef EXTRACT_LOW_BITS_NEON4
#undef MAKENAME
#undef TILEX_PROCF
#undef TILEY_PROCF
#ifdef CHECK_FOR_DECAL
#undef CHECK_FOR_DECAL
#endif

// Function-name macros defined at the top of this file.
#undef SCALE_NOFILTER_NAME
#undef SCALE_FILTER_NAME
#undef AFFINE_NOFILTER_NAME
#undef AFFINE_FILTER_NAME
// The pack-helper names are defined alongside the four above, so they are
// released here as well rather than being left to leak into the includer.
#undef PACK_FILTER_X_NAME
#undef PACK_FILTER_Y_NAME
#undef PACK_FILTER_X4_NAME
#undef PACK_FILTER_Y4_NAME

// Optional hooks (either supplied by the includer or defaulted to empty).
#undef PREAMBLE
#undef PREAMBLE_PARAM_X
#undef PREAMBLE_PARAM_Y
#undef PREAMBLE_ARG_X
#undef PREAMBLE_ARG_Y
#undef EXTRACT_LOW_BITS