grab from latest android



git-svn-id: http://skia.googlecode.com/svn/trunk@27 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/src/effects/SkGradientShader.cpp b/src/effects/SkGradientShader.cpp
new file mode 100644
index 0000000..a1674cb
--- /dev/null
+++ b/src/effects/SkGradientShader.cpp
@@ -0,0 +1,1562 @@
+/* libs/graphics/effects/SkGradientShader.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+#include "SkGradientShader.h"
+#include "SkColorPriv.h"
+#include "SkUnitMapper.h"
+#include "SkUtils.h"
+
+/*
+    ToDo
+
+    - not sure we still need the full Rec struct, now that we're using a cache
+    - detect const-alpha (but not opaque) in getFlags()
+*/
+
+/* dither seems to look better, but not stuningly yet, and it slows us down a little
+    so its not on by default yet.
+*/
+#define TEST_GRADIENT_DITHER
+
+///////////////////////////////////////////////////////////////////////////
+
+typedef SkFixed (*TileProc)(SkFixed);
+
+static SkFixed clamp_tileproc(SkFixed x)
+{
+    return SkClampMax(x, 0xFFFF);
+}
+
+static SkFixed repeat_tileproc(SkFixed x)
+{
+    return x & 0xFFFF;
+}
+
+static inline SkFixed mirror_tileproc(SkFixed x)
+{
+    int s = x << 15 >> 31;
+    return (x ^ s) & 0xFFFF;
+}
+
+static const TileProc gTileProcs[] = {
+    clamp_tileproc,
+    repeat_tileproc,
+    mirror_tileproc
+};
+
+//////////////////////////////////////////////////////////////////////////////
+
+static inline int repeat_6bits(int x)
+{
+    return x & 63;
+}
+
+static inline int mirror_6bits(int x)
+{
+#ifdef SK_CPU_HAS_CONDITIONAL_INSTR
+    if (x & 64)
+        x = ~x;
+    return x & 63;
+#else
+    int s = x << 25 >> 31;
+    return (x ^ s) & 63;
+#endif
+}
+
+static inline int repeat_8bits(int x)
+{
+    return x & 0xFF;
+}
+
+static inline int mirror_8bits(int x)
+{
+#ifdef SK_CPU_HAS_CONDITIONAL_INSTR
+    if (x & 256)
+        x = ~x;
+    return x & 255;
+#else
+    int s = x << 23 >> 31;
+    return (x ^ s) & 0xFF;
+#endif
+}
+
+//////////////////////////////////////////////////////////////////////////////
+
+class Gradient_Shader : public SkShader {
+public:
+    Gradient_Shader(const SkColor colors[], const SkScalar pos[],
+                    int colorCount, SkShader::TileMode mode, SkUnitMapper* mapper);
+    virtual ~Gradient_Shader();
+
+    // overrides
+    virtual bool setContext(const SkBitmap&, const SkPaint&, const SkMatrix&);
+    virtual uint32_t getFlags() { return fFlags; }
+
+protected:
+    Gradient_Shader(SkFlattenableReadBuffer& );
+    SkUnitMapper* fMapper;
+    SkMatrix    fPtsToUnit;     // set by subclass
+    SkMatrix    fDstToIndex;
+    SkMatrix::MapXYProc fDstToIndexProc;
+    SkPMColor*  fARGB32;
+    TileMode    fTileMode;
+    TileProc    fTileProc;
+    uint16_t    fColorCount;
+    uint8_t     fDstToIndexClass;
+    uint8_t     fFlags;
+
+    struct Rec {
+        SkFixed     fPos;   // 0...1
+        uint32_t    fScale; // (1 << 24) / range
+    };
+    Rec*        fRecs;
+
+    enum {
+        kCache16Bits    = 6,    // seems like enough for visual accuracy
+        kCache16Count   = 1 << kCache16Bits,
+        kCache32Bits    = 8,    // pretty much should always be 8
+        kCache32Count   = 1 << kCache32Bits
+    };
+    virtual void flatten(SkFlattenableWriteBuffer& );
+    const uint16_t*     getCache16();
+    const SkPMColor*    getCache32();
+
+private:
+    enum {
+        kColorStorageCount = 4, // more than this many colors, and we'll use sk_malloc for the space
+
+        kStorageSize = kColorStorageCount * (sizeof(SkColor) + sizeof(SkPMColor) + sizeof(Rec))
+    };
+    SkColor     fStorage[(kStorageSize + 3) >> 2];
+    SkColor*    fOrigColors;
+
+    uint16_t*   fCache16;   // working ptr. If this is NULL, we need to recompute the cache values
+    SkPMColor*  fCache32;   // working ptr. If this is NULL, we need to recompute the cache values
+
+    uint16_t*   fCache16Storage;    // storage for fCache16, allocated on demand
+    SkPMColor*  fCache32Storage;    // storage for fCache32, allocated on demand
+    unsigned    fCacheAlpha;        // the alpha value we used when we computed the cache. larger than 8bits so we can store uninitialized value
+
+    typedef SkShader INHERITED;
+};
+
+static inline unsigned scalarToU16(SkScalar x)
+{
+    SkASSERT(x >= 0 && x <= SK_Scalar1);
+
+#ifdef SK_SCALAR_IS_FLOAT
+    return (unsigned)(x * 0xFFFF);
+#else
+    return x - (x >> 16);   // probably should be x - (x > 0x7FFF) but that is slower
+#endif
+}
+
+Gradient_Shader::Gradient_Shader(const SkColor colors[], const SkScalar pos[], int colorCount,
+                      SkShader::TileMode mode, SkUnitMapper* mapper)
+{
+    SkASSERT(colorCount > 1);
+
+    fCacheAlpha = 256;  // init to a value that paint.getAlpha() can't return
+
+    fMapper = mapper;
+    mapper->safeRef();
+
+    fCache16 = fCache16Storage = NULL;
+    fCache32 = fCache32Storage = NULL;
+
+    fColorCount = SkToU16(colorCount);
+    if (colorCount > kColorStorageCount)
+        fOrigColors = (SkColor*)sk_malloc_throw((sizeof(SkColor) + sizeof(SkPMColor) + sizeof(Rec)) * colorCount);
+    else
+        fOrigColors = fStorage;
+    memcpy(fOrigColors, colors, colorCount * sizeof(SkColor));
+    // our premul colors point to the 2nd half of the array
+    // these are assigned each time in setContext
+    fARGB32 = fOrigColors + colorCount;
+
+    SkASSERT((unsigned)mode < SkShader::kTileModeCount);
+    SkASSERT(SkShader::kTileModeCount == SK_ARRAY_COUNT(gTileProcs));
+    fTileMode = mode;
+    fTileProc = gTileProcs[mode];
+
+    fRecs = (Rec*)(fARGB32 + colorCount);
+    if (colorCount > 2)
+    {
+        Rec* recs = fRecs;
+
+        recs[0].fPos = 0;
+    //  recs[0].fScale = 0; // unused;
+        if (pos)
+        {
+            /*  We need to convert the user's array of relative positions into
+                fixed-point positions and scale factors. We need these results
+                to be strictly monotonic (no two values equal or out of order).
+                Hence this complex loop that just jams a zero for the scale
+                value if it sees a segment out of order, and it assures that
+                we start at 0 and end at 1.0
+            */
+            SkFixed prev = 0;
+            for (int i = 1; i < colorCount; i++)
+            {
+                // force the last value to be 1.0
+                SkFixed curr;
+                if (i == colorCount - 1)
+                    curr = SK_Fixed1;
+                else
+                {
+                    curr = SkScalarToFixed(pos[i]);
+                    // pin curr withing range
+                    if (curr < 0)
+                        curr = 0;
+                    else if (curr > SK_Fixed1)
+                        curr = SK_Fixed1;
+                }
+                recs[i].fPos = curr;
+                if (curr > prev)
+                    recs[i].fScale = (1 << 24) / (curr - prev);
+                else
+                    recs[i].fScale = 0; // ignore this segment
+                // get ready for the next value
+                prev = curr;
+            }
+        }
+        else    // assume even distribution
+        {
+            SkFixed dp = SK_Fixed1 / (colorCount - 1);
+            SkFixed p = dp;
+            SkFixed scale = (colorCount - 1) << 8;  // (1 << 24) / dp
+            for (int i = 1; i < colorCount; i++)
+            {
+                recs[i].fPos   = p;
+                recs[i].fScale = scale;
+                p += dp;
+            }
+        }
+    }
+}
+
+Gradient_Shader::Gradient_Shader(SkFlattenableReadBuffer& buffer) :
+    INHERITED(buffer)
+{
+    fCacheAlpha = 256;
+
+    fMapper = static_cast<SkUnitMapper*>(buffer.readFlattenable());
+
+    fCache16 = fCache16Storage = NULL;
+    fCache32 = fCache32Storage = NULL;
+
+    int colorCount = fColorCount = buffer.readU16();
+    if (colorCount > kColorStorageCount)
+        fOrigColors = (SkColor*)sk_malloc_throw((sizeof(SkColor) + sizeof(SkPMColor) + sizeof(Rec)) * colorCount);
+    else
+        fOrigColors = fStorage;
+    buffer.read(fOrigColors, colorCount * sizeof(SkColor));
+    fARGB32 = fOrigColors + colorCount;
+
+    fTileMode = (TileMode)buffer.readU8();
+    fTileProc = gTileProcs[fTileMode];
+    fRecs = (Rec*)(fARGB32 + colorCount);
+    if (colorCount > 2) {
+        Rec* recs = fRecs;
+        recs[0].fPos = 0;
+        for (int i = 1; i < colorCount; i++) {
+            recs[i].fPos = buffer.readS32();
+            recs[i].fScale = buffer.readU32();
+        }
+    }
+    buffer.read(&fPtsToUnit, sizeof(SkMatrix));
+}
+
+Gradient_Shader::~Gradient_Shader()
+{
+    if (fCache16Storage)
+        sk_free(fCache16Storage);
+    if (fCache32Storage)
+        sk_free(fCache32Storage);
+    if (fOrigColors != fStorage)
+        sk_free(fOrigColors);
+    fMapper->safeUnref();
+}
+
+void Gradient_Shader::flatten(SkFlattenableWriteBuffer& buffer)
+{
+    this->INHERITED::flatten(buffer);
+    buffer.writeFlattenable(fMapper);
+    buffer.write16(fColorCount);
+    buffer.writeMul4(fOrigColors, fColorCount * sizeof(SkColor));
+    buffer.write8(fTileMode);
+    if (fColorCount > 2) {
+        Rec* recs = fRecs;
+        for (int i = 1; i < fColorCount; i++) {
+            buffer.write32(recs[i].fPos);
+            buffer.write32(recs[i].fScale);
+        }
+    }
+    buffer.writeMul4(&fPtsToUnit, sizeof(SkMatrix));
+}
+
+bool Gradient_Shader::setContext(const SkBitmap& device,
+                                 const SkPaint& paint,
+                                 const SkMatrix& matrix)
+{
+    if (!this->INHERITED::setContext(device, paint, matrix))
+        return false;
+
+    const SkMatrix& inverse = this->getTotalInverse();
+
+    if (!fDstToIndex.setConcat(fPtsToUnit, inverse)) {
+        return false;
+    }
+
+    fDstToIndexProc = fDstToIndex.getMapXYProc();
+    fDstToIndexClass = (uint8_t)SkShader::ComputeMatrixClass(fDstToIndex);
+
+    // now convert our colors in to PMColors
+    unsigned paintAlpha = this->getPaintAlpha();
+    unsigned colorAlpha = 0xFF;
+
+    for (unsigned i = 0; i < fColorCount; i++) {
+        SkColor src = fOrigColors[i];
+        unsigned sa = SkColorGetA(src);
+        colorAlpha &= sa;
+        
+        // now modulate it by the paint for our resulting ARGB32 array
+        sa = SkMulDiv255Round(sa, paintAlpha);
+        fARGB32[i] = SkPreMultiplyARGB(sa, SkColorGetR(src), SkColorGetG(src),
+                                       SkColorGetB(src));
+    }
+
+    fFlags = this->INHERITED::getFlags();
+    if ((colorAlpha & paintAlpha) == 0xFF) {
+        fFlags |= kOpaqueAlpha_Flag;
+    }
+    // we can do span16 as long as our individual colors are opaque,
+    // regardless of the paint's alpha
+    if (0xFF == colorAlpha) {
+        fFlags |= kHasSpan16_Flag;
+    }
+
+    // if the new alpha differs from the previous time we were called, inval our cache
+    // this will trigger the cache to be rebuilt.
+    // we don't care about the first time, since the cache ptrs will already be NULL
+    if (fCacheAlpha != paintAlpha) {
+        fCache16 = NULL;                // inval the cache
+        fCache32 = NULL;                // inval the cache
+        fCacheAlpha = paintAlpha;       // record the new alpha
+    }
+    return true;
+}
+
+static inline int blend8(int a, int b, int scale)
+{
+    SkASSERT(a == SkToU8(a));
+    SkASSERT(b == SkToU8(b));
+    SkASSERT(scale >= 0 && scale <= 256);
+
+    return a + ((b - a) * scale >> 8);
+}
+
+static inline uint32_t dot8_blend_packed32(uint32_t s0, uint32_t s1, int blend)
+{
+#if 0
+    int a = blend8(SkGetPackedA32(s0), SkGetPackedA32(s1), blend);
+    int r = blend8(SkGetPackedR32(s0), SkGetPackedR32(s1), blend);
+    int g = blend8(SkGetPackedG32(s0), SkGetPackedG32(s1), blend);
+    int b = blend8(SkGetPackedB32(s0), SkGetPackedB32(s1), blend);
+
+    return SkPackARGB32(a, r, g, b);
+#else
+    int otherBlend = 256 - blend;
+
+#if 0
+    U32 t0 = (((s0 & 0xFF00FF) * blend + (s1 & 0xFF00FF) * otherBlend) >> 8) & 0xFF00FF;
+    U32 t1 = (((s0 >> 8) & 0xFF00FF) * blend + ((s1 >> 8) & 0xFF00FF) * otherBlend) & 0xFF00FF00;
+    SkASSERT((t0 & t1) == 0);
+    return t0 | t1;
+#else
+    return  ((((s0 & 0xFF00FF) * blend + (s1 & 0xFF00FF) * otherBlend) >> 8) & 0xFF00FF) |
+            ((((s0 >> 8) & 0xFF00FF) * blend + ((s1 >> 8) & 0xFF00FF) * otherBlend) & 0xFF00FF00);
+#endif
+
+#endif
+}
+
+#define Fixed_To_Dot8(x)        (((x) + 0x80) >> 8)
+
+/** We take the original colors, not our premultiplied PMColors, since we can build a 16bit table
+    as long as the original colors are opaque, even if the paint specifies a non-opaque alpha.
+*/
+static void build_16bit_cache(uint16_t cache[], SkColor c0, SkColor c1, int count)
+{
+    SkASSERT(count > 1);
+    SkASSERT(SkColorGetA(c0) == 0xFF);
+    SkASSERT(SkColorGetA(c1) == 0xFF);
+
+    SkFixed r = SkColorGetR(c0);
+    SkFixed g = SkColorGetG(c0);
+    SkFixed b = SkColorGetB(c0);
+
+    SkFixed dr = SkIntToFixed(SkColorGetR(c1) - r) / (count - 1);
+    SkFixed dg = SkIntToFixed(SkColorGetG(c1) - g) / (count - 1);
+    SkFixed db = SkIntToFixed(SkColorGetB(c1) - b) / (count - 1);
+
+    r = SkIntToFixed(r) + 0x8000;
+    g = SkIntToFixed(g) + 0x8000;
+    b = SkIntToFixed(b) + 0x8000;
+
+    do {
+        unsigned rr = r >> 16;
+        unsigned gg = g >> 16;
+        unsigned bb = b >> 16;
+        cache[0] = SkPackRGB16(SkR32ToR16(rr), SkG32ToG16(gg), SkB32ToB16(bb));
+        cache[64] = SkDitherPack888ToRGB16(rr, gg, bb);
+        cache += 1;
+        r += dr;
+        g += dg;
+        b += db;
+    } while (--count != 0);
+}
+
+static void build_32bit_cache(SkPMColor cache[], SkPMColor c0, SkPMColor c1, int count)
+{
+    SkASSERT(count > 1);
+
+    SkFixed a = SkGetPackedA32(c0);
+    SkFixed r = SkGetPackedR32(c0);
+    SkFixed g = SkGetPackedG32(c0);
+    SkFixed b = SkGetPackedB32(c0);
+
+    SkFixed da = SkIntToFixed(SkGetPackedA32(c1) - a) / (count - 1);
+    SkFixed dr = SkIntToFixed(SkGetPackedR32(c1) - r) / (count - 1);
+    SkFixed dg = SkIntToFixed(SkGetPackedG32(c1) - g) / (count - 1);
+    SkFixed db = SkIntToFixed(SkGetPackedB32(c1) - b) / (count - 1);
+
+    a = SkIntToFixed(a) + 0x8000;
+    r = SkIntToFixed(r) + 0x8000;
+    g = SkIntToFixed(g) + 0x8000;
+    b = SkIntToFixed(b) + 0x8000;
+
+    do {
+        *cache++ = SkPackARGB32(a >> 16, r >> 16, g >> 16, b >> 16);
+        a += da;
+        r += dr;
+        g += dg;
+        b += db;
+    } while (--count != 0);
+}
+
+static inline int SkFixedToFFFF(SkFixed x)
+{
+    SkASSERT((unsigned)x <= SK_Fixed1);
+    return x - (x >> 16);
+}
+
+static inline U16CPU dot6to16(unsigned x)
+{
+    SkASSERT(x < 64);
+    return (x << 10) | (x << 4) | (x >> 2);
+}
+
+const uint16_t* Gradient_Shader::getCache16()
+{
+    if (fCache16 == NULL)
+    {
+        if (fCache16Storage == NULL) // set the storage and our working ptr
+#ifdef TEST_GRADIENT_DITHER
+            fCache16Storage = (uint16_t*)sk_malloc_throw(sizeof(uint16_t) * kCache16Count * 2);
+#else
+            fCache16Storage = (uint16_t*)sk_malloc_throw(sizeof(uint16_t) * kCache16Count);
+#endif
+        fCache16 = fCache16Storage;
+        if (fColorCount == 2)
+            build_16bit_cache(fCache16, fOrigColors[0], fOrigColors[1], kCache16Count);
+        else
+        {
+            Rec* rec = fRecs;
+            int prevIndex = 0;
+            for (unsigned i = 1; i < fColorCount; i++)
+            {
+                int nextIndex = SkFixedToFFFF(rec[i].fPos) >> (16 - kCache16Bits);
+                SkASSERT(nextIndex < kCache16Count);
+
+                if (nextIndex > prevIndex)
+                    build_16bit_cache(fCache16 + prevIndex, fOrigColors[i-1], fOrigColors[i], nextIndex - prevIndex + 1);
+                prevIndex = nextIndex;
+            }
+            SkASSERT(prevIndex == kCache16Count - 1);
+        }
+
+        if (fMapper)
+        {
+#ifdef TEST_GRADIENT_DITHER
+            fCache16Storage = (uint16_t*)sk_malloc_throw(sizeof(uint16_t) * kCache16Count * 2);
+#else
+            fCache16Storage = (uint16_t*)sk_malloc_throw(sizeof(uint16_t) * kCache16Count);
+#endif
+            uint16_t* linear = fCache16;         // just computed linear data
+            uint16_t* mapped = fCache16Storage;  // storage for mapped data
+            SkUnitMapper* map = fMapper;
+            for (int i = 0; i < 64; i++)
+            {
+                int index = map->mapUnit16(dot6to16(i)) >> 10;
+                mapped[i] = linear[index];
+#ifdef TEST_GRADIENT_DITHER
+                mapped[i + 64] = linear[index + 64];
+#endif
+            }
+            sk_free(fCache16);
+            fCache16 = fCache16Storage;
+        }
+    }
+    return fCache16;
+}
+
+const SkPMColor* Gradient_Shader::getCache32()
+{
+    if (fCache32 == NULL)
+    {
+        if (fCache32Storage == NULL) // set the storage and our working ptr
+            fCache32Storage = (SkPMColor*)sk_malloc_throw(sizeof(SkPMColor) * kCache32Count);
+
+        fCache32 = fCache32Storage;
+        if (fColorCount == 2)
+            build_32bit_cache(fCache32, fARGB32[0], fARGB32[1], kCache32Count);
+        else
+        {
+            Rec* rec = fRecs;
+            int prevIndex = 0;
+            for (unsigned i = 1; i < fColorCount; i++)
+            {
+                int nextIndex = SkFixedToFFFF(rec[i].fPos) >> (16 - kCache32Bits);
+                SkASSERT(nextIndex < kCache32Count);
+
+                if (nextIndex > prevIndex)
+                    build_32bit_cache(fCache32 + prevIndex, fARGB32[i-1], fARGB32[i], nextIndex - prevIndex + 1);
+                prevIndex = nextIndex;
+            }
+            SkASSERT(prevIndex == kCache32Count - 1);
+        }
+
+        if (fMapper)
+        {
+            fCache32Storage = (SkPMColor*)sk_malloc_throw(sizeof(SkPMColor) * kCache32Count);
+            SkPMColor* linear = fCache32;           // just computed linear data
+            SkPMColor* mapped = fCache32Storage;    // storage for mapped data
+            SkUnitMapper* map = fMapper;
+            for (int i = 0; i < 256; i++)
+                mapped[i] = linear[map->mapUnit16((i << 8) | i) >> 8];
+            sk_free(fCache32);
+            fCache32 = fCache32Storage;
+        }
+    }
+    return fCache32;
+}
+
+///////////////////////////////////////////////////////////////////////////
+
+static void pts_to_unit_matrix(const SkPoint pts[2], SkMatrix* matrix)
+{
+    SkVector    vec = pts[1] - pts[0];
+    SkScalar    mag = vec.length();
+    SkScalar    inv = mag ? SkScalarInvert(mag) : 0;
+
+    vec.scale(inv);
+    matrix->setSinCos(-vec.fY, vec.fX, pts[0].fX, pts[0].fY);
+    matrix->postTranslate(-pts[0].fX, -pts[0].fY);
+    matrix->postScale(inv, inv);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+class Linear_Gradient : public Gradient_Shader {
+public:
+    Linear_Gradient(const SkPoint pts[2],
+                    const SkColor colors[], const SkScalar pos[], int colorCount,
+                    SkShader::TileMode mode, SkUnitMapper* mapper)
+        : Gradient_Shader(colors, pos, colorCount, mode, mapper)
+    {
+        pts_to_unit_matrix(pts, &fPtsToUnit);
+    }
+    virtual void shadeSpan(int x, int y, SkPMColor dstC[], int count);
+    virtual void shadeSpan16(int x, int y, uint16_t dstC[], int count);
+    virtual bool asABitmap(SkBitmap*, SkMatrix*, TileMode*);
+
+    static SkFlattenable* CreateProc(SkFlattenableReadBuffer& buffer) { 
+        return SkNEW_ARGS(Linear_Gradient, (buffer));
+    }
+
+protected:
+    Linear_Gradient(SkFlattenableReadBuffer& buffer) : Gradient_Shader(buffer) {};
+    virtual Factory getFactory() { return CreateProc; }
+
+private:
+    typedef Gradient_Shader INHERITED;
+};
+
+//  Return true if fx, fx+dx, fx+2*dx, ... is always in range
+static bool no_need_for_clamp(int fx, int dx, int count)
+{
+    SkASSERT(count > 0);
+    return (unsigned)((fx | (fx + (count - 1) * dx)) >> 8) <= 0xFF;
+}
+
+void Linear_Gradient::shadeSpan(int x, int y, SkPMColor dstC[], int count)
+{
+    SkASSERT(count > 0);
+
+    SkPoint             srcPt;
+    SkMatrix::MapXYProc dstProc = fDstToIndexProc;
+    TileProc            proc = fTileProc;
+    const SkPMColor*    cache = this->getCache32();
+
+    if (fDstToIndexClass != kPerspective_MatrixClass)
+    {
+        dstProc(fDstToIndex, SkIntToScalar(x), SkIntToScalar(y), &srcPt);
+        SkFixed dx, fx = SkScalarToFixed(srcPt.fX);
+
+        if (fDstToIndexClass == kFixedStepInX_MatrixClass)
+        {
+            SkFixed dxStorage[1];
+            (void)fDstToIndex.fixedStepInX(SkIntToScalar(y), dxStorage, NULL);
+            dx = dxStorage[0];
+        }
+        else
+        {
+            SkASSERT(fDstToIndexClass == kLinear_MatrixClass);
+            dx = SkScalarToFixed(fDstToIndex.getScaleX());
+        }
+
+        if (SkFixedNearlyZero(dx))  // we're a vertical gradient, so no change in a span
+        {
+            unsigned fi = proc(fx);
+            SkASSERT(fi <= 0xFFFF);
+            sk_memset32(dstC, cache[fi >> (16 - kCache32Bits)], count);
+        }
+        else if (proc == clamp_tileproc)
+        {
+#if 0
+            if (no_need_for_clamp(fx, dx, count))
+            {
+                unsigned fi;
+                while ((count -= 4) >= 0)
+                {
+                    fi = fx >> 8; SkASSERT(fi <= 0xFF); fx += dx; *dstC++ = cache[fi];
+                    fi = fx >> 8; SkASSERT(fi <= 0xFF); fx += dx; *dstC++ = cache[fi];
+                    fi = fx >> 8; SkASSERT(fi <= 0xFF); fx += dx; *dstC++ = cache[fi];
+                    fi = fx >> 8; SkASSERT(fi <= 0xFF); fx += dx; *dstC++ = cache[fi];
+                }
+                SkASSERT(count <= -1 && count >= -4);
+                count += 4;
+                while (--count >= 0)
+                {
+                    fi = fx >> 8;
+                    SkASSERT(fi <= 0xFF);
+                    fx += dx;
+                    *dstC++ = cache[fi];
+                }
+            }
+            else
+#endif
+                do {
+                    unsigned fi = SkClampMax(fx >> 8, 0xFF);
+                    SkASSERT(fi <= 0xFF);
+                    fx += dx;
+                    *dstC++ = cache[fi];
+                } while (--count != 0);
+        }
+        else if (proc == mirror_tileproc)
+        {
+            do {
+                unsigned fi = mirror_8bits(fx >> 8);
+                SkASSERT(fi <= 0xFF);
+                fx += dx;
+                *dstC++ = cache[fi];
+            } while (--count != 0);
+        }
+        else
+        {
+            SkASSERT(proc == repeat_tileproc);
+            do {
+                unsigned fi = repeat_8bits(fx >> 8);
+                SkASSERT(fi <= 0xFF);
+                fx += dx;
+                *dstC++ = cache[fi];
+            } while (--count != 0);
+        }
+    }
+    else
+    {
+        SkScalar    dstX = SkIntToScalar(x);
+        SkScalar    dstY = SkIntToScalar(y);
+        do {
+            dstProc(fDstToIndex, dstX, dstY, &srcPt);
+            unsigned fi = proc(SkScalarToFixed(srcPt.fX));
+            SkASSERT(fi <= 0xFFFF);
+            *dstC++ = cache[fi >> (16 - kCache32Bits)];
+            dstX += SK_Scalar1;
+        } while (--count != 0);
+    }
+}
+
+bool Linear_Gradient::asABitmap(SkBitmap* bitmap, SkMatrix* matrix,
+                                TileMode xy[]) {
+    if (bitmap) {
+        bitmap->setConfig(SkBitmap::kARGB_8888_Config, kCache32Count, 1);
+        bitmap->allocPixels();  // share with shader???
+        memcpy(bitmap->getPixels(), this->getCache32(), kCache32Count * 4);
+    }
+    if (matrix) {
+        matrix->setScale(SkIntToScalar(kCache32Count), SK_Scalar1);
+        matrix->preConcat(fPtsToUnit);
+    }
+    if (xy) {
+        xy[0] = fTileMode;
+        xy[1] = kClamp_TileMode;
+    }
+    return true;
+}
+
+#ifdef TEST_GRADIENT_DITHER
+static void dither_memset16(uint16_t dst[], uint16_t value, uint16_t other, int count)
+{
+    if ((unsigned)dst & 2)
+    {
+        *dst++ = value;
+        count -= 1;
+        SkTSwap(value, other);
+    }
+
+    sk_memset32((uint32_t*)dst, (value << 16) | other, count >> 1);
+    
+    if (count & 1)
+        dst[count - 1] = value;
+}
+#endif
+
+void Linear_Gradient::shadeSpan16(int x, int y, uint16_t dstC[], int count)
+{
+    SkASSERT(count > 0);
+
+    SkPoint             srcPt;
+    SkMatrix::MapXYProc dstProc = fDstToIndexProc;
+    TileProc            proc = fTileProc;
+    const uint16_t*     cache = this->getCache16();
+#ifdef TEST_GRADIENT_DITHER
+    int                 toggle = ((x ^ y) & 1) << kCache16Bits;
+#endif
+
+    if (fDstToIndexClass != kPerspective_MatrixClass)
+    {
+        dstProc(fDstToIndex, SkIntToScalar(x), SkIntToScalar(y), &srcPt);
+        SkFixed dx, fx = SkScalarToFixed(srcPt.fX);
+
+        if (fDstToIndexClass == kFixedStepInX_MatrixClass)
+        {
+            SkFixed dxStorage[1];
+            (void)fDstToIndex.fixedStepInX(SkIntToScalar(y), dxStorage, NULL);
+            dx = dxStorage[0];
+        }
+        else
+        {
+            SkASSERT(fDstToIndexClass == kLinear_MatrixClass);
+            dx = SkScalarToFixed(fDstToIndex.getScaleX());
+        }
+
+        if (SkFixedNearlyZero(dx))  // we're a vertical gradient, so no change in a span
+        {
+            unsigned fi = proc(fx) >> 10;
+            SkASSERT(fi <= 63);
+#ifdef TEST_GRADIENT_DITHER
+            dither_memset16(dstC, cache[toggle + fi], cache[(toggle ^ (1 << kCache16Bits)) + fi], count);
+#else
+            sk_memset16(dstC, cache[fi], count);
+#endif
+        }
+        else if (proc == clamp_tileproc)
+        {
+            do {
+                unsigned fi = SkClampMax(fx >> 10, 63);
+                SkASSERT(fi <= 63);
+                fx += dx;
+#ifdef TEST_GRADIENT_DITHER
+                *dstC++ = cache[toggle + fi];
+                toggle ^= (1 << kCache16Bits);
+#else
+                *dstC++ = cache[fi];
+#endif
+            } while (--count != 0);
+        }
+        else if (proc == mirror_tileproc)
+        {
+            do {
+                unsigned fi = mirror_6bits(fx >> 10);
+                SkASSERT(fi <= 0x3F);
+                fx += dx;
+#ifdef TEST_GRADIENT_DITHER
+                *dstC++ = cache[toggle + fi];
+                toggle ^= (1 << kCache16Bits);
+#else
+                *dstC++ = cache[fi];
+#endif
+            } while (--count != 0);
+        }
+        else
+        {
+            SkASSERT(proc == repeat_tileproc);
+            do {
+                unsigned fi = repeat_6bits(fx >> 10);
+                SkASSERT(fi <= 0x3F);
+                fx += dx;
+#ifdef TEST_GRADIENT_DITHER
+                *dstC++ = cache[toggle + fi];
+                toggle ^= (1 << kCache16Bits);
+#else
+                *dstC++ = cache[fi];
+#endif
+            } while (--count != 0);
+        }
+    }
+    else
+    {
+        SkScalar    dstX = SkIntToScalar(x);
+        SkScalar    dstY = SkIntToScalar(y);
+        do {
+            dstProc(fDstToIndex, dstX, dstY, &srcPt);
+            unsigned fi = proc(SkScalarToFixed(srcPt.fX));
+            SkASSERT(fi <= 0xFFFF);
+
+            int index = fi >> (16 - kCache16Bits);
+#ifdef TEST_GRADIENT_DITHER
+            *dstC++ = cache[toggle + index];
+            toggle ^= (1 << kCache16Bits);
+#else
+            *dstC++ = cache[index];
+#endif
+
+            dstX += SK_Scalar1;
+        } while (--count != 0);
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+#define kSQRT_TABLE_BITS    11
+#define kSQRT_TABLE_SIZE    (1 << kSQRT_TABLE_BITS)
+
+#include "SkRadialGradient_Table.h"
+
+#if defined(SK_BUILD_FOR_WIN32) && defined(SK_DEBUG)
+
+#include <stdio.h>
+
+void SkRadialGradient_BuildTable()
+{
+    // build it 0..127 x 0..127, so we use 2^15 - 1 in the numerator for our "fixed" table
+
+    FILE* file = ::fopen("SkRadialGradient_Table.h", "w");
+    SkASSERT(file);
+    ::fprintf(file, "static const uint8_t gSqrt8Table[] = {\n");
+
+    for (int i = 0; i < kSQRT_TABLE_SIZE; i++)
+    {
+        if ((i & 15) == 0)
+            ::fprintf(file, "\t");
+
+        uint8_t value = SkToU8(SkFixedSqrt(i * SK_Fixed1 / kSQRT_TABLE_SIZE) >> 8);
+
+        ::fprintf(file, "0x%02X", value);
+        if (i < kSQRT_TABLE_SIZE-1)
+            ::fprintf(file, ", ");
+        if ((i & 15) == 15)
+            ::fprintf(file, "\n");
+    }
+    ::fprintf(file, "};\n");
+    ::fclose(file);
+}
+
+#endif
+
+
+static void rad_to_unit_matrix(const SkPoint& center, SkScalar radius, SkMatrix* matrix)
+{
+    SkScalar    inv = SkScalarInvert(radius);
+
+    matrix->setTranslate(-center.fX, -center.fY);
+    matrix->postScale(inv, inv);
+}
+
+class Radial_Gradient : public Gradient_Shader {
+public:
+    Radial_Gradient(const SkPoint& center, SkScalar radius,
+                    const SkColor colors[], const SkScalar pos[], int colorCount,
+                    SkShader::TileMode mode, SkUnitMapper* mapper)
+        : Gradient_Shader(colors, pos, colorCount, mode, mapper)
+    {
+        // make sure our table is insync with our current #define for kSQRT_TABLE_SIZE
+        SkASSERT(sizeof(gSqrt8Table) == kSQRT_TABLE_SIZE);
+
+        rad_to_unit_matrix(center, radius, &fPtsToUnit);
+    }
+    virtual void shadeSpan(int x, int y, SkPMColor dstC[], int count)
+    {
+        SkASSERT(count > 0);
+
+        SkPoint             srcPt;
+        SkMatrix::MapXYProc dstProc = fDstToIndexProc;
+        TileProc            proc = fTileProc;
+        const SkPMColor*    cache = this->getCache32();
+
+        if (fDstToIndexClass != kPerspective_MatrixClass)
+        {
+            dstProc(fDstToIndex, SkIntToScalar(x), SkIntToScalar(y), &srcPt);
+            SkFixed dx, fx = SkScalarToFixed(srcPt.fX);
+            SkFixed dy, fy = SkScalarToFixed(srcPt.fY);
+
+            if (fDstToIndexClass == kFixedStepInX_MatrixClass)
+            {
+                SkFixed storage[2];
+                (void)fDstToIndex.fixedStepInX(SkIntToScalar(y), &storage[0], &storage[1]);
+                dx = storage[0];
+                dy = storage[1];
+            }
+            else
+            {
+                SkASSERT(fDstToIndexClass == kLinear_MatrixClass);
+                dx = SkScalarToFixed(fDstToIndex.getScaleX());
+                dy = SkScalarToFixed(fDstToIndex.getSkewY());
+            }
+
+            if (proc == clamp_tileproc)
+            {
+                const uint8_t* sqrt_table = gSqrt8Table;
+                fx >>= 1;
+                dx >>= 1;
+                fy >>= 1;
+                dy >>= 1;
+                do {
+                    unsigned xx = SkPin32(fx, -0xFFFF >> 1, 0xFFFF >> 1);
+                    unsigned fi = SkPin32(fy, -0xFFFF >> 1, 0xFFFF >> 1);
+                    fi = (xx * xx + fi * fi) >> (14 + 16 - kSQRT_TABLE_BITS);
+                    fi = SkFastMin32(fi, 0xFFFF >> (16 - kSQRT_TABLE_BITS));
+                    *dstC++ = cache[sqrt_table[fi] >> (8 - kCache32Bits)];
+                    fx += dx;
+                    fy += dy;
+                } while (--count != 0);
+            }
+            else if (proc == mirror_tileproc)
+            {
+                do {
+                    SkFixed dist = SkFixedSqrt(SkFixedSquare(fx) + SkFixedSquare(fy));
+                    unsigned fi = mirror_tileproc(dist);
+                    SkASSERT(fi <= 0xFFFF);
+                    *dstC++ = cache[fi >> (16 - kCache32Bits)];
+                    fx += dx;
+                    fy += dy;
+                } while (--count != 0);
+            }
+            else
+            {
+                SkASSERT(proc == repeat_tileproc);
+                do {
+                    SkFixed dist = SkFixedSqrt(SkFixedSquare(fx) + SkFixedSquare(fy));
+                    unsigned fi = repeat_tileproc(dist);
+                    SkASSERT(fi <= 0xFFFF);
+                    *dstC++ = cache[fi >> (16 - kCache32Bits)];
+                    fx += dx;
+                    fy += dy;
+                } while (--count != 0);
+            }
+        }
+        else    // perspective case
+        {
+            SkScalar dstX = SkIntToScalar(x);
+            SkScalar dstY = SkIntToScalar(y);
+            do {
+                dstProc(fDstToIndex, dstX, dstY, &srcPt);
+                unsigned fi = proc(SkScalarToFixed(srcPt.length()));
+                SkASSERT(fi <= 0xFFFF);
+                *dstC++ = cache[fi >> (16 - kCache32Bits)];
+                dstX += SK_Scalar1;
+            } while (--count != 0);
+        }
+    }
+    virtual void shadeSpan16(int x, int y, uint16_t dstC[], int count)
+    {
+        SkASSERT(count > 0);
+
+        SkPoint             srcPt;
+        SkMatrix::MapXYProc dstProc = fDstToIndexProc;
+        TileProc            proc = fTileProc;
+        const uint16_t*     cache = this->getCache16();
+#ifdef TEST_GRADIENT_DITHER
+        int                 toggle = ((x ^ y) & 1) << kCache16Bits;
+#endif
+
+        if (fDstToIndexClass != kPerspective_MatrixClass)
+        {
+            dstProc(fDstToIndex, SkIntToScalar(x), SkIntToScalar(y), &srcPt);
+            SkFixed dx, fx = SkScalarToFixed(srcPt.fX);
+            SkFixed dy, fy = SkScalarToFixed(srcPt.fY);
+
+            if (fDstToIndexClass == kFixedStepInX_MatrixClass)
+            {
+                SkFixed storage[2];
+                (void)fDstToIndex.fixedStepInX(SkIntToScalar(y), &storage[0], &storage[1]);
+                dx = storage[0];
+                dy = storage[1];
+            }
+            else
+            {
+                SkASSERT(fDstToIndexClass == kLinear_MatrixClass);
+                dx = SkScalarToFixed(fDstToIndex.getScaleX());
+                dy = SkScalarToFixed(fDstToIndex.getSkewY());
+            }
+
+            if (proc == clamp_tileproc)
+            {
+                const uint8_t* sqrt_table = gSqrt8Table;
+
+                /* knock these down so we can pin against +- 0x7FFF, which is an immediate load,
+                    rather than 0xFFFF which is slower. This is a compromise, since it reduces our
+                    precision, but that appears to be visually OK. If we decide this is OK for
+                    all of our cases, we could (it seems) put this scale-down into fDstToIndex,
+                    to avoid having to do these extra shifts each time.
+                */
+                fx >>= 1;
+                dx >>= 1;
+                fy >>= 1;
+                dy >>= 1;
+                if (dy == 0)    // might perform this check for the other modes, but the win will be a smaller % of the total
+                {
+                    fy = SkPin32(fy, -0xFFFF >> 1, 0xFFFF >> 1);
+                    fy *= fy;
+                    do {
+                        unsigned xx = SkPin32(fx, -0xFFFF >> 1, 0xFFFF >> 1);
+                        unsigned fi = (xx * xx + fy) >> (14 + 16 - kSQRT_TABLE_BITS);
+                        fi = SkFastMin32(fi, 0xFFFF >> (16 - kSQRT_TABLE_BITS));
+                        fx += dx;
+#ifdef TEST_GRADIENT_DITHER
+                        *dstC++ = cache[toggle + (sqrt_table[fi] >> (8 - kCache16Bits))];
+                        toggle ^= (1 << kCache16Bits);
+#else
+                        *dstC++ = cache[sqrt_table[fi] >> (8 - kCache16Bits)];
+#endif
+                    } while (--count != 0);
+                }
+                else
+                {
+                    do {
+                        unsigned xx = SkPin32(fx, -0xFFFF >> 1, 0xFFFF >> 1);
+                        unsigned fi = SkPin32(fy, -0xFFFF >> 1, 0xFFFF >> 1);
+                        fi = (xx * xx + fi * fi) >> (14 + 16 - kSQRT_TABLE_BITS);
+                        fi = SkFastMin32(fi, 0xFFFF >> (16 - kSQRT_TABLE_BITS));
+                        fx += dx;
+                        fy += dy;
+#ifdef TEST_GRADIENT_DITHER
+                        *dstC++ = cache[toggle + (sqrt_table[fi] >> (8 - kCache16Bits))];
+                        toggle ^= (1 << kCache16Bits);
+#else
+                        *dstC++ = cache[sqrt_table[fi] >> (8 - kCache16Bits)];
+#endif
+                    } while (--count != 0);
+                }
+            }
+            else if (proc == mirror_tileproc)
+            {
+                do {
+                    SkFixed dist = SkFixedSqrt(SkFixedSquare(fx) + SkFixedSquare(fy));
+                    unsigned fi = mirror_tileproc(dist);
+                    SkASSERT(fi <= 0xFFFF);
+                    fx += dx;
+                    fy += dy;
+#ifdef TEST_GRADIENT_DITHER
+                    *dstC++ = cache[toggle + (fi >> (16 - kCache16Bits))];
+                    toggle ^= (1 << kCache16Bits);
+#else
+                    *dstC++ = cache[fi >> (16 - kCache16Bits)];
+#endif
+                } while (--count != 0);
+            }
+            else
+            {
+                SkASSERT(proc == repeat_tileproc);
+                do {
+                    SkFixed dist = SkFixedSqrt(SkFixedSquare(fx) + SkFixedSquare(fy));
+                    unsigned fi = repeat_tileproc(dist);
+                    SkASSERT(fi <= 0xFFFF);
+                    fx += dx;
+                    fy += dy;
+#ifdef TEST_GRADIENT_DITHER
+                    *dstC++ = cache[toggle + (fi >> (16 - kCache16Bits))];
+                    toggle ^= (1 << kCache16Bits);
+#else
+                    *dstC++ = cache[fi >> (16 - kCache16Bits)];
+#endif
+                } while (--count != 0);
+            }
+        }
+        else    // perspective case
+        {
+            SkScalar dstX = SkIntToScalar(x);
+            SkScalar dstY = SkIntToScalar(y);
+            do {
+                dstProc(fDstToIndex, dstX, dstY, &srcPt);
+                unsigned fi = proc(SkScalarToFixed(srcPt.length()));
+                SkASSERT(fi <= 0xFFFF);
+
+                int index = fi >> (16 - kCache16Bits);
+#ifdef TEST_GRADIENT_DITHER
+                *dstC++ = cache[toggle + index];
+                toggle ^= (1 << kCache16Bits);
+#else
+                *dstC++ = cache[index];
+#endif
+
+                dstX += SK_Scalar1;
+            } while (--count != 0);
+        }
+    }
+
+    static SkFlattenable* CreateProc(SkFlattenableReadBuffer& buffer) { 
+        return SkNEW_ARGS(Radial_Gradient, (buffer));
+    }
+
+protected:
+    Radial_Gradient(SkFlattenableReadBuffer& buffer) : Gradient_Shader(buffer) {};
+    virtual Factory getFactory() { return CreateProc; }
+
+private:
+    typedef Gradient_Shader INHERITED;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+class Sweep_Gradient : public Gradient_Shader {
+public:
+    Sweep_Gradient(SkScalar cx, SkScalar cy, const SkColor colors[],
+                   const SkScalar pos[], int count, SkUnitMapper* mapper)
+    : Gradient_Shader(colors, pos, count, SkShader::kClamp_TileMode, mapper)
+    {
+        fPtsToUnit.setTranslate(-cx, -cy);
+    }
+    virtual void shadeSpan(int x, int y, SkPMColor dstC[], int count);
+    virtual void shadeSpan16(int x, int y, uint16_t dstC[], int count);
+    
+    static SkFlattenable* CreateProc(SkFlattenableReadBuffer& buffer) {
+        return SkNEW_ARGS(Sweep_Gradient, (buffer));
+    }
+
+protected:
+    Sweep_Gradient(SkFlattenableReadBuffer& buffer) : Gradient_Shader(buffer) {}
+
+    virtual Factory getFactory() { return CreateProc; }
+
+private:
+    typedef Gradient_Shader INHERITED;
+};
+
+#ifdef COMPUTE_SWEEP_TABLE
+#define PI  3.14159265
+static bool gSweepTableReady;
+static uint8_t gSweepTable[65];
+
+/*  Our table stores precomputed values for atan: [0...1] -> [0..PI/4]
+    We scale the results to [0..32]
+*/
+static const uint8_t* build_sweep_table()
+{
+    if (!gSweepTableReady)
+    {
+        const int N = 65;
+        const double DENOM = N - 1;
+        
+        for (int i = 0; i < N; i++)
+        {
+            double arg = i / DENOM;
+            double v = atan(arg);
+            int iv = (int)round(v * DENOM * 2 / PI);
+//            printf("[%d] atan(%g) = %g %d\n", i, arg, v, iv);
+            printf("%d, ", iv);
+            gSweepTable[i] = iv;
+        }
+        gSweepTableReady = true;
+    }
+    return gSweepTable;
+}
+#else
+static const uint8_t gSweepTable[] = {
+    0, 1, 1, 2, 3, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 9,
+    10, 11, 11, 12, 12, 13, 13, 14, 15, 15, 16, 16, 17, 17, 18, 18,
+    19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 25, 26,
+    26, 27, 27, 27, 28, 28, 29, 29, 29, 30, 30, 30, 31, 31, 31, 32,
+    32
+};
+static const uint8_t* build_sweep_table() { return gSweepTable; }
+#endif
+
+// divide numer/denom, with a bias of 6bits. Assumes numer <= denom
+// and denom != 0. Since our table is 6bits big (+1), this is a nice fit.
+// Same as (but faster than) SkFixedDiv(numer, denom) >> 10
+
+//unsigned div_64(int numer, int denom);
+static unsigned div_64(int numer, int denom)
+{
+    SkASSERT(numer <= denom);
+    SkASSERT(numer > 0);
+    SkASSERT(denom > 0);
+        
+    int nbits = SkCLZ(numer);
+    int dbits = SkCLZ(denom);
+    int bits = 6 - nbits + dbits;
+    SkASSERT(bits <= 6);
+    
+    if (bits < 0)   // detect underflow
+        return 0;
+
+    denom <<= dbits - 1;
+    numer <<= nbits - 1;
+
+    unsigned result = 0;
+
+    // do the first one
+    if ((numer -= denom) >= 0)
+        result = 1;
+    else
+        numer += denom;
+    
+    // Now fall into our switch statement if there are more bits to compute
+    if (bits > 0)
+    {
+        // make room for the rest of the answer bits
+        result <<= bits;
+        switch (bits) {
+        case 6:
+            if ((numer = (numer << 1) - denom) >= 0)
+                result |= 32;
+            else
+                numer += denom;
+        case 5:
+            if ((numer = (numer << 1) - denom) >= 0)
+                result |= 16;
+            else
+                numer += denom;
+        case 4:
+            if ((numer = (numer << 1) - denom) >= 0)
+                result |= 8;
+            else
+                numer += denom;
+        case 3:
+            if ((numer = (numer << 1) - denom) >= 0)
+                result |= 4;
+            else
+                numer += denom;
+        case 2:
+            if ((numer = (numer << 1) - denom) >= 0)
+                result |= 2;
+            else
+                numer += denom;
+        case 1:
+        default:    // not strictly need, but makes GCC make better ARM code
+            if ((numer = (numer << 1) - denom) >= 0)
+                result |= 1;
+            else
+                numer += denom;
+        }
+    }
+    return result;
+}
+
+// Given x,y in the first quadrant, return 0..63 for the angle [0..90]
+static unsigned atan_0_90(SkFixed y, SkFixed x)
+{
+#ifdef SK_DEBUG
+    {
+        static bool gOnce;
+        if (!gOnce)
+        {
+            gOnce = true;
+            SkASSERT(div_64(55, 55) == 64);
+            SkASSERT(div_64(128, 256) == 32);
+            SkASSERT(div_64(2326528, 4685824) == 31);
+            SkASSERT(div_64(753664, 5210112) == 9);
+            SkASSERT(div_64(229376, 4882432) == 3);
+            SkASSERT(div_64(2, 64) == 2);
+            SkASSERT(div_64(1, 64) == 1);
+            // test that we handle underflow correctly
+            SkASSERT(div_64(12345, 0x54321234) == 0);
+        }
+    }
+#endif
+
+    SkASSERT(y > 0 && x > 0);
+    const uint8_t* table = build_sweep_table();
+
+    unsigned result;
+    bool swap = (x < y);
+    if (swap)
+    {
+        // first part of the atan(v) = PI/2 - atan(1/v) identity
+        // since our div_64 and table want v <= 1, where v = y/x
+        SkTSwap<SkFixed>(x, y);
+    }
+
+    result = div_64(y, x);
+    
+#ifdef SK_DEBUG
+    {
+        unsigned result2 = SkDivBits(y, x, 6);
+        SkASSERT(result2 == result ||
+                 (result == 1 && result2 == 0));
+    }
+#endif
+
+    SkASSERT(result < SK_ARRAY_COUNT(gSweepTable));
+    result = table[result];
+
+    if (swap)
+    {
+        // complete the atan(v) = PI/2 - atan(1/v) identity
+        result = 64 - result;
+        // pin to 63
+        result -= result >> 6;
+    }
+
+    SkASSERT(result <= 63);
+    return result;
+}
+
+//  returns angle in a circle [0..2PI) -> [0..255]
+static unsigned SkATan2_255(SkFixed y, SkFixed x)
+{
+    if (x == 0)
+    {
+        if (y == 0)
+            return 0;
+        return y < 0 ? 192 : 64;
+    }
+    if (y == 0)
+        return x < 0 ? 128 : 0;
+    
+    /*  Find the right quadrant for x,y
+        Since atan_0_90 only handles the first quadrant, we rotate x,y
+        appropriately before calling it, and then add the right amount
+        to account for the real quadrant.
+        quadrant 0 : add 0                  | x > 0 && y > 0
+        quadrant 1 : add 64 (90 degrees)    | x < 0 && y > 0
+        quadrant 2 : add 128 (180 degrees)  | x < 0 && y < 0
+        quadrant 3 : add 192 (270 degrees)  | x > 0 && y < 0
+        
+        map x<0 to (1 << 6)
+        map y<0 to (3 << 6)
+        add = map_x ^ map_y
+    */
+    int xsign = x >> 31;
+    int ysign = y >> 31;
+    int add = ((-xsign) ^ (ysign & 3)) << 6;
+
+#ifdef SK_DEBUG
+    if (0 == add)
+        SkASSERT(x > 0 && y > 0);
+    else if (64 == add)
+        SkASSERT(x < 0 && y > 0);
+    else if (128 == add)
+        SkASSERT(x < 0 && y < 0);
+    else if (192 == add)
+        SkASSERT(x > 0 && y < 0);
+    else
+        SkASSERT(!"bad value for add");
+#endif
+    
+    /*  This ^ trick makes x, y positive, and the swap<> handles quadrants
+        where we need to rotate x,y by 90 or -90
+    */
+    x = (x ^ xsign) - xsign;
+    y = (y ^ ysign) - ysign;
+    if (add & 64)               // quads 1 or 3 need to swap x,y
+        SkTSwap<SkFixed>(x, y);
+
+    unsigned result = add + atan_0_90(y, x);
+    SkASSERT(result < 256);
+    return result;
+}
+
+void Sweep_Gradient::shadeSpan(int x, int y, SkPMColor dstC[], int count)
+{
+    SkMatrix::MapXYProc proc = fDstToIndexProc;
+    const SkMatrix&     matrix = fDstToIndex;
+    const SkPMColor*    cache = this->getCache32();
+    SkPoint             srcPt;
+    
+    if (fDstToIndexClass != kPerspective_MatrixClass)
+    {
+        proc(matrix, SkIntToScalar(x) + SK_ScalarHalf,
+                     SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
+        SkFixed dx, fx = SkScalarToFixed(srcPt.fX);
+        SkFixed dy, fy = SkScalarToFixed(srcPt.fY);
+        
+        if (fDstToIndexClass == kFixedStepInX_MatrixClass)
+        {
+            SkFixed storage[2];
+            (void)matrix.fixedStepInX(SkIntToScalar(y) + SK_ScalarHalf,
+                                      &storage[0], &storage[1]);
+            dx = storage[0];
+            dy = storage[1];
+        }
+        else
+        {
+            SkASSERT(fDstToIndexClass == kLinear_MatrixClass);
+            dx = SkScalarToFixed(matrix.getScaleX());
+            dy = SkScalarToFixed(matrix.getSkewY());
+        }
+        
+        for (; count > 0; --count)
+        {
+            *dstC++ = cache[SkATan2_255(fy, fx)];
+            fx += dx;
+            fy += dy;
+        }
+    }
+    else    // perspective case
+    {
+        for (int stop = x + count; x < stop; x++)
+        {
+            proc(matrix, SkIntToScalar(x) + SK_ScalarHalf,
+                 SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
+            
+            int index = SkATan2_255(SkScalarToFixed(srcPt.fY),
+                                    SkScalarToFixed(srcPt.fX));
+            *dstC++ = cache[index];
+        }
+    }
+}
+
+void Sweep_Gradient::shadeSpan16(int x, int y, uint16_t dstC[], int count)
+{
+    SkMatrix::MapXYProc proc = fDstToIndexProc;
+    const SkMatrix&     matrix = fDstToIndex;
+    const uint16_t*     cache = this->getCache16();
+    int                 toggle = ((x ^ y) & 1) << kCache16Bits;
+    SkPoint             srcPt;
+
+    if (fDstToIndexClass != kPerspective_MatrixClass)
+    {
+        proc(matrix, SkIntToScalar(x) + SK_ScalarHalf,
+                     SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
+        SkFixed dx, fx = SkScalarToFixed(srcPt.fX);
+        SkFixed dy, fy = SkScalarToFixed(srcPt.fY);
+        
+        if (fDstToIndexClass == kFixedStepInX_MatrixClass)
+        {
+            SkFixed storage[2];
+            (void)matrix.fixedStepInX(SkIntToScalar(y) + SK_ScalarHalf,
+                                      &storage[0], &storage[1]);
+            dx = storage[0];
+            dy = storage[1];
+        }
+        else
+        {
+            SkASSERT(fDstToIndexClass == kLinear_MatrixClass);
+            dx = SkScalarToFixed(matrix.getScaleX());
+            dy = SkScalarToFixed(matrix.getSkewY());
+        }
+        
+        for (; count > 0; --count)
+        {
+            int index = SkATan2_255(fy, fx) >> (8 - kCache16Bits);
+            *dstC++ = cache[toggle + index];
+            toggle ^= (1 << kCache16Bits);
+            fx += dx;
+            fy += dy;
+        }
+    }
+    else    // perspective case
+    {
+        for (int stop = x + count; x < stop; x++)
+        {
+            proc(matrix, SkIntToScalar(x) + SK_ScalarHalf,
+                         SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
+            
+            int index = SkATan2_255(SkScalarToFixed(srcPt.fY),
+                                    SkScalarToFixed(srcPt.fX));
+            index >>= (8 - kCache16Bits);
+            *dstC++ = cache[toggle + index];
+            toggle ^= (1 << kCache16Bits);
+        }
+    }
+}
+
+///////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////
+
+// assumes colors is SkColor* and pos is SkScalar*
+#define EXPAND_1_COLOR(count)               \
+    SkColor tmp[2];                         \
+    do {                                    \
+        if (1 == count) {                   \
+            tmp[0] = tmp[1] = colors[0];    \
+            colors = tmp;                   \
+            pos = NULL;                     \
+            count = 2;                      \
+        }                                   \
+    } while (0)
+
+SkShader* SkGradientShader::CreateLinear(   const SkPoint pts[2],
+                                            const SkColor colors[], const SkScalar pos[], int colorCount,
+                                            SkShader::TileMode mode, SkUnitMapper* mapper)
+{
+    if (NULL == pts || NULL == colors || colorCount < 1) {
+        return NULL;
+    }
+    EXPAND_1_COLOR(colorCount);
+
+    return SkNEW_ARGS(Linear_Gradient, (pts, colors, pos, colorCount, mode, mapper));
+}
+
+SkShader* SkGradientShader::CreateRadial(   const SkPoint& center, SkScalar radius,
+                                            const SkColor colors[], const SkScalar pos[], int colorCount,
+                                            SkShader::TileMode mode, SkUnitMapper* mapper)
+{
+    if (radius <= 0 || NULL == colors || colorCount < 1) {
+        return NULL;
+    }
+    EXPAND_1_COLOR(colorCount);
+
+    return SkNEW_ARGS(Radial_Gradient, (center, radius, colors, pos, colorCount, mode, mapper));
+}
+
+SkShader* SkGradientShader::CreateSweep(SkScalar cx, SkScalar cy,
+                                        const SkColor colors[],
+                                        const SkScalar pos[],
+                                        int count, SkUnitMapper* mapper)
+{
+    if (NULL == colors || count < 1) {
+        return NULL;
+    }
+    EXPAND_1_COLOR(count);
+
+    return SkNEW_ARGS(Sweep_Gradient, (cx, cy, colors, pos, count, mapper));
+}
+
+static SkFlattenable::Registrar gLinearGradientReg("Linear_Gradient",
+                                                   Linear_Gradient::CreateProc);
+
+static SkFlattenable::Registrar gRadialGradientReg("Radial_Gradient",
+                                                   Radial_Gradient::CreateProc);
+
+static SkFlattenable::Registrar gSweepGradientReg("Sweep_Gradient",
+                                                   Sweep_Gradient::CreateProc);
+