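Simplify/unify the SkXfermode xferproc API

Replace the separate PM4fProc1/PM4fProcN and U64Proc1/U64ProcN proc types
with a single D32Proc and a single D64Proc. Each proc now takes the
SkXfermode* directly instead of a PM4fState/U64State struct, and always
takes a pointer to the source color(s); the single-source variant is
requested with kSrcIsSingle_D32Flag / kSrcIsSingle_D64Flag when calling
GetD32Proc / GetD64Proc.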

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1721223002

Review URL: https://codereview.chromium.org/1721223002
diff --git a/bench/Xfer4fBench.cpp b/bench/Xfer4fBench.cpp
index f2bcedc..bc234a7 100644
--- a/bench/Xfer4fBench.cpp
+++ b/bench/Xfer4fBench.cpp
@@ -15,20 +15,22 @@
 #define INNER_LOOPS 1000
 
 // Benchmark that draws non-AA rects or AA text with an SkXfermode::Mode.
-class Xfer4fBench : public Benchmark {
+class XferD32Bench : public Benchmark {
 public:
-    Xfer4fBench(SkXfermode::Mode mode, const char name[], bool doN, uint32_t flags)
+    XferD32Bench(SkXfermode::Mode mode, const char name[], bool doN, uint32_t flags)
         : fDoN(doN)
         , fFlags(flags & ~USE_AA)
     {
-        fProc1 = SkXfermode::GetPM4fProc1(mode, fFlags);
-        fProcN = SkXfermode::GetPM4fProcN(mode, fFlags);
+        fXfer.reset(SkXfermode::Create(mode));
+        
+        fProc1 = SkXfermode::GetD32Proc(fXfer, fFlags | SkXfermode::kSrcIsSingle_D32Flag);
+        fProcN = SkXfermode::GetD32Proc(fXfer, fFlags);
         fName.printf("xfer4f_%s_%s_%c_%s_%s",
                      name,
                      (flags & USE_AA) ? "aa" : "bw",
                      fDoN ? 'N' : '1',
-                     (flags & SkXfermode::kSrcIsOpaque_PM4fFlag) ? "opaque" : "alpha",
-                     (flags & SkXfermode::kDstIsSRGB_PM4fFlag) ? "srgb" : "linear");
+                     (flags & SkXfermode::kSrcIsOpaque_D32Flag) ? "opaque" : "alpha",
+                     (flags & SkXfermode::kDstIsSRGB_D32Flag) ? "srgb" : "linear");
 
         for (int i = 0; i < N; ++i) {
             fSrc[i] = {{ 1, 1, 1, 1 }};
@@ -49,24 +51,23 @@
     const char* onGetName() override { return fName.c_str(); }
 
     void onDraw(int loops, SkCanvas*) override {
-        const SkXfermode::PM4fState state{ nullptr, fFlags };
-
         for (int i = 0; i < loops * INNER_LOOPS; ++i) {
             if (fDoN) {
-                fProcN(state, fDst, fSrc, N, fAA);
+                fProcN(fXfer, fDst, fSrc, N, fAA);
             } else {
-                fProc1(state, fDst, fSrc[0], N, fAA);
+                fProc1(fXfer, fDst, fSrc, N, fAA);
             }
         }
     }
 
 private:
-    SkString        fName;
-    SkXfermode::PM4fProc1 fProc1;
-    SkXfermode::PM4fProcN fProcN;
-    const SkAlpha*  fAA;
-    bool            fDoN;
-    uint32_t        fFlags;
+    SkAutoTUnref<SkXfermode> fXfer;
+    SkString             fName;
+    SkXfermode::D32Proc  fProc1;
+    SkXfermode::D32Proc  fProcN;
+    const SkAlpha*       fAA;
+    bool                 fDoN;
+    uint32_t             fFlags;
 
     enum {
         N = 1000,
@@ -79,26 +80,26 @@
 };
 
 #define F00 0
-#define F01 (SkXfermode::kSrcIsOpaque_PM4fFlag)
-#define F10 (SkXfermode::kDstIsSRGB_PM4fFlag)
-#define F11 (SkXfermode::kSrcIsOpaque_PM4fFlag | SkXfermode::kDstIsSRGB_PM4fFlag)
+#define F01 (SkXfermode::kSrcIsOpaque_D32Flag)
+#define F10 (SkXfermode::kDstIsSRGB_D32Flag)
+#define F11 (SkXfermode::kSrcIsOpaque_D32Flag | SkXfermode::kDstIsSRGB_D32Flag)
 
-DEF_BENCH( return new Xfer4fBench(SkXfermode::kSrcOver_Mode, "srcover", false, F10); )
-DEF_BENCH( return new Xfer4fBench(SkXfermode::kSrcOver_Mode, "srcover", false, F00); )
-DEF_BENCH( return new Xfer4fBench(SkXfermode::kSrcOver_Mode, "srcover", false, F11); )
-DEF_BENCH( return new Xfer4fBench(SkXfermode::kSrcOver_Mode, "srcover", false, F01); )
+DEF_BENCH( return new XferD32Bench(SkXfermode::kSrcOver_Mode, "srcover", false, F10); )
+DEF_BENCH( return new XferD32Bench(SkXfermode::kSrcOver_Mode, "srcover", false, F00); )
+DEF_BENCH( return new XferD32Bench(SkXfermode::kSrcOver_Mode, "srcover", false, F11); )
+DEF_BENCH( return new XferD32Bench(SkXfermode::kSrcOver_Mode, "srcover", false, F01); )
 
-DEF_BENCH( return new Xfer4fBench(SkXfermode::kSrcOver_Mode, "srcover", true,  F10); )
-DEF_BENCH( return new Xfer4fBench(SkXfermode::kSrcOver_Mode, "srcover", true,  F00); )
-DEF_BENCH( return new Xfer4fBench(SkXfermode::kSrcOver_Mode, "srcover", true,  F11); )
-DEF_BENCH( return new Xfer4fBench(SkXfermode::kSrcOver_Mode, "srcover", true,  F01); )
+DEF_BENCH( return new XferD32Bench(SkXfermode::kSrcOver_Mode, "srcover", true,  F10); )
+DEF_BENCH( return new XferD32Bench(SkXfermode::kSrcOver_Mode, "srcover", true,  F00); )
+DEF_BENCH( return new XferD32Bench(SkXfermode::kSrcOver_Mode, "srcover", true,  F11); )
+DEF_BENCH( return new XferD32Bench(SkXfermode::kSrcOver_Mode, "srcover", true,  F01); )
 
-DEF_BENCH( return new Xfer4fBench(SkXfermode::kSrcOver_Mode, "srcover", false, F10 | USE_AA); )
-DEF_BENCH( return new Xfer4fBench(SkXfermode::kSrcOver_Mode, "srcover", false, F00 | USE_AA); )
-DEF_BENCH( return new Xfer4fBench(SkXfermode::kSrcOver_Mode, "srcover", false, F11 | USE_AA); )
-DEF_BENCH( return new Xfer4fBench(SkXfermode::kSrcOver_Mode, "srcover", false, F01 | USE_AA); )
+DEF_BENCH( return new XferD32Bench(SkXfermode::kSrcOver_Mode, "srcover", false, F10 | USE_AA); )
+DEF_BENCH( return new XferD32Bench(SkXfermode::kSrcOver_Mode, "srcover", false, F00 | USE_AA); )
+DEF_BENCH( return new XferD32Bench(SkXfermode::kSrcOver_Mode, "srcover", false, F11 | USE_AA); )
+DEF_BENCH( return new XferD32Bench(SkXfermode::kSrcOver_Mode, "srcover", false, F01 | USE_AA); )
 
-DEF_BENCH( return new Xfer4fBench(SkXfermode::kSrcOver_Mode, "srcover", true,  F10 | USE_AA); )
-DEF_BENCH( return new Xfer4fBench(SkXfermode::kSrcOver_Mode, "srcover", true,  F00 | USE_AA); )
-DEF_BENCH( return new Xfer4fBench(SkXfermode::kSrcOver_Mode, "srcover", true,  F11 | USE_AA); )
-DEF_BENCH( return new Xfer4fBench(SkXfermode::kSrcOver_Mode, "srcover", true,  F01 | USE_AA); )
+DEF_BENCH( return new XferD32Bench(SkXfermode::kSrcOver_Mode, "srcover", true,  F10 | USE_AA); )
+DEF_BENCH( return new XferD32Bench(SkXfermode::kSrcOver_Mode, "srcover", true,  F00 | USE_AA); )
+DEF_BENCH( return new XferD32Bench(SkXfermode::kSrcOver_Mode, "srcover", true,  F11 | USE_AA); )
+DEF_BENCH( return new XferD32Bench(SkXfermode::kSrcOver_Mode, "srcover", true,  F01 | USE_AA); )
diff --git a/bench/XferU64Bench.cpp b/bench/XferU64Bench.cpp
index b81bc1e..a57ba60 100644
--- a/bench/XferU64Bench.cpp
+++ b/bench/XferU64Bench.cpp
@@ -15,22 +15,22 @@
 #define INNER_LOOPS 1000
 
 // Benchmark that draws non-AA rects or AA text with an SkXfermode::Mode.
-class XferU64Bench : public Benchmark {
+class XferD64Bench : public Benchmark {
 public:
-    XferU64Bench(SkXfermode::Mode mode, const char name[], bool doN, uint32_t flags)
+    XferD64Bench(SkXfermode::Mode mode, const char name[], bool doN, uint32_t flags)
         : fDoN(doN)
         , fFlags(flags & ~USE_AA)
     {
         fXfer.reset(SkXfermode::Create(mode));
 
-        fProc1 = SkXfermode::GetU64Proc1(mode, fFlags);
-        fProcN = SkXfermode::GetU64ProcN(mode, fFlags);
+        fProc1 = SkXfermode::GetD64Proc(fXfer, fFlags | SkXfermode::kSrcIsSingle_D64Flag);
+        fProcN = SkXfermode::GetD64Proc(fXfer, fFlags);
         fName.printf("xferu64_%s_%s_%c_%s_%s",
                      name,
                      (flags & USE_AA) ? "aa" : "bw",
                      fDoN ? 'N' : '1',
-                     (flags & SkXfermode::kSrcIsOpaque_U64Flag) ? "opaque" : "alpha",
-                     (flags & SkXfermode::kDstIsFloat16_U64Flag) ? "f16" : "u16");
+                     (flags & SkXfermode::kSrcIsOpaque_D64Flag) ? "opaque" : "alpha",
+                     (flags & SkXfermode::kDstIsFloat16_D64Flag) ? "f16" : "u16");
 
         for (int i = 0; i < N; ++i) {
             fSrc[i] = {{ 1, 1, 1, 1 }};
@@ -51,25 +51,23 @@
     const char* onGetName() override { return fName.c_str(); }
 
     void onDraw(int loops, SkCanvas*) override {
-        const SkXfermode::U64State state{ fXfer, fFlags };
-
         for (int i = 0; i < loops * INNER_LOOPS; ++i) {
             if (fDoN) {
-                fProcN(state, fDst, fSrc, N, fAA);
+                fProcN(fXfer, fDst, fSrc, N, fAA);
             } else {
-                fProc1(state, fDst, fSrc[0], N, fAA);
+                fProc1(fXfer, fDst, fSrc, N, fAA);
             }
         }
     }
 
 private:
     SkAutoTUnref<SkXfermode> fXfer;
-    SkString             fName;
-    SkXfermode::U64Proc1 fProc1;
-    SkXfermode::U64ProcN fProcN;
-    const SkAlpha*  fAA;
-    bool            fDoN;
-    uint32_t        fFlags;
+    SkString            fName;
+    SkXfermode::D64Proc fProc1;
+    SkXfermode::D64Proc fProcN;
+    const SkAlpha*      fAA;
+    bool                fDoN;
+    uint32_t            fFlags;
 
     enum {
         N = 1000,
@@ -82,29 +80,29 @@
 };
 
 #define F00 0
-#define F01 (SkXfermode::kSrcIsOpaque_U64Flag)
-#define F10 (SkXfermode::kDstIsFloat16_U64Flag)
-#define F11 (SkXfermode::kDstIsFloat16_U64Flag | SkXfermode::kSrcIsOpaque_U64Flag)
+#define F01 (SkXfermode::kSrcIsOpaque_D64Flag)
+#define F10 (SkXfermode::kDstIsFloat16_D64Flag)
+#define F11 (SkXfermode::kDstIsFloat16_D64Flag | SkXfermode::kSrcIsOpaque_D64Flag)
 
 #define MODE    SkXfermode::kSrcOver_Mode
 #define NAME    "srcover"
 
-DEF_BENCH( return new XferU64Bench(MODE, NAME, true,  F10 | USE_AA); )
-DEF_BENCH( return new XferU64Bench(MODE, NAME, true,  F11 | USE_AA); )
-DEF_BENCH( return new XferU64Bench(MODE, NAME, true,  F10); )
-DEF_BENCH( return new XferU64Bench(MODE, NAME, true,  F11); )
+DEF_BENCH( return new XferD64Bench(MODE, NAME, true,  F10 | USE_AA); )
+DEF_BENCH( return new XferD64Bench(MODE, NAME, true,  F11 | USE_AA); )
+DEF_BENCH( return new XferD64Bench(MODE, NAME, true,  F10); )
+DEF_BENCH( return new XferD64Bench(MODE, NAME, true,  F11); )
 
-DEF_BENCH( return new XferU64Bench(MODE, NAME, true,  F00 | USE_AA); )
-DEF_BENCH( return new XferU64Bench(MODE, NAME, true,  F01 | USE_AA); )
-DEF_BENCH( return new XferU64Bench(MODE, NAME, true,  F00); )
-DEF_BENCH( return new XferU64Bench(MODE, NAME, true,  F01); )
+DEF_BENCH( return new XferD64Bench(MODE, NAME, true,  F00 | USE_AA); )
+DEF_BENCH( return new XferD64Bench(MODE, NAME, true,  F01 | USE_AA); )
+DEF_BENCH( return new XferD64Bench(MODE, NAME, true,  F00); )
+DEF_BENCH( return new XferD64Bench(MODE, NAME, true,  F01); )
 
-DEF_BENCH( return new XferU64Bench(MODE, NAME, false, F10 | USE_AA); )
-DEF_BENCH( return new XferU64Bench(MODE, NAME, false, F11 | USE_AA); )
-DEF_BENCH( return new XferU64Bench(MODE, NAME, false, F10); )
-DEF_BENCH( return new XferU64Bench(MODE, NAME, false, F11); )
+DEF_BENCH( return new XferD64Bench(MODE, NAME, false, F10 | USE_AA); )
+DEF_BENCH( return new XferD64Bench(MODE, NAME, false, F11 | USE_AA); )
+DEF_BENCH( return new XferD64Bench(MODE, NAME, false, F10); )
+DEF_BENCH( return new XferD64Bench(MODE, NAME, false, F11); )
 
-DEF_BENCH( return new XferU64Bench(MODE, NAME, false, F00 | USE_AA); )
-DEF_BENCH( return new XferU64Bench(MODE, NAME, false, F01 | USE_AA); )
-DEF_BENCH( return new XferU64Bench(MODE, NAME, false, F00); )
-DEF_BENCH( return new XferU64Bench(MODE, NAME, false, F01); )
+DEF_BENCH( return new XferD64Bench(MODE, NAME, false, F00 | USE_AA); )
+DEF_BENCH( return new XferD64Bench(MODE, NAME, false, F01 | USE_AA); )
+DEF_BENCH( return new XferD64Bench(MODE, NAME, false, F00); )
+DEF_BENCH( return new XferD64Bench(MODE, NAME, false, F01); )
diff --git a/gm/SkLinearBitmapPipelineGM.cpp b/gm/SkLinearBitmapPipelineGM.cpp
index c8fcfc9..7406817 100644
--- a/gm/SkLinearBitmapPipelineGM.cpp
+++ b/gm/SkLinearBitmapPipelineGM.cpp
@@ -115,8 +115,7 @@
     //if (kSRGB_SkColorProfileType == profile) {
         //flags |= SkXfermode::kDstIsSRGB_PM4fFlag;
     //}
-    const SkXfermode::PM4fState state { nullptr, flags };
-    auto procN = SkXfermode::GetPM4fProcN(SkXfermode::kSrcOver_Mode, flags);
+    auto procN = SkXfermode::GetD32Proc(nullptr, flags);
 
     SkLinearBitmapPipeline pipeline{
             inv, filterQuality,
@@ -124,7 +123,7 @@
 
     for (int y = 0; y < ir.height(); y++) {
         pipeline.shadeSpan4f(0, y, dstBits, ir.width());
-        procN(state, pmdst.writable_addr32(0, y), dstBits, ir.width(), nullptr);
+        procN(nullptr, pmdst.writable_addr32(0, y), dstBits, ir.width(), nullptr);
     }
 
     delete [] dstBits;
diff --git a/gm/xfer4f.cpp b/gm/xfer4f.cpp
deleted file mode 100644
index dc7196d..0000000
--- a/gm/xfer4f.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright 2016 Google Inc.
- *
- * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file.
- */
-
-#include "gm.h"
-#include "SkCanvas.h"
-#include "SkImageInfo.h"
-#include "SkXfermode.h"
-#include "SkPM4f.h"
-
-static void draw_rect(SkCanvas* canvas, const SkRect& r, SkColor c, SkColorProfileType profile,
-                      const SkAlpha aa[]) {
-    const SkIRect ir = r.round();
-
-    SkBitmap bm;
-    bm.allocN32Pixels(ir.width(), ir.height());
-    bm.eraseColor(0xFFFFFFFF);
-    SkPixmap pm;
-    bm.peekPixels(&pm);
-
-    uint32_t flags = 0;
-    if (SkColorGetA(c) == 0xFF) {
-        flags |= SkXfermode::kSrcIsOpaque_PM4fFlag;
-    }
-    if (kSRGB_SkColorProfileType == profile) {
-        flags |= SkXfermode::kDstIsSRGB_PM4fFlag;
-    }
-
-    const SkXfermode::PM4fState state { nullptr, flags };
-
-    const SkPM4f src = SkColor4f::FromColor(c).premul();
-    auto proc1 = SkXfermode::GetPM4fProc1(SkXfermode::kSrcOver_Mode, flags);
-    for (int y = 0; y < ir.height()/2; ++y) {
-        proc1(state, pm.writable_addr32(0, y), src, ir.width(), aa);
-    }
-
-    SkPM4f srcRow[1000];
-    for (int i = 0; i < ir.width(); ++i) {
-        srcRow[i] = src;
-    }
-    auto procN = SkXfermode::GetPM4fProcN(SkXfermode::kSrcOver_Mode, flags);
-    // +1 to skip a row, so we can see the boundary between proc1 and procN
-    for (int y = ir.height()/2 + 1; y < ir.height(); ++y) {
-        procN(state, pm.writable_addr32(0, y), srcRow, ir.width(), aa);
-    }
-
-    canvas->drawBitmap(bm, r.left(), r.top(), nullptr);
-}
-
-/*
- *  Test SkXfer4fProcs directly for src-over, comparing them to current SkColor blits.
- */
-DEF_SIMPLE_GM(xfer4f_srcover, canvas, 580, 760) {
-    const int IW = 50;
-    const SkScalar W = IW;
-    const SkScalar H = 100;
-
-    const int profiles[] = {
-        -1,
-        kLinear_SkColorProfileType,
-        kSRGB_SkColorProfileType,
-    };
-    const SkColor colors[] = {
-        SK_ColorBLACK, SK_ColorRED, SK_ColorGREEN, SK_ColorBLUE,
-        0x88000000, 0x88FF0000, 0x8800FF00, 0x880000FF
-    };
-    
-    uint8_t aa_scanline[IW];
-    for (int i = 0; i < IW; ++i) {
-        aa_scanline[i] = i * 255 / (IW - 1);
-    }
-    uint8_t const* aa_table[] = { nullptr, aa_scanline };
-
-    SkBitmap mask;
-    mask.installPixels(SkImageInfo::MakeA8(IW, 1), aa_scanline, IW);
-
-    canvas->translate(20, 20);
-
-    const SkRect r = SkRect::MakeWH(W, H);
-    for (const uint8_t* aa : aa_table) {
-        canvas->save();
-        for (auto profile : profiles) {
-            canvas->save();
-            for (SkColor c : colors) {
-                if (profile < 0) {
-                    SkPaint p;
-                    p.setColor(c);
-                    if (aa) {
-                        canvas->drawBitmapRect(mask, r, &p);
-                    } else {
-                        canvas->drawRect(r, p);
-                    }
-                } else {
-                    draw_rect(canvas, r, c, (SkColorProfileType)profile, aa);
-                }
-                canvas->translate(W + 20, 0);
-            }
-            canvas->restore();
-            canvas->translate(0, H + 20);
-        }
-        canvas->restore();
-        canvas->translate(0, (H + 20) * SK_ARRAY_COUNT(profiles) + 20);
-    }
-}
diff --git a/include/core/SkXfermode.h b/include/core/SkXfermode.h
index 04630e7..2c4da5f 100644
--- a/include/core/SkXfermode.h
+++ b/include/core/SkXfermode.h
@@ -220,38 +220,23 @@
     SK_DECLARE_FLATTENABLE_REGISTRAR_GROUP()
     SK_DEFINE_FLATTENABLE_TYPE(SkXfermode)
 
-    enum PM4fFlags {
-        kSrcIsOpaque_PM4fFlag  = 1 << 0,
-        kDstIsSRGB_PM4fFlag    = 1 << 1,
+    enum D32Flags {
+        kSrcIsOpaque_D32Flag  = 1 << 0,
+        kSrcIsSingle_D32Flag  = 1 << 1,
+        kDstIsSRGB_D32Flag    = 1 << 2,
     };
-    struct PM4fState {
-        const SkXfermode* fXfer;
-        uint32_t          fFlags;
-    };
-    typedef void (*PM4fProc1)(const PM4fState&, uint32_t dst[], const SkPM4f& src,
-                              int count, const SkAlpha coverage[]);
-    typedef void (*PM4fProcN)(const PM4fState&, uint32_t dst[], const SkPM4f src[],
-                              int count, const SkAlpha coverage[]);
+    typedef void (*D32Proc)(const SkXfermode*, uint32_t dst[], const SkPM4f src[],
+                            int count, const SkAlpha coverage[]);
+    static D32Proc GetD32Proc(SkXfermode*, uint32_t flags);
 
-    static PM4fProc1 GetPM4fProc1(Mode, uint32_t flags);
-    static PM4fProcN GetPM4fProcN(Mode, uint32_t flags);
-    virtual PM4fProc1 getPM4fProc1(uint32_t flags) const;
-    virtual PM4fProcN getPM4fProcN(uint32_t flags) const;
-
-    enum U64Flags {
-        kSrcIsOpaque_U64Flag  = 1 << 0,
-        kDstIsFloat16_U64Flag = 1 << 1, // else U16 bit components
+    enum D64Flags {
+        kSrcIsOpaque_D64Flag  = 1 << 0,
+        kSrcIsSingle_D64Flag  = 1 << 1,
+        kDstIsFloat16_D64Flag = 1 << 2, // else U16 bit components
     };
-    struct U64State {
-        const SkXfermode* fXfer;
-        uint32_t          fFlags;
-    };
-    typedef void (*U64Proc1)(const U64State&, uint64_t dst[], const SkPM4f& src, int count,
-                             const SkAlpha coverage[]);
-    typedef void (*U64ProcN)(const U64State&, uint64_t dst[], const SkPM4f src[], int count,
-                             const SkAlpha coverage[]);
-    static U64Proc1 GetU64Proc1(Mode, uint32_t flags);
-    static U64ProcN GetU64ProcN(Mode, uint32_t flags);
+    typedef void (*D64Proc)(const SkXfermode*, uint64_t dst[], const SkPM4f src[], int count,
+                            const SkAlpha coverage[]);
+    static D64Proc GetD64Proc(SkXfermode*, uint32_t flags);
 
     enum LCDFlags {
         kSrcIsOpaque_LCDFlag    = 1 << 0,   // else src(s) may have alpha < 1
@@ -275,6 +260,9 @@
     */
     virtual SkPMColor xferColor(SkPMColor src, SkPMColor dst) const;
 
+    virtual D32Proc onGetD32Proc(uint32_t flags) const;
+    virtual D64Proc onGetD64Proc(uint32_t flags) const;
+
 private:
     enum {
         kModeCount = kLastMode + 1
diff --git a/src/core/SkBitmapProcShader.h b/src/core/SkBitmapProcShader.h
index 05c5955..b897c6e 100644
--- a/src/core/SkBitmapProcShader.h
+++ b/src/core/SkBitmapProcShader.h
@@ -75,7 +75,7 @@
 // an Sk3DBlitter in SkDraw.cpp
 // Note that some contexts may contain other contexts (e.g. for compose shaders), but we've not
 // yet found a situation where the size below isn't big enough.
-typedef SkSmallAllocator<3, 1280> SkTBlitterAllocator;
+typedef SkSmallAllocator<3, 1500> SkTBlitterAllocator;
 
 // If alloc is non-nullptr, it will be used to allocate the returned SkShader, and MUST outlive
 // the SkShader.
diff --git a/src/core/SkBlitter_PM4f.cpp b/src/core/SkBlitter_PM4f.cpp
index 24fc4a3..179aec3 100644
--- a/src/core/SkBlitter_PM4f.cpp
+++ b/src/core/SkBlitter_PM4f.cpp
@@ -27,7 +27,8 @@
     void blitH(int x, int y, int width) override {
         SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width());
         
-        fState.fProc1(fState, State::WritableAddr(fDevice, x, y), fState.fPM4f, width, nullptr);
+        fState.fProc1(fState.fXfer, State::WritableAddr(fDevice, x, y),
+                      &fState.fPM4f, width, nullptr);
     }
 
     void blitV(int x, int y, int height, SkAlpha alpha) override {
@@ -37,7 +38,7 @@
         size_t                 deviceRB = fDevice.rowBytes();
         
         for (int i = 0; i < height; ++i) {
-            fState.fProc1(fState, device, fState.fPM4f, 1, &alpha);
+            fState.fProc1(fState.fXfer, device, &fState.fPM4f, 1, &alpha);
             device = (typename State::DstType*)((char*)device + deviceRB);
         }
     }
@@ -50,7 +51,7 @@
         size_t        deviceRB = fDevice.rowBytes();
         
         do {
-            fState.fProc1(fState, device, fState.fPM4f, width, nullptr);
+            fState.fProc1(fState.fXfer, device, &fState.fPM4f, width, nullptr);
             y += 1;
             device = (typename State::DstType*)((char*)device + deviceRB);
         } while (--height > 0);
@@ -67,10 +68,10 @@
             int aa = *antialias;
             if (aa) {
                 if (aa == 255) {
-                    fState.fProc1(fState, device, fState.fPM4f, count, nullptr);
+                    fState.fProc1(fState.fXfer, device, &fState.fPM4f, count, nullptr);
                 } else {
                     for (int i = 0; i < count; ++i) {
-                        fState.fProc1(fState, &device[i], fState.fPM4f, 1, antialias);
+                        fState.fProc1(fState.fXfer, &device[i], &fState.fPM4f, 1, antialias);
                     }
                 }
             }
@@ -124,7 +125,7 @@
         const size_t maskRB = mask.fRowBytes;
         
         for (int i = 0; i < height; ++i) {
-            fState.fProc1(fState, device, fState.fPM4f, width, maskRow);
+            fState.fProc1(fState.fXfer, device, &fState.fPM4f, width, maskRow);
             device = (typename State::DstType*)((char*)device + dstRB);
             maskRow += maskRB;
         }
@@ -146,7 +147,7 @@
         
         typename State::DstType* device = State::WritableAddr(fDevice, x, y);
         fShaderContext->shadeSpan4f(x, y, fState.fBuffer, width);
-        fState.fProcN(fState, device, fState.fBuffer, width, nullptr);
+        fState.fProcN(fState.fXfer, device, fState.fBuffer, width, nullptr);
     }
 
     void blitV(int x, int y, int height, SkAlpha alpha) override {
@@ -163,7 +164,7 @@
             if (!fConstInY) {
                 fShaderContext->shadeSpan4f(x, y, fState.fBuffer, 1);
             }
-            fState.fProcN(fState, device, fState.fBuffer, 1, &alpha);
+            fState.fProcN(fState.fXfer, device, fState.fBuffer, 1, &alpha);
             device = (typename State::DstType*)((char*)device + deviceRB);
         }
     }
@@ -183,7 +184,7 @@
             if (!fConstInY) {
                 fShaderContext->shadeSpan4f(x, y, fState.fBuffer, width);
             }
-            fState.fProcN(fState, device, fState.fBuffer, width, nullptr);
+            fState.fProcN(fState.fXfer, device, fState.fBuffer, width, nullptr);
             device = (typename State::DstType*)((char*)device + deviceRB);
         }
     }
@@ -200,10 +201,10 @@
             if (aa) {
                 fShaderContext->shadeSpan4f(x, y, fState.fBuffer, count);
                 if (aa == 255) {
-                    fState.fProcN(fState, device, fState.fBuffer, count, nullptr);
+                    fState.fProcN(fState.fXfer, device, fState.fBuffer, count, nullptr);
                 } else {
                     for (int i = 0; i < count; ++i) {
-                        fState.fProcN(fState, &device[i], &fState.fBuffer[i], 1, antialias);
+                        fState.fProcN(fState.fXfer, &device[i], &fState.fBuffer[i], 1, antialias);
                     }
                 }
             }
@@ -267,7 +268,7 @@
             if (!fConstInY) {
                 fShaderContext->shadeSpan4f(x, y, fState.fBuffer, width);
             }
-            fState.fProcN(fState, device, fState.fBuffer, width, maskRow);
+            fState.fProcN(fState.fXfer, device, fState.fBuffer, width, maskRow);
             device = (typename State::DstType*)((char*)device + deviceRB);
             maskRow += maskRB;
         }
@@ -286,47 +287,45 @@
     : 0xFF == paint.getAlpha();
 }
 
-struct State32 : SkXfermode::PM4fState {
-    typedef uint32_t        DstType;
-    
-    SkXfermode::PM4fProc1   fProc1;
-    SkXfermode::PM4fProcN   fProcN;
-    SkPM4f                  fPM4f;
-    SkPM4f*                 fBuffer;
-    
-    State32(const SkImageInfo& info, const SkPaint& paint, const SkShader::Context* shaderContext) {
-        fXfer = SkSafeRef(paint.getXfermode());
-        fFlags = 0;
-        if (is_opaque(paint, shaderContext)) {
-            fFlags |= SkXfermode::kSrcIsOpaque_PM4fFlag;
-        }
-        if (info.isSRGB()) {
-            fFlags |= SkXfermode::kDstIsSRGB_PM4fFlag;
-        }
-        if (fXfer) {
-            fProc1 = fXfer->getPM4fProc1(fFlags);
-            fProcN = fXfer->getPM4fProcN(fFlags);
-        } else {
-            fProc1 = SkXfermode::GetPM4fProc1(SkXfermode::kSrcOver_Mode, fFlags);
-            fProcN = SkXfermode::GetPM4fProcN(SkXfermode::kSrcOver_Mode, fFlags);
-        }
-
-        fBuffer = nullptr;
+struct State4f {
+    State4f(const SkImageInfo& info, const SkPaint& paint, const SkShader::Context* shaderContext) {
+        fXfer = paint.getXfermode();
         if (shaderContext) {
-            fBuffer = new SkPM4f[info.width()];
+            fBuffer.reset(info.width());
         } else {
             fPM4f = SkColor4f::FromColor(paint.getColor()).premul();
         }
-    }
-    
-    ~State32() {
-        SkSafeUnref(fXfer);
-        delete[] fBuffer;
+        fFlags = 0;
     }
 
+    SkXfermode*             fXfer;
+    SkPM4f                  fPM4f;
+    SkAutoTMalloc<SkPM4f>   fBuffer;
+    uint32_t                fFlags;
+};
+
+struct State32 : State4f {
+    typedef uint32_t    DstType;
+    
+    SkXfermode::D32Proc fProc1;
+    SkXfermode::D32Proc fProcN;
+    
+    State32(const SkImageInfo& info, const SkPaint& paint, const SkShader::Context* shaderContext)
+        : State4f(info, paint, shaderContext)
+    {
+        if (is_opaque(paint, shaderContext)) {
+            fFlags |= SkXfermode::kSrcIsOpaque_D32Flag;
+        }
+        if (info.isSRGB()) {
+            fFlags |= SkXfermode::kDstIsSRGB_D32Flag;
+        }
+        fProc1 = SkXfermode::GetD32Proc(fXfer, fFlags | SkXfermode::kSrcIsSingle_D32Flag);
+        fProcN = SkXfermode::GetD32Proc(fXfer, fFlags);
+    }
+    
     SkXfermode::LCD32Proc getLCDProc(uint32_t oneOrManyFlag) const {
         uint32_t flags = fFlags & 1;
-        if (!(fFlags & SkXfermode::kDstIsSRGB_PM4fFlag)) {
+        if (!(fFlags & SkXfermode::kDstIsSRGB_D32Flag)) {
             flags |= SkXfermode::kDstIsLinearInt_LCDFlag;
         }
         return SkXfermode::GetLCD32Proc(flags | oneOrManyFlag);
@@ -337,47 +336,28 @@
     }
 };
 
-struct State64 : SkXfermode::U64State {
-    typedef uint64_t        DstType;
+struct State64 : State4f {
+    typedef uint64_t    DstType;
     
-    SkXfermode::U64Proc1    fProc1;
-    SkXfermode::U64ProcN    fProcN;
-    SkPM4f                  fPM4f;
-    SkPM4f*                 fBuffer;
+    SkXfermode::D64Proc fProc1;
+    SkXfermode::D64Proc fProcN;
     
-    State64(const SkImageInfo& info, const SkPaint& paint, const SkShader::Context* shaderContext) {
-        fXfer = SkSafeRef(paint.getXfermode());
-        fFlags = 0;
+    State64(const SkImageInfo& info, const SkPaint& paint, const SkShader::Context* shaderContext)
+        : State4f(info, paint, shaderContext)
+    {
         if (is_opaque(paint, shaderContext)) {
-            fFlags |= SkXfermode::kSrcIsOpaque_PM4fFlag;
+            fFlags |= SkXfermode::kSrcIsOpaque_D64Flag;
         }
         if (kRGBA_F16_SkColorType == info.colorType()) {
-            fFlags |= SkXfermode::kDstIsFloat16_U64Flag;
+            fFlags |= SkXfermode::kDstIsFloat16_D64Flag;
         }
-        
-        SkXfermode::Mode mode;
-        if (!SkXfermode::AsMode(fXfer, &mode)) {
-            mode = SkXfermode::kSrcOver_Mode;
-        }
-        fProc1 = SkXfermode::GetU64Proc1(mode, fFlags);
-        fProcN = SkXfermode::GetU64ProcN(mode, fFlags);
-        
-        fBuffer = nullptr;
-        if (shaderContext) {
-            fBuffer = new SkPM4f[info.width()];
-        } else {
-            fPM4f = SkColor4f::FromColor(paint.getColor()).premul();
-        }
+        fProc1 = SkXfermode::GetD64Proc(fXfer, fFlags | SkXfermode::kSrcIsSingle_D64Flag);
+        fProcN = SkXfermode::GetD64Proc(fXfer, fFlags);
     }
-    
-    ~State64() {
-        SkSafeUnref(fXfer);
-        delete[] fBuffer;
-    }
-    
+
     SkXfermode::LCD64Proc getLCDProc(uint32_t oneOrManyFlag) const {
         uint32_t flags = fFlags & 1;
-        if (!(fFlags & SkXfermode::kDstIsFloat16_U64Flag)) {
+        if (!(fFlags & SkXfermode::kDstIsFloat16_D64Flag)) {
             flags |= SkXfermode::kDstIsLinearInt_LCDFlag;
         }
         return SkXfermode::GetLCD64Proc(flags | oneOrManyFlag);
diff --git a/src/core/SkSpriteBlitter4f.cpp b/src/core/SkSpriteBlitter4f.cpp
index fc4b480..474bf4d 100644
--- a/src/core/SkSpriteBlitter4f.cpp
+++ b/src/core/SkSpriteBlitter4f.cpp
@@ -13,12 +13,14 @@
 class Sprite_4f : public SkSpriteBlitter {
 public:
     Sprite_4f(const SkPixmap& src, const SkPaint& paint) : INHERITED(src) {
+        fXfer = paint.getXfermode();
         fLoader = SkLoadSpanProc_Choose(src.info());
         fFilter = SkFilterSpanProc_Choose(paint);
         fBuffer.reset(src.width());
     }
     
 protected:
+    SkXfermode*             fXfer;
     SkLoadSpanProc          fLoader;
     SkFilterSpanProc        fFilter;
     SkAutoTMalloc<SkPM4f>   fBuffer;
@@ -29,22 +31,14 @@
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
-static SkXfermode::Mode get_mode(const SkXfermode* xfer) {
-    SkXfermode::Mode mode;
-    if (!SkXfermode::AsMode(xfer, &mode)) {
-        mode = SkXfermode::kSrcOver_Mode;
-    }
-    return mode;
-}
-
 class Sprite_F16 : public Sprite_4f {
 public:
     Sprite_F16(const SkPixmap& src, const SkPaint& paint) : INHERITED(src, paint) {
-        fState = { paint.getXfermode(), SkXfermode::kDstIsFloat16_U64Flag };
+        uint32_t flags = SkXfermode::kDstIsFloat16_D64Flag;
         if (src.isOpaque()) {
-            fState.fFlags |= SkXfermode::kSrcIsOpaque_U64Flag;
+            flags |= SkXfermode::kSrcIsOpaque_D64Flag;
         }
-        fXfer = SkXfermode::GetU64ProcN(get_mode(fState.fXfer), fState.fFlags);
+        fWriter = SkXfermode::GetD64Proc(fXfer, flags);
     }
 
     void blitRect(int x, int y, int width, int height) override {
@@ -55,14 +49,13 @@
         for (int bottom = y + height; y < bottom; ++y) {
             fLoader(fSource, x - fLeft, y - fTop, fBuffer, width);
             fFilter(*fPaint, fBuffer, width);
-            fXfer(fState, dst, fBuffer, width, nullptr);
+            fWriter(fXfer, dst, fBuffer, width, nullptr);
             dst = (uint64_t* SK_RESTRICT)((char*)dst + dstRB);
         }
     }
     
 private:
-    SkXfermode::U64State    fState;
-    SkXfermode::U64ProcN    fXfer;
+    SkXfermode::D64Proc fWriter;
 
     typedef Sprite_4f INHERITED;
 };
@@ -90,11 +83,11 @@
 class Sprite_sRGB : public Sprite_4f {
 public:
     Sprite_sRGB(const SkPixmap& src, const SkPaint& paint) : INHERITED(src, paint) {
-        fState = { paint.getXfermode(), SkXfermode::kDstIsSRGB_PM4fFlag };
+        uint32_t flags = SkXfermode::kDstIsSRGB_D32Flag;
         if (src.isOpaque()) {
-            fState.fFlags |= SkXfermode::kSrcIsOpaque_PM4fFlag;
+            flags |= SkXfermode::kSrcIsOpaque_D32Flag;
         }
-        fXfer = SkXfermode::GetPM4fProcN(get_mode(fState.fXfer), fState.fFlags);
+        fWriter = SkXfermode::GetD32Proc(fXfer, flags);
     }
     
     void blitRect(int x, int y, int width, int height) override {
@@ -105,14 +98,13 @@
         for (int bottom = y + height; y < bottom; ++y) {
             fLoader(fSource, x - fLeft, y - fTop, fBuffer, width);
             fFilter(*fPaint, fBuffer, width);
-            fXfer(fState, dst, fBuffer, width, nullptr);
+            fWriter(fXfer, dst, fBuffer, width, nullptr);
             dst = (uint32_t* SK_RESTRICT)((char*)dst + dstRB);
         }
     }
     
 protected:
-    SkXfermode::PM4fState   fState;
-    SkXfermode::PM4fProcN   fXfer;
+    SkXfermode::D32Proc fWriter;
     
 private:
     typedef Sprite_4f INHERITED;
diff --git a/src/core/SkXfermode.cpp b/src/core/SkXfermode.cpp
index 7002e9a..262e526 100644
--- a/src/core/SkXfermode.cpp
+++ b/src/core/SkXfermode.cpp
@@ -928,8 +928,9 @@
 
 static void assert_unit(const SkPM4f& r) {
 #ifdef SK_DEBUG
-    const float min = 0;
-    const float max = 1;
+    const float eps = 0.00001f;
+    const float min = 0 - eps;
+    const float max = 1 + eps;
     for (int i = 0; i < 4; ++i) {
         SkASSERT(r.fVec[i] >= min && r.fVec[i] <= max);
     }
diff --git a/src/core/SkXfermode4f.cpp b/src/core/SkXfermode4f.cpp
index 8aa2ce2..46eed05 100644
--- a/src/core/SkXfermode4f.cpp
+++ b/src/core/SkXfermode4f.cpp
@@ -9,11 +9,6 @@
 #include "SkUtils.h"
 #include "SkXfermode.h"
 
-struct XferProcPair {
-    SkXfermode::PM4fProc1 fP1;
-    SkXfermode::PM4fProcN fPN;
-};
-
 enum DstType {
     kLinear_Dst,
     kSRGB_Dst,
@@ -45,29 +40,29 @@
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
-template <DstType D> void general_1(const SkXfermode::PM4fState& state, uint32_t dst[],
-                                    const SkPM4f& src, int count, const SkAlpha aa[]) {
-    SkXfermodeProc4f proc = state.fXfer->getProc4f();
+template <DstType D> void general_1(const SkXfermode* xfer, uint32_t dst[],  // single-src blend
+                                    const SkPM4f* src, int count, const SkAlpha aa[]) {
+    SkXfermodeProc4f proc = xfer->getProc4f();  // xfer is dereferenced: must be non-null
     SkPM4f d;
     if (aa) {
         for (int i = 0; i < count; ++i) {
             Sk4f d4 = load_dst<D>(dst[i]);
             d4.store(d.fVec);
-            Sk4f r4 = Sk4f::Load(proc(src, d).fVec);
+            Sk4f r4 = Sk4f::Load(proc(*src, d).fVec);  // same *src for every pixel
             dst[i] = store_dst<D>(lerp(r4, d4, aa[i]));
         }
     } else {
         for (int i = 0; i < count; ++i) {
             load_dst<D>(dst[i]).store(d.fVec);
-            Sk4f r4 = Sk4f::Load(proc(src, d).fVec);
+            Sk4f r4 = Sk4f::Load(proc(*src, d).fVec);  // no aa: result is stored as is
             dst[i] = store_dst<D>(r4);
         }
     }
 }
 
-template <DstType D> void general_n(const SkXfermode::PM4fState& state, uint32_t dst[],
+template <DstType D> void general_n(const SkXfermode* xfer, uint32_t dst[],
                                     const SkPM4f src[], int count, const SkAlpha aa[]) {
-    SkXfermodeProc4f proc = state.fXfer->getProc4f();
+    SkXfermodeProc4f proc = xfer->getProc4f();
     SkPM4f d;
     if (aa) {
         for (int i = 0; i < count; ++i) {
@@ -85,16 +80,16 @@
     }
 }
 
-const XferProcPair gProcs_General[] = {
-    { general_1<kLinear_Dst>,   general_n<kLinear_Dst>  },   // linear   alpha
-    { general_1<kLinear_Dst>,   general_n<kLinear_Dst>  },   // linear   opaque
-    { general_1<kSRGB_Dst>,     general_n<kSRGB_Dst>    },   // srgb     alpha
-    { general_1<kSRGB_Dst>,     general_n<kSRGB_Dst>    },   // srgb     opaque
+const SkXfermode::D32Proc gProcs_General[] = {  // 8 entries, indexed by (flags & 7)
+    general_n<kLinear_Dst>, general_n<kLinear_Dst>,  // [0],[1] linear dst, src array
+    general_1<kLinear_Dst>, general_1<kLinear_Dst>,  // [2],[3] linear dst, single src
+    general_n<kSRGB_Dst>,   general_n<kSRGB_Dst>,  // [4],[5] srgb dst, src array
+    general_1<kSRGB_Dst>,   general_1<kSRGB_Dst>,  // [6],[7] srgb dst, single src
 };
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
-static void clear_linear_n(const SkXfermode::PM4fState& state, uint32_t dst[], const SkPM4f[],
+static void clear_linear(const SkXfermode*, uint32_t dst[], const SkPM4f[],
                            int count, const SkAlpha aa[]) {
     if (aa) {
         for (int i = 0; i < count; ++i) {
@@ -109,45 +104,34 @@
             }
         }
     } else {
-        sk_bzero(dst, count * sizeof(SkPMColor));
+        sk_memset32(dst, 0, count);
     }
 }
 
-static void clear_linear_1(const SkXfermode::PM4fState& state, uint32_t dst[], const SkPM4f&,
-                           int count, const SkAlpha coverage[]) {
-    clear_linear_n(state, dst, nullptr, count, coverage);
-}
-
-static void clear_srgb_n(const SkXfermode::PM4fState& state, uint32_t dst[], const SkPM4f[],
-                           int count, const SkAlpha aa[]) {
+static void clear_srgb(const SkXfermode*, uint32_t dst[], const SkPM4f[],  // xfer, src unused
+                       int count, const SkAlpha aa[]) {
     if (aa) {
         for (int i = 0; i < count; ++i) {
-            unsigned a = aa[i];
-            if (a) {
-                Sk4f d = Sk4f_fromS32(dst[i]) * Sk4f((255 - a) * (1/255.0f));
+            if (aa[i]) {  // aa[i] == 0 leaves the pixel untouched
+                Sk4f d = Sk4f_fromS32(dst[i]) * Sk4f((255 - aa[i]) * (1/255.0f));  // keep 1-aa
                 dst[i] = Sk4f_toS32(d);
             }
         }
     } else {
-        sk_bzero(dst, count * sizeof(SkPMColor));
+        sk_memset32(dst, 0, count);  // no aa: every one of the count pixels becomes 0
     }
 }
 
-static void clear_srgb_1(const SkXfermode::PM4fState& state, uint32_t dst[], const SkPM4f&,
-                           int count, const SkAlpha coverage[]) {
-    clear_srgb_n(state, dst, nullptr, count, coverage);
-}
-
-const XferProcPair gProcs_Clear[] = {
-    { clear_linear_1, clear_linear_n },       // linear   [alpha]
-    { clear_linear_1, clear_linear_n },       // linear   [opaque]
-    { clear_srgb_1,   clear_srgb_n   },       // srgb     [alpha]
-    { clear_srgb_1,   clear_srgb_n   },       // srgb     [opaque]
+const SkXfermode::D32Proc gProcs_Clear[] = {  // indexed by (flags & 7) in find_proc()
+    clear_linear,   clear_linear,  // [0],[1] linear dst
+    clear_linear,   clear_linear,  // [2],[3] linear dst (clear ignores src, so same proc)
+    clear_srgb,     clear_srgb,  // [4],[5] srgb dst
+    clear_srgb,     clear_srgb,  // [6],[7] srgb dst
 };
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
-template <DstType D> void src_n(const SkXfermode::PM4fState& state, uint32_t dst[],
+template <DstType D> void src_n(const SkXfermode*, uint32_t dst[],
                                 const SkPM4f src[], int count, const SkAlpha aa[]) {
     for (int i = 0; i < count; ++i) {
         unsigned a = 0xFF;
@@ -170,9 +154,9 @@
     return dst + (src - dst) * src_scale;
 }
 
-template <DstType D> void src_1(const SkXfermode::PM4fState& state, uint32_t dst[],
-                                const SkPM4f& src, int count, const SkAlpha aa[]) {
-    const Sk4f s4 = Sk4f::Load(src.fVec);
+template <DstType D> void src_1(const SkXfermode*, uint32_t dst[],
+                                const SkPM4f* src, int count, const SkAlpha aa[]) {
+    const Sk4f s4 = Sk4f::Load(src->fVec);
 
     if (aa) {
         if (D == kLinear_Dst) {
@@ -223,31 +207,24 @@
     }
 }
 
-const XferProcPair gProcs_Src[] = {
-    { src_1<kLinear_Dst>, src_n<kLinear_Dst> },       // linear   [alpha]
-    { src_1<kLinear_Dst>, src_n<kLinear_Dst> },       // linear   [opaque]
-    { src_1<kSRGB_Dst>,   src_n<kSRGB_Dst>   },       // srgb     [alpha]
-    { src_1<kSRGB_Dst>,   src_n<kSRGB_Dst>   },       // srgb     [opaque]
+const SkXfermode::D32Proc gProcs_Src[] = {  // 8 entries; presumably indexed by (flags & 7)
+    src_n<kLinear_Dst>, src_n<kLinear_Dst>,  // [0],[1] linear dst, src array
+    src_1<kLinear_Dst>, src_1<kLinear_Dst>,  // [2],[3] linear dst, single src
+    src_n<kSRGB_Dst>,   src_n<kSRGB_Dst>,  // [4],[5] srgb dst, src array
+    src_1<kSRGB_Dst>,   src_1<kSRGB_Dst>,  // [6],[7] srgb dst, single src
 };
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
-static void dst_n(const SkXfermode::PM4fState& state, uint32_t dst[], const SkPM4f[],
-                         int count, const SkAlpha aa[]) {}
+static void dst(const SkXfermode*, uint32_t dst[], const SkPM4f[], int count, const SkAlpha aa[]) {} // no-op: dst pixels are left as they are
 
-static void dst_1(const SkXfermode::PM4fState& state, uint32_t dst[], const SkPM4f&,
-                  int count, const SkAlpha coverage[]) {}
-
-const XferProcPair gProcs_Dst[] = {
-    { dst_1, dst_n },
-    { dst_1, dst_n },
-    { dst_1, dst_n },
-    { dst_1, dst_n },
+const SkXfermode::D32Proc gProcs_Dst[] = {  // same no-op proc in every slot
+    dst, dst, dst, dst, dst, dst, dst, dst,  // 8 entries, matching the other D32 tables
 };
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
-template <DstType D> void srcover_n(const SkXfermode::PM4fState& state, uint32_t dst[],
+template <DstType D> void srcover_n(const SkXfermode*, uint32_t dst[],
                                     const SkPM4f src[], int count, const SkAlpha aa[]) {
     if (aa) {
         for (int i = 0; i < count; ++i) {
@@ -273,9 +250,9 @@
     }
 }
 
-static void srcover_linear_dst_1(const SkXfermode::PM4fState& state, uint32_t dst[],
-                                 const SkPM4f& src, int count, const SkAlpha aa[]) {
-    const Sk4f s4 = Sk4f::Load(src.fVec);
+static void srcover_linear_dst_1(const SkXfermode*, uint32_t dst[],
+                                 const SkPM4f* src, int count, const SkAlpha aa[]) {
+    const Sk4f s4 = Sk4f::Load(src->fVec);
     const Sk4f dst_scale = Sk4f(1 - get_alpha(s4));
     
     if (aa) {
@@ -316,9 +293,9 @@
     }
 }
 
-static void srcover_srgb_dst_1(const SkXfermode::PM4fState& state, uint32_t dst[],
-                               const SkPM4f& src, int count, const SkAlpha aa[]) {
-    Sk4f s4 = Sk4f::Load(src.fVec);
+static void srcover_srgb_dst_1(const SkXfermode*, uint32_t dst[],
+                               const SkPM4f* src, int count, const SkAlpha aa[]) {
+    Sk4f s4 = Sk4f::Load(src->fVec);
     Sk4f dst_scale = Sk4f(1 - get_alpha(s4));
 
     if (aa) {
@@ -358,18 +335,19 @@
     }
 }
 
-const XferProcPair gProcs_SrcOver[] = {
-    { srcover_linear_dst_1, srcover_n<kLinear_Dst> },   // linear   alpha
-    { src_1<kLinear_Dst>,   src_n<kLinear_Dst>     },   // linear   opaque [ we are src-mode ]
-    { srcover_srgb_dst_1,   srcover_n<kSRGB_Dst>   },   // srgb     alpha
-    { src_1<kSRGB_Dst>,     src_n<kSRGB_Dst>       },   // srgb     opaque [ we are src-mode ]
+const SkXfermode::D32Proc gProcs_SrcOver[] = {  // odd slots reuse the src procs (opaque src)
+    srcover_n<kLinear_Dst>, src_n<kLinear_Dst>,  // [0],[1] linear dst, src array
+    srcover_linear_dst_1,   src_1<kLinear_Dst>,  // [2],[3] linear dst, single src
+
+    srcover_n<kSRGB_Dst>,   src_n<kSRGB_Dst>,  // [4],[5] srgb dst, src array
+    srcover_srgb_dst_1,     src_1<kSRGB_Dst>,  // [6],[7] srgb dst, single src
 };
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
-static XferProcPair find_procs(SkXfermode::Mode mode, uint32_t flags) {
-    SkASSERT(0 == (flags & ~3));
-    flags &= 3;
+static SkXfermode::D32Proc find_proc(SkXfermode::Mode mode, uint32_t flags) {
+    SkASSERT(0 == (flags & ~7));
+    flags &= 7;
 
     switch (mode) {
         case SkXfermode::kClear_Mode:   return gProcs_Clear[flags];
@@ -382,28 +360,16 @@
     return gProcs_General[flags];
 }
 
-SkXfermode::PM4fProc1 SkXfermode::GetPM4fProc1(Mode mode, uint32_t flags) {
-    return find_procs(mode, flags).fP1;
-}
-
-SkXfermode::PM4fProcN SkXfermode::GetPM4fProcN(Mode mode, uint32_t flags) {
-    return find_procs(mode, flags).fPN;
-}
-
-SkXfermode::PM4fProc1 SkXfermode::getPM4fProc1(uint32_t flags) const {
-    SkASSERT(0 == (flags & ~3));
-    flags &= 3;
+SkXfermode::D32Proc SkXfermode::onGetD32Proc(uint32_t flags) const {  // used by GetD32Proc()
+    SkASSERT(0 == (flags & ~7));  // only the three low flag bits may be set
+    flags &= 7;  // keeps the table index in 0..7 regardless of the assert
 
     Mode mode;
-    return this->asMode(&mode) ? GetPM4fProc1(mode, flags) : gProcs_General[flags].fP1;
+    return this->asMode(&mode) ? find_proc(mode, flags) : gProcs_General[flags];  // else general
 }
 
-SkXfermode::PM4fProcN SkXfermode::getPM4fProcN(uint32_t flags) const {
-    SkASSERT(0 == (flags & ~3));
-    flags &= 3;
-
-    Mode mode;
-    return this->asMode(&mode) ? GetPM4fProcN(mode, flags) : gProcs_General[flags].fPN;
+SkXfermode::D32Proc SkXfermode::GetD32Proc(SkXfermode* xfer, uint32_t flags) { // null => SrcOver
+    return xfer ? xfer->onGetD32Proc(flags) : find_proc(SkXfermode::kSrcOver_Mode, flags);
 }
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/core/SkXfermodeU64.cpp b/src/core/SkXfermodeU64.cpp
index 5d260c1..2609f89 100644
--- a/src/core/SkXfermodeU64.cpp
+++ b/src/core/SkXfermodeU64.cpp
@@ -16,11 +16,6 @@
     }
 }
 
-struct U64ProcPair {
-    SkXfermode::U64Proc1 fP1;
-    SkXfermode::U64ProcN fPN;
-};
-
 enum DstType {
     kU16_Dst,
     kF16_Dst,
@@ -72,29 +67,29 @@
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
-template <DstType D> void xfer_u64_1(const SkXfermode::U64State& state, uint64_t dst[],
-                                     const SkPM4f& src, int count, const SkAlpha aa[]) {
-    SkXfermodeProc4f proc = state.fXfer->getProc4f();
+template <DstType D> void xfer_u64_1(const SkXfermode* xfer, uint64_t dst[],  // single-src blend
+                                     const SkPM4f* src, int count, const SkAlpha aa[]) {
+    SkXfermodeProc4f proc = xfer->getProc4f();  // xfer is dereferenced: must be non-null
     SkPM4f d;
     if (aa) {
         for (int i = 0; i < count; ++i) {
             Sk4f d4 = bias_to_unit<D>(load_from_dst<D>(dst[i]));
             d4.store(d.fVec);
-            Sk4f r4 = unit_to_bias<D>(Sk4f::Load(proc(src, d).fVec));
+            Sk4f r4 = unit_to_bias<D>(Sk4f::Load(proc(*src, d).fVec));  // same *src each pixel
             dst[i] = store_to_dst<D>(lerp_by_coverage(r4, d4, aa[i]));
         }
     } else {
         for (int i = 0; i < count; ++i) {
             bias_to_unit<D>(load_from_dst<D>(dst[i])).store(d.fVec);
-            Sk4f r4 = unit_to_bias<D>(Sk4f::Load(proc(src, d).fVec));
+            Sk4f r4 = unit_to_bias<D>(Sk4f::Load(proc(*src, d).fVec));  // no aa: stored as is
             dst[i] = store_to_dst<D>(r4);
         }
     }
 }
 
-template <DstType D> void xfer_u64_n(const SkXfermode::U64State& state, uint64_t dst[],
+template <DstType D> void xfer_u64_n(const SkXfermode* xfer, uint64_t dst[],
                                      const SkPM4f src[], int count, const SkAlpha aa[]) {
-    SkXfermodeProc4f proc = state.fXfer->getProc4f();
+    SkXfermodeProc4f proc = xfer->getProc4f();
     SkPM4f d;
     if (aa) {
         for (int i = 0; i < count; ++i) {
@@ -112,18 +107,41 @@
     }
 }
 
-const U64ProcPair gU64Procs_General[] = {
-    { xfer_u64_1<kU16_Dst>, xfer_u64_n<kU16_Dst> },   // U16     alpha
-    { xfer_u64_1<kU16_Dst>, xfer_u64_n<kU16_Dst> },   // U16     opaque
-    { xfer_u64_1<kF16_Dst>, xfer_u64_n<kF16_Dst> },   // F16     alpha
-    { xfer_u64_1<kF16_Dst>, xfer_u64_n<kF16_Dst> },   // F16     opaque
+const SkXfermode::D64Proc gProcs_General[] = {  // 8 entries, indexed by (flags & 7)
+    xfer_u64_n<kU16_Dst>,   xfer_u64_n<kU16_Dst>,  // [0],[1] U16 dst, src array
+    xfer_u64_1<kU16_Dst>,   xfer_u64_1<kU16_Dst>,  // [2],[3] U16 dst, single src
+    xfer_u64_n<kF16_Dst>,   xfer_u64_n<kF16_Dst>,  // [4],[5] F16 dst, src array
+    xfer_u64_1<kF16_Dst>,   xfer_u64_1<kF16_Dst>,  // [6],[7] F16 dst, single src
 };
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
-template <DstType D> void src_1(const SkXfermode::U64State& state, uint64_t dst[],
-                                const SkPM4f& src, int count, const SkAlpha aa[]) {
-    const Sk4f s4 = pm_to_rgba_order(unit_to_bias<D>(Sk4f::Load(src.fVec)));
+template <DstType D> void clear(const SkXfermode*, uint64_t dst[],  // Clear mode, 64-bit dst
+                                const SkPM4f*, int count, const SkAlpha aa[]) {  // src unused
+    if (aa) {
+        for (int i = 0; i < count; ++i) {
+            if (aa[i]) {  // aa[i] == 0 leaves the pixel untouched
+                const Sk4f d4 = load_from_dst<D>(dst[i]);
+                dst[i] = store_to_dst<D>(d4 * Sk4f((255 - aa[i]) * 1.0f/255));  // keep 1-aa
+            }
+        }
+    } else {
+        sk_memset64(dst, 0, count);  // no aa: every one of the count pixels becomes 0
+    }
+}
+
+const SkXfermode::D64Proc gProcs_Clear[] = {  // indexed by (flags & 7) in find_proc()
+    clear<kU16_Dst>,    clear<kU16_Dst>,  // [0],[1] U16 dst
+    clear<kU16_Dst>,    clear<kU16_Dst>,  // [2],[3] U16 dst (clear ignores src, so same proc)
+    clear<kF16_Dst>,    clear<kF16_Dst>,  // [4],[5] F16 dst
+    clear<kF16_Dst>,    clear<kF16_Dst>,  // [6],[7] F16 dst
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <DstType D> void src_1(const SkXfermode*, uint64_t dst[],
+                                const SkPM4f* src, int count, const SkAlpha aa[]) {
+    const Sk4f s4 = pm_to_rgba_order(unit_to_bias<D>(Sk4f::Load(src->fVec)));
     if (aa) {
         for (int i = 0; i < count; ++i) {
             const Sk4f d4 = load_from_dst<D>(dst[i]);
@@ -134,7 +152,7 @@
     }
 }
 
-template <DstType D> void src_n(const SkXfermode::U64State& state, uint64_t dst[],
+template <DstType D> void src_n(const SkXfermode*, uint64_t dst[],
                                 const SkPM4f src[], int count, const SkAlpha aa[]) {
     if (aa) {
         for (int i = 0; i < count; ++i) {
@@ -150,18 +168,26 @@
     }
 }
 
-const U64ProcPair gU64Procs_Src[] = {
-    { src_1<kU16_Dst>, src_n<kU16_Dst>  },   // U16     alpha
-    { src_1<kU16_Dst>, src_n<kU16_Dst>  },   // U16     opaque
-    { src_1<kF16_Dst>, src_n<kF16_Dst>  },   // F16     alpha
-    { src_1<kF16_Dst>, src_n<kF16_Dst>  },   // F16     opaque
+const SkXfermode::D64Proc gProcs_Src[] = {  // indexed by (flags & 7) in find_proc()
+    src_n<kU16_Dst>,    src_n<kU16_Dst>,  // [0],[1] U16 dst, src array
+    src_1<kU16_Dst>,    src_1<kU16_Dst>,  // [2],[3] U16 dst, single src
+    src_n<kF16_Dst>,    src_n<kF16_Dst>,  // [4],[5] F16 dst, src array
+    src_1<kF16_Dst>,    src_1<kF16_Dst>,  // [6],[7] F16 dst, single src
 };
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
-template <DstType D> void srcover_1(const SkXfermode::U64State& state, uint64_t dst[],
-                                    const SkPM4f& src, int count, const SkAlpha aa[]) {
-    const Sk4f s4 = pm_to_rgba_order(Sk4f::Load(src.fVec));
+static void dst(const SkXfermode*, uint64_t*, const SkPM4f*, int count, const SkAlpha[]) {} // no-op
+
+const SkXfermode::D64Proc gProcs_Dst[] = {  // same no-op proc in every slot
+    dst, dst, dst, dst, dst, dst, dst, dst,  // 8 entries so find_proc() can index by (flags & 7)
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <DstType D> void srcover_1(const SkXfermode*, uint64_t dst[],
+                                    const SkPM4f* src, int count, const SkAlpha aa[]) {
+    const Sk4f s4 = pm_to_rgba_order(Sk4f::Load(src->fVec));
     const Sk4f dst_scale = Sk4f(1 - get_alpha(s4));
     const Sk4f s4bias = unit_to_bias<D>(s4);
     for (int i = 0; i < count; ++i) {
@@ -175,7 +201,7 @@
     }
 }
 
-template <DstType D> void srcover_n(const SkXfermode::U64State& state, uint64_t dst[],
+template <DstType D> void srcover_n(const SkXfermode*, uint64_t dst[],
                                     const SkPM4f src[], int count, const SkAlpha aa[]) {
     for (int i = 0; i < count; ++i) {
         const Sk4f s4 = pm_to_rgba_order(Sk4f::Load(src[i].fVec));
@@ -191,32 +217,39 @@
     }
 }
 
-const U64ProcPair gU64Procs_SrcOver[] = {
-    { srcover_1<kU16_Dst>,  srcover_n<kU16_Dst> },   // U16     alpha
-    { src_1<kU16_Dst>,      src_n<kU16_Dst>     },   // U16     opaque
-    { srcover_1<kF16_Dst>,  srcover_n<kF16_Dst> },   // F16     alpha
-    { src_1<kF16_Dst>,      src_n<kF16_Dst>     },   // F16     opaque
+const SkXfermode::D64Proc gProcs_SrcOver[] = {  // odd slots reuse the src procs (opaque src)
+    srcover_n<kU16_Dst>,    src_n<kU16_Dst>,  // [0],[1] U16 dst, src array
+    srcover_1<kU16_Dst>,    src_1<kU16_Dst>,  // [2],[3] U16 dst, single src
+    srcover_n<kF16_Dst>,    src_n<kF16_Dst>,  // [4],[5] F16 dst, src array
+    srcover_1<kF16_Dst>,    src_1<kF16_Dst>,  // [6],[7] F16 dst, single src
 };
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
-static U64ProcPair find_procs(SkXfermode::Mode mode, uint32_t flags) {
-    SkASSERT(0 == (flags & ~3));
-    flags &= 3;
+static SkXfermode::D64Proc find_proc(SkXfermode::Mode mode, uint32_t flags) {  // table lookup
+    SkASSERT(0 == (flags & ~7));  // only the three low flag bits may be set
+    flags &= 7;  // keeps the table index in 0..7 regardless of the assert
 
     switch (mode) {
-        case SkXfermode::kSrc_Mode:     return gU64Procs_Src[flags];
-        case SkXfermode::kSrcOver_Mode: return gU64Procs_SrcOver[flags];
+        case SkXfermode::kClear_Mode:   return gProcs_Clear[flags];  // specialized modes
+        case SkXfermode::kSrc_Mode:     return gProcs_Src[flags];
+        case SkXfermode::kDst_Mode:     return gProcs_Dst[flags];
+        case SkXfermode::kSrcOver_Mode: return gProcs_SrcOver[flags];
         default:
             break;
     }
-    return gU64Procs_General[flags];
+    return gProcs_General[flags];  // every other mode goes through the xfermode's 4f proc
 }
 
-SkXfermode::U64Proc1 SkXfermode::GetU64Proc1(Mode mode, uint32_t flags) {
-    return find_procs(mode, flags).fP1;
+SkXfermode::D64Proc SkXfermode::onGetD64Proc(uint32_t flags) const {  // used by GetD64Proc()
+    SkASSERT(0 == (flags & ~7));  // only the three low flag bits may be set
+    flags &= 7;  // keeps the table index in 0..7 regardless of the assert
+    
+    Mode mode;  // filled in by asMode() when it returns true
+    return this->asMode(&mode) ? find_proc(mode, flags) : gProcs_General[flags];  // else general
 }
 
-SkXfermode::U64ProcN SkXfermode::GetU64ProcN(Mode mode, uint32_t flags) {
-    return find_procs(mode, flags).fPN;
+SkXfermode::D64Proc SkXfermode::GetD64Proc(SkXfermode* xfer, uint32_t flags) { // null => SrcOver
+    return xfer ? xfer->onGetD64Proc(flags) : find_proc(SkXfermode::kSrcOver_Mode, flags);
 }
+