Have SkWuffsCodec use a SkSwizzler

To measure the performance impact, I patched in
https://skia-review.googlesource.com/c/skia/+/162980 "Add
bench/DecodeBench.cpp" and took the best time (the "min" column):

gcc 7.3, x86_64 Broadwell

 Other   Before    After
11.6ms   7.58ms   5.01ms  Decode_GIF_test640x479.gif
21.0ms   15.8ms   11.9ms  Decode_GIF_flightAnim.gif
79.8µs   60.1µs   46.6µs  Decode_GIF_color_wheel.gif

As ratios:

 Other   Before    After
 2.31x    1.51x    1.00x  Decode_GIF_test640x479.gif
 1.76x    1.33x    1.00x  Decode_GIF_flightAnim.gif
 1.71x    1.29x    1.00x  Decode_GIF_color_wheel.gif

Other  is with skia_use_wuffs=false
Before is with skia_use_wuffs=true, before this CL
After  is with skia_use_wuffs=true, after  this CL

Bug: skia:8235
Change-Id: I7a1df172d172ccd668dfcbeb9728ee053fe831b1
Reviewed-on: https://skia-review.googlesource.com/c/165640
Reviewed-by: Leon Scroggins <scroggo@google.com>
Commit-Queue: Leon Scroggins <scroggo@google.com>
diff --git a/src/codec/SkWuffsCodec.cpp b/src/codec/SkWuffsCodec.cpp
index cb64748..4d2bbe7 100644
--- a/src/codec/SkWuffsCodec.cpp
+++ b/src/codec/SkWuffsCodec.cpp
@@ -10,41 +10,12 @@
 #include "../private/SkMalloc.h"
 #include "SkFrameHolder.h"
 #include "SkSampler.h"
+#include "SkSwizzler.h"
+#include "SkUtils.h"
 #include "wuffs-v0.2.h"
 
 #define SK_WUFFS_CODEC_BUFFER_SIZE 4096
 
-// TODO(nigeltao): use a swizzler instead of load_u32le and store_etc.
-
-static inline uint32_t load_u32le(uint8_t* p) {
-    return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | ((uint32_t)(p[2]) << 16) |
-           ((uint32_t)(p[3]) << 24);
-}
-
-static inline void store_u32le(uint8_t* p, uint32_t x) {
-    p[0] = x >> 0;
-    p[1] = x >> 8;
-    p[2] = x >> 16;
-    p[3] = x >> 24;
-}
-
-static inline void store_u32le_switched(uint8_t* p, uint32_t x) {
-    // This could probably be optimized, but in any case, we should use a
-    // swizzler.
-    p[0] = x >> 16;
-    p[1] = x >> 8;
-    p[2] = x >> 0;
-    p[3] = x >> 24;
-}
-
-static inline void store_565(uint8_t* p, uint32_t argb) {
-    uint32_t r5 = 0x1F & (argb >> ((8 - 5) + 16));
-    uint32_t g6 = 0x3F & (argb >> ((8 - 6) + 8));
-    uint32_t b5 = 0x1F & (argb >> ((8 - 5) + 0));
-    p[0] = (b5 << 0) | (g6 << 5);
-    p[1] = (g6 >> 3) | (r5 << 3);
-}
-
 static bool fill_buffer(wuffs_base__io_buffer* b, SkStream* s) {
     b->compact();
     size_t num_read = s->read(b->data.ptr + b->meta.wi, b->data.len - b->meta.wi);
@@ -133,6 +104,47 @@
     typedef SkFrameHolder INHERITED;
 };
 
+// SkWuffsSpySampler is a placeholder SkSampler implementation. The Skia API
+// expects to manipulate the codec's sampler (i.e. call setSampleX and
+// setSampleY) in between the startIncrementalDecode (SID) and
+// incrementalDecode (ID) calls. But creating the SkSwizzler (the real sampler)
+// requires knowing the destination buffer's dimensions, i.e. the animation
+// frame's width and height. That width and height are decoded in ID, not SID.
+//
+// To break that circle, the SkWuffsSpySampler always exists, so its methods
+// can be called between SID and ID. It doesn't actually do any sampling, it
+// merely records the arguments given to setSampleX (explicitly) and setSampleY
+// (implicitly, via the superclass' implementation). Inside ID, those recorded
+// arguments are forwarded on to the SkSwizzler (the real sampler) when that
+// SkSwizzler is created, after the frame width and height are known.
+//
+// Roughly speaking, the SkWuffsSpySampler is an eager proxy for the lazily
+// constructed real sampler. But that laziness is out of necessity.
+//
+// The "Spy" name is because it records its arguments. See
+// https://martinfowler.com/articles/mocksArentStubs.html#TheDifferenceBetweenMocksAndStubs
+class SkWuffsSpySampler final : public SkSampler {
+public:
+    SkWuffsSpySampler(int imageWidth)
+        : INHERITED(), fFillWidth(0), fImageWidth(imageWidth), fSampleX(1) {}
+
+    void reset();
+    int  sampleX() const;
+
+    int fFillWidth;
+
+private:
+    // SkSampler overrides.
+    int fillWidth() const override;
+    int onSetSampleX(int sampleX) override;
+
+    const int fImageWidth;
+
+    int fSampleX;
+
+    typedef SkSampler INHERITED;
+};
+
 class SkWuffsCodec final : public SkCodec {
 public:
     SkWuffsCodec(SkEncodedInfo&&                                         encodedInfo,
@@ -160,6 +172,7 @@
     int                  onGetFrameCount() override;
     bool                 onGetFrameInfo(int, FrameInfo*) const override;
     int                  onGetRepetitionCount() override;
+    SkSampler*           getSampler(bool createIfNecessary) override;
 
     void   readFrames();
     Result seekFrame(int frameIndex);
@@ -169,6 +182,7 @@
     const char* decodeFrame();
     void        updateNumFullyReceivedFrames();
 
+    SkWuffsSpySampler                                       fSpySampler;
     SkWuffsFrameHolder                                      fFrameHolder;
     std::unique_ptr<SkStream>                               fStream;
     std::unique_ptr<wuffs_gif__decoder, decltype(&sk_free)> fDecoder;
@@ -182,10 +196,12 @@
     wuffs_base__io_buffer    fIOBuffer;
 
     // Incremental decoding state.
-    SkColorType fIncrDecColorType;
-    uint8_t*    fIncrDecDst;
-    bool        fIncrDecHaveFrameConfig;
-    size_t      fIncrDecRowBytes;
+    uint8_t* fIncrDecDst;
+    bool     fIncrDecHaveFrameConfig;
+    size_t   fIncrDecRowBytes;
+
+    std::unique_ptr<SkSwizzler> fSwizzler;
+    SkPMColor                   fColorTable[256];
 
     uint64_t                  fNumFullyReceivedFrames;
     std::vector<SkWuffsFrame> fFrames;
@@ -252,6 +268,27 @@
     return fCodec->frame(i);
 };
 
+// -------------------------------- SkWuffsSpySampler implementation
+
+void SkWuffsSpySampler::reset() {
+    fFillWidth = 0;
+    fSampleX = 1;
+    this->setSampleY(1);
+}
+
+int SkWuffsSpySampler::sampleX() const {
+    return fSampleX;
+}
+
+int SkWuffsSpySampler::fillWidth() const {
+    return fFillWidth;
+}
+
+int SkWuffsSpySampler::onSetSampleX(int sampleX) {
+    fSampleX = sampleX;
+    return get_scaled_dimension(fImageWidth, sampleX);
+}
+
 // -------------------------------- SkWuffsCodec implementation
 
 SkWuffsCodec::SkWuffsCodec(SkEncodedInfo&&                                         encodedInfo,
@@ -269,6 +306,8 @@
                 // manage the stream ourselves, as the default SkCodec behavior
                 // is too trigger-happy on rewinding the stream.
                 nullptr),
+      fSpySampler(imgcfg.pixcfg.width()),
+      fFrameHolder(),
       fStream(std::move(stream)),
       fDecoder(std::move(dec)),
       fPixbufPtr(std::move(pixbuf_ptr)),
@@ -278,14 +317,15 @@
       fFrameConfig((wuffs_base__frame_config){}),
       fPixelBuffer(pixbuf),
       fIOBuffer((wuffs_base__io_buffer){}),
-      fIncrDecColorType(kUnknown_SkColorType),
       fIncrDecDst(nullptr),
       fIncrDecHaveFrameConfig(false),
       fIncrDecRowBytes(0),
+      fSwizzler(nullptr),
       fNumFullyReceivedFrames(0),
       fFramesComplete(false),
       fDecoderIsSuspended(false) {
     fFrameHolder.init(this, imgcfg.pixcfg.width(), imgcfg.pixcfg.height());
+    sk_memset32(fColorTable, 0, SK_ARRAY_COUNT(fColorTable));
 
     // Initialize fIOBuffer's fields, copying any outstanding data from iobuf to
     // fIOBuffer, as iobuf's backing array may not be valid for the lifetime of
@@ -340,12 +380,11 @@
         return result;
     }
 
-    SkSampler::Fill(dstInfo, dst, rowBytes, options.fZeroInitialized);
-
-    fIncrDecColorType = dstInfo.colorType();
+    fSpySampler.reset();
     fIncrDecDst = static_cast<uint8_t*>(dst);
     fIncrDecHaveFrameConfig = false;
     fIncrDecRowBytes = rowBytes;
+    fSwizzler = nullptr;
 
     return SkCodec::kSuccess;
 }
@@ -379,51 +418,57 @@
         return SkCodec::kErrorInInput;
     }
 
-    // TODO(nigeltao): use a swizzler, once I figure out how it works. For
-    // now, a C style load/store loop gets the job done.
     wuffs_base__rect_ie_u32 r = fFrameConfig.bounds();
-    wuffs_base__table_u8    pixels = fPixelBuffer.plane(0);
-    wuffs_base__slice_u8    palette = fPixelBuffer.palette();
+    if (!fSwizzler) {
+        SkIRect swizzleRect = SkIRect::MakeLTRB(r.min_incl_x, 0, r.max_excl_x, 1);
+        fSwizzler = SkSwizzler::Make(this->getEncodedInfo(), fColorTable, dstInfo(), Options(),
+                                     &swizzleRect);
+        fSwizzler->setSampleX(fSpySampler.sampleX());
+        fSwizzler->setSampleY(fSpySampler.sampleY());
+
+        auto fillInfo = dstInfo().makeWH(
+            fSwizzler->fillWidth(), get_scaled_dimension(dstInfo().height(), fSwizzler->sampleY()));
+        SkSampler::Fill(fillInfo, fIncrDecDst, fIncrDecRowBytes, options().fZeroInitialized);
+    }
+
+    wuffs_base__slice_u8 palette = fPixelBuffer.palette();
     SkASSERT(palette.len == 4 * 256);
-    switch (fIncrDecColorType) {
-        case kRGB_565_SkColorType:
-            for (uint32_t y = r.min_incl_y; y < r.max_excl_y; y++) {
-                uint8_t* d = fIncrDecDst + (y * fIncrDecRowBytes) + (r.min_incl_x * 2);
-                uint8_t* s = pixels.ptr + (y * pixels.stride) + (r.min_incl_x * 1);
-                for (uint32_t x = r.min_incl_x; x < r.max_excl_x; x++) {
-                    uint8_t  index = *s++;
-                    uint32_t argb = load_u32le(palette.ptr + 4 * static_cast<size_t>(index));
-                    store_565(d, argb);
-                    d += 2;
-                }
+    auto proc = choose_pack_color_proc(false, dstInfo().colorType());
+    for (int i = 0; i < 256; i++) {
+        uint8_t* p = palette.ptr + 4 * i;
+        fColorTable[i] = proc(p[3], p[2], p[1], p[0]);
+    }
+
+    wuffs_base__table_u8 pixels = fPixelBuffer.plane(0);
+    const int            sampleY = fSwizzler->sampleY();
+    const int            scaledHeight = get_scaled_dimension(dstInfo().height(), sampleY);
+    for (uint32_t y = r.min_incl_y; y < r.max_excl_y; y++) {
+        // In Wuffs, a paletted image is always 1 byte per pixel.
+        static constexpr size_t src_bpp = 1;
+
+        int dstY = y;
+        if (sampleY != 1) {
+            if (!fSwizzler->rowNeeded(y)) {
+                continue;
             }
-            break;
-        case kBGRA_8888_SkColorType:
-            for (uint32_t y = r.min_incl_y; y < r.max_excl_y; y++) {
-                uint8_t* d = fIncrDecDst + (y * fIncrDecRowBytes) + (r.min_incl_x * 4);
-                uint8_t* s = pixels.ptr + (y * pixels.stride) + (r.min_incl_x * 1);
-                for (uint32_t x = r.min_incl_x; x < r.max_excl_x; x++) {
-                    uint8_t  index = *s++;
-                    uint32_t argb = load_u32le(palette.ptr + 4 * static_cast<size_t>(index));
-                    store_u32le(d, argb);
-                    d += 4;
-                }
+            dstY /= sampleY;
+            if (dstY >= scaledHeight) {
+                break;
             }
-            break;
-        case kRGBA_8888_SkColorType:
-            for (uint32_t y = r.min_incl_y; y < r.max_excl_y; y++) {
-                uint8_t* d = fIncrDecDst + (y * fIncrDecRowBytes) + (r.min_incl_x * 4);
-                uint8_t* s = pixels.ptr + (y * pixels.stride) + (r.min_incl_x * 1);
-                for (uint32_t x = r.min_incl_x; x < r.max_excl_x; x++) {
-                    uint8_t  index = *s++;
-                    uint32_t argb = load_u32le(palette.ptr + 4 * static_cast<size_t>(index));
-                    store_u32le_switched(d, argb);
-                    d += 4;
-                }
-            }
-            break;
-        default:
-            return SkCodec::kUnimplemented;
+        }
+
+        // We don't adjust d by (r.min_incl_x * dst_bpp) as we have already
+        // accounted for that in swizzleRect, above.
+        uint8_t* d = fIncrDecDst + (dstY * fIncrDecRowBytes);
+
+        // The Wuffs model is that the dst buffer is the image, not the frame.
+        // The expectation is that you allocate the buffer once, but re-use it
+        // for the N frames, regardless of each frame's top-left co-ordinate.
+        //
+        // To get from the start (in the X-direction) of the image to the start
+        // of the frame, we adjust s by (r.min_incl_x * src_bpp).
+        uint8_t* s = pixels.ptr + (y * pixels.stride) + (r.min_incl_x * src_bpp);
+        fSwizzler->swizzle(d, s);
     }
 
     // The semantics of *rowsDecoded is: say you have a 10 pixel high image
@@ -444,10 +489,14 @@
     }
 
     if (result == SkCodec::kSuccess) {
-        fIncrDecColorType = kUnknown_SkColorType;
+        fSpySampler.reset();
         fIncrDecDst = nullptr;
         fIncrDecHaveFrameConfig = false;
         fIncrDecRowBytes = 0;
+        fSwizzler = nullptr;
+    } else {
+        // Make fSpySampler return whatever fSwizzler would have for fillWidth.
+        fSpySampler.fFillWidth = fSwizzler->fillWidth();
     }
     return result;
 }
@@ -484,6 +533,16 @@
     return n < INT_MAX ? n : INT_MAX;
 }
 
+SkSampler* SkWuffsCodec::getSampler(bool createIfNecessary) {
+    // fIncrDst being non-nullptr means that we are between an
+    // onStartIncrementalDecode call and the matching final (successful)
+    // onIncrementalDecode call.
+    if (createIfNecessary || fIncrDecDst) {
+        return &fSpySampler;
+    }
+    return nullptr;
+}
+
 void SkWuffsCodec::readFrames() {
     size_t n = fFrames.size();
     int    i = n ? n - 1 : 0;