Reland "have SkConvertPixels use SkColorSpaceXformSteps"

This is a reland of 6d0e566e941b09f5f9fb9f5a95c123459409c4ad

On second thought, it's probably better to correct the
types of the swizzle functions to express their required alignment.
This is a more involved CL, but I think it leaves things better off.

Original change's description:
> have SkConvertPixels use SkColorSpaceXformSteps
>
> This ought to allow the fast paths in more cases, e.g. memcpy() when
> both src and dst are the same format.  Today if we tag a dst color space
> at all, we'll think we need to fall back to the general case pipeline.
>
> Some refactoring too, but no big functional change beyond using steps.
>
> Change-Id: I8fa01025229e3b9418e7f43241a2f03628a97288
> Reviewed-on: https://skia-review.googlesource.com/155640
> Reviewed-by: Brian Osman <brianosman@google.com>
> Commit-Queue: Mike Klein <mtklein@google.com>

Change-Id: Ia17d93acfe88a36c4c36d29e3a0b243f91178b61
Reviewed-on: https://skia-review.googlesource.com/156241
Commit-Queue: Mike Klein <mtklein@google.com>
Reviewed-by: Brian Osman <brianosman@google.com>
diff --git a/src/codec/SkPngCodec.cpp b/src/codec/SkPngCodec.cpp
index 0e7cd2b..b986121 100644
--- a/src/codec/SkPngCodec.cpp
+++ b/src/codec/SkPngCodec.cpp
@@ -293,10 +293,10 @@
 #endif
 
         if (is_rgba(tableColorType)) {
-            SkOpts::RGB_to_RGB1(colorTable + numColorsWithAlpha, palette,
+            SkOpts::RGB_to_RGB1(colorTable + numColorsWithAlpha, (const uint8_t*)palette,
                     numColors - numColorsWithAlpha);
         } else {
-            SkOpts::RGB_to_BGR1(colorTable + numColorsWithAlpha, palette,
+            SkOpts::RGB_to_BGR1(colorTable + numColorsWithAlpha, (const uint8_t*)palette,
                     numColors - numColorsWithAlpha);
         }
     }
diff --git a/src/codec/SkSwizzler.cpp b/src/codec/SkSwizzler.cpp
index cca8c41..e1e2ecb 100644
--- a/src/codec/SkSwizzler.cpp
+++ b/src/codec/SkSwizzler.cpp
@@ -475,7 +475,7 @@
     // sampling, deltaSrc should equal bpp.
     SkASSERT(deltaSrc == bpp);
 
-    SkOpts::RGBA_to_rgbA((uint32_t*) dst, src + offset, width);
+    SkOpts::RGBA_to_rgbA((uint32_t*) dst, (const uint32_t*)(src + offset), width);
 }
 
 static void fast_swizzle_rgba_to_bgra_premul(
@@ -486,7 +486,7 @@
     // sampling, deltaSrc should equal bpp.
     SkASSERT(deltaSrc == bpp);
 
-    SkOpts::RGBA_to_bgrA((uint32_t*) dst, src + offset, width);
+    SkOpts::RGBA_to_bgrA((uint32_t*) dst, (const uint32_t*)(src + offset), width);
 }
 
 static void swizzle_rgba_to_bgra_unpremul(
@@ -510,7 +510,7 @@
     // sampling, deltaSrc should equal bpp.
     SkASSERT(deltaSrc == bpp);
 
-    SkOpts::RGBA_to_BGRA((uint32_t*) dst, src + offset, width);
+    SkOpts::RGBA_to_BGRA((uint32_t*) dst, (const uint32_t*)(src + offset), width);
 }
 
 // 16-bits per component kRGB and kRGBA
@@ -705,7 +705,7 @@
     // sampling, deltaSrc should equal bpp.
     SkASSERT(deltaSrc == bpp);
 
-    SkOpts::inverted_CMYK_to_RGB1((uint32_t*) dst, src + offset, width);
+    SkOpts::inverted_CMYK_to_RGB1((uint32_t*) dst, (const uint32_t*)(src + offset), width);
 }
 
 static void fast_swizzle_cmyk_to_bgra(
@@ -716,7 +716,7 @@
     // sampling, deltaSrc should equal bpp.
     SkASSERT(deltaSrc == bpp);
 
-    SkOpts::inverted_CMYK_to_BGR1((uint32_t*) dst, src + offset, width);
+    SkOpts::inverted_CMYK_to_BGR1((uint32_t*) dst, (const uint32_t*)(src + offset), width);
 }
 
 static void swizzle_cmyk_to_565(
diff --git a/src/core/SkConvertPixels.cpp b/src/core/SkConvertPixels.cpp
index 373bc00..b613a8b 100644
--- a/src/core/SkConvertPixels.cpp
+++ b/src/core/SkConvertPixels.cpp
@@ -16,87 +16,102 @@
 #include "SkUnPreMultiplyPriv.h"
 #include "../jumper/SkJumper.h"
 
-// Fast Path 1: The memcpy() case.
-static inline bool can_memcpy(const SkImageInfo& dstInfo, const SkImageInfo& srcInfo) {
+static bool rect_memcpy(const SkImageInfo& dstInfo,       void* dstPixels, size_t dstRB,
+                        const SkImageInfo& srcInfo, const void* srcPixels, size_t srcRB,
+                        const SkColorSpaceXformSteps& steps) {
+    // We can copy the pixels when no color type, alpha type, or color space changes.
     if (dstInfo.colorType() != srcInfo.colorType()) {
         return false;
     }
-
-    if (kAlpha_8_SkColorType == dstInfo.colorType()) {
-        return true;
-    }
-
-    if (dstInfo.alphaType() != srcInfo.alphaType() &&
-        kOpaque_SkAlphaType != dstInfo.alphaType() &&
-        kOpaque_SkAlphaType != srcInfo.alphaType())
-    {
-        // We need to premultiply or unpremultiply.
+    if (dstInfo.colorType() != kAlpha_8_SkColorType
+            && steps.flags.mask() != 0b00000) {
         return false;
     }
 
-    return !dstInfo.colorSpace() ||
-           SkColorSpace::Equals(dstInfo.colorSpace(), srcInfo.colorSpace());
+    SkRectMemcpy(dstPixels, dstRB,
+                 srcPixels, srcRB, dstInfo.minRowBytes(), dstInfo.height());
+    return true;
 }
 
-// Fast Path 2: Simple swizzles and premuls.
-enum AlphaVerb {
-    kNothing_AlphaVerb,
-    kPremul_AlphaVerb,
-    kUnpremul_AlphaVerb,
-};
-
-template <bool kSwapRB>
-static void wrap_unpremultiply(uint32_t* dst, const void* src, int count) {
-    SkUnpremultiplyRow<kSwapRB>(dst, (const uint32_t*) src, count);
-}
-
-void swizzle_and_multiply(const SkImageInfo& dstInfo, void* dstPixels, size_t dstRB,
-                          const SkImageInfo& srcInfo, const void* srcPixels, size_t srcRB) {
-    void (*proc)(uint32_t* dst, const void* src, int count);
-    const bool swapRB = dstInfo.colorType() != srcInfo.colorType();
-    AlphaVerb alphaVerb = kNothing_AlphaVerb;
-    if (kPremul_SkAlphaType == dstInfo.alphaType() &&
-        kUnpremul_SkAlphaType == srcInfo.alphaType())
-    {
-        alphaVerb = kPremul_AlphaVerb;
-    } else if (kUnpremul_SkAlphaType == dstInfo.alphaType() &&
-               kPremul_SkAlphaType == srcInfo.alphaType()) {
-        alphaVerb = kUnpremul_AlphaVerb;
+static bool swizzle_and_multiply(const SkImageInfo& dstInfo,       void* dstPixels, size_t dstRB,
+                                 const SkImageInfo& srcInfo, const void* srcPixels, size_t srcRB,
+                                 const SkColorSpaceXformSteps& steps) {
+    auto is_8888 = [](SkColorType ct) {
+        return ct == kRGBA_8888_SkColorType || ct == kBGRA_8888_SkColorType;
+    };
+    if (!is_8888(dstInfo.colorType()) ||
+        !is_8888(srcInfo.colorType()) ||
+        steps.flags.linearize || steps.flags.gamut_transform || steps.flags.encode) {
+        return false;
     }
 
-    switch (alphaVerb) {
-        case kNothing_AlphaVerb:
-            // If we do not need to swap or multiply, we should hit the memcpy case.
-            SkASSERT(swapRB);
-            proc = SkOpts::RGBA_to_BGRA;
-            break;
-        case kPremul_AlphaVerb:
-            proc = swapRB ? SkOpts::RGBA_to_bgrA : SkOpts::RGBA_to_rgbA;
-            break;
-        case kUnpremul_AlphaVerb:
-            proc = swapRB ? wrap_unpremultiply<true> : wrap_unpremultiply<false>;
-            break;
+    // It'd be kind of silly for us to both premul and unpremul.
+    SkASSERT(!(steps.flags.premul && steps.flags.unpremul));
+
+    const bool swapRB = dstInfo.colorType() != srcInfo.colorType();
+
+    void (*fn)(uint32_t*, const uint32_t*, int) = nullptr;
+
+    if (steps.flags.premul) {
+        fn = swapRB ? SkOpts::RGBA_to_bgrA
+                    : SkOpts::RGBA_to_rgbA;
+    } else if (steps.flags.unpremul) {
+        fn = swapRB ? SkUnpremultiplyRow<true>
+                    : SkUnpremultiplyRow<false>;
+    } else {
+        // If we're not swizzling, we ought to have used rect_memcpy().
+        SkASSERT(swapRB);
+        fn = SkOpts::RGBA_to_BGRA;
     }
 
     for (int y = 0; y < dstInfo.height(); y++) {
-        proc((uint32_t*) dstPixels, srcPixels, dstInfo.width());
+        fn((uint32_t*)dstPixels, (const uint32_t*)srcPixels, dstInfo.width());
         dstPixels = SkTAddOffset<void>(dstPixels, dstRB);
         srcPixels = SkTAddOffset<const void>(srcPixels, srcRB);
     }
+    return true;
 }
 
-// Fast Path 3: Alpha 8 dsts.
-static void convert_to_alpha8(uint8_t* dst, size_t dstRB, const SkImageInfo& srcInfo,
-                              const void* src, size_t srcRB) {
-    if (srcInfo.isOpaque()) {
-        for (int y = 0; y < srcInfo.height(); ++y) {
-           memset(dst, 0xFF, srcInfo.width());
-           dst = SkTAddOffset<uint8_t>(dst, dstRB);
-        }
-        return;
+static bool convert_to_alpha8(const SkImageInfo& dstInfo,       void* vdst, size_t dstRB,
+                              const SkImageInfo& srcInfo, const void*  src, size_t srcRB,
+                              const SkColorSpaceXformSteps&) {
+    if (dstInfo.colorType() != kAlpha_8_SkColorType) {
+        return false;
     }
+    auto dst = (uint8_t*)vdst;
 
     switch (srcInfo.colorType()) {
+        case kUnknown_SkColorType:
+        case kAlpha_8_SkColorType: {
+            // Unknown should never happen.
+            // Alpha8 should have been handled by rect_memcpy().
+            SkASSERT(false);
+            return false;
+        }
+
+        case kGray_8_SkColorType:
+        case kRGB_565_SkColorType:
+        case kRGB_888x_SkColorType:
+        case kRGB_101010x_SkColorType: {
+            for (int y = 0; y < srcInfo.height(); ++y) {
+               memset(dst, 0xFF, srcInfo.width());
+               dst = SkTAddOffset<uint8_t>(dst, dstRB);
+            }
+            return true;
+        }
+
+        case kARGB_4444_SkColorType: {
+            auto src16 = (const uint16_t*) src;
+            for (int y = 0; y < srcInfo.height(); y++) {
+                for (int x = 0; x < srcInfo.width(); x++) {
+                    dst[x] = SkPacked4444ToA32(src16[x]);
+                }
+                dst = SkTAddOffset<uint8_t>(dst, dstRB);
+                src16 = SkTAddOffset<const uint16_t>(src16, srcRB);
+            }
+            return true;
+        }
+
         case kBGRA_8888_SkColorType:
         case kRGBA_8888_SkColorType: {
             auto src32 = (const uint32_t*) src;
@@ -107,43 +122,21 @@
                 dst = SkTAddOffset<uint8_t>(dst, dstRB);
                 src32 = SkTAddOffset<const uint32_t>(src32, srcRB);
             }
-            break;
+            return true;
         }
+
         case kRGBA_1010102_SkColorType: {
             auto src32 = (const uint32_t*) src;
             for (int y = 0; y < srcInfo.height(); y++) {
                 for (int x = 0; x < srcInfo.width(); x++) {
-                    switch (src32[x] >> 30) {
-                        case 0:
-                            dst[x] = 0;
-                            break;
-                        case 1:
-                            dst[x] = 0x55;
-                            break;
-                        case 2:
-                            dst[x] = 0xAA;
-                            break;
-                        case 3:
-                            dst[x] = 0xFF;
-                            break;
-                    }
+                    dst[x] = (src32[x] >> 30) * 0x55;
                 }
                 dst = SkTAddOffset<uint8_t>(dst, dstRB);
                 src32 = SkTAddOffset<const uint32_t>(src32, srcRB);
             }
-            break;
+            return true;
         }
-        case kARGB_4444_SkColorType: {
-            auto src16 = (const uint16_t*) src;
-            for (int y = 0; y < srcInfo.height(); y++) {
-                for (int x = 0; x < srcInfo.width(); x++) {
-                    dst[x] = SkPacked4444ToA32(src16[x]);
-                }
-                dst = SkTAddOffset<uint8_t>(dst, dstRB);
-                src16 = SkTAddOffset<const uint16_t>(src16, srcRB);
-            }
-            break;
-        }
+
         case kRGBA_F16_SkColorType: {
             auto src64 = (const uint64_t*) src;
             for (int y = 0; y < srcInfo.height(); y++) {
@@ -153,8 +146,9 @@
                 dst = SkTAddOffset<uint8_t>(dst, dstRB);
                 src64 = SkTAddOffset<const uint64_t>(src64, srcRB);
             }
-            break;
+            return true;
         }
+
         case kRGBA_F32_SkColorType: {
             auto rgba = (const float*)src;
             for (int y = 0; y < srcInfo.height(); y++) {
@@ -164,25 +158,22 @@
                 dst  = SkTAddOffset<uint8_t>(dst, dstRB);
                 rgba = SkTAddOffset<const float>(rgba, srcRB);
             }
-        } break;
-        default:
-            SkASSERT(false);
-            break;
+            return true;
+        }
     }
+    return false;
 }
 
 // Default: Use the pipeline.
 static void convert_with_pipeline(const SkImageInfo& dstInfo, void* dstRow, size_t dstRB,
-                                  const SkImageInfo& srcInfo, const void* srcRow, size_t srcRB) {
+                                  const SkImageInfo& srcInfo, const void* srcRow, size_t srcRB,
+                                  const SkColorSpaceXformSteps& steps) {
 
     SkJumper_MemoryCtx src = { (void*)srcRow, (int)(srcRB / srcInfo.bytesPerPixel()) },
                        dst = { (void*)dstRow, (int)(dstRB / dstInfo.bytesPerPixel()) };
 
     SkRasterPipeline_<256> pipeline;
     pipeline.append_load(srcInfo.colorType(), &src);
-
-    SkColorSpaceXformSteps steps{srcInfo.colorSpace(), srcInfo.alphaType(),
-                                 dstInfo.colorSpace(), dstInfo.alphaType()};
     steps.apply(&pipeline);
 
     // We'll dither if we're decreasing precision below 32-bit.
@@ -202,40 +193,18 @@
     pipeline.run(0,0, srcInfo.width(), srcInfo.height());
 }
 
-static bool swizzle_and_multiply_color_type(SkColorType ct) {
-    switch (ct) {
-        case kRGBA_8888_SkColorType:
-        case kBGRA_8888_SkColorType:
-            return true;
-        default:
-            return false;
-    }
-}
-
-void SkConvertPixels(const SkImageInfo& dstInfo, void* dstPixels, size_t dstRB,
+void SkConvertPixels(const SkImageInfo& dstInfo,       void* dstPixels, size_t dstRB,
                      const SkImageInfo& srcInfo, const void* srcPixels, size_t srcRB) {
     SkASSERT(dstInfo.dimensions() == srcInfo.dimensions());
     SkASSERT(SkImageInfoValidConversion(dstInfo, srcInfo));
 
-    // Fast Path 1: The memcpy() case.
-    if (can_memcpy(dstInfo, srcInfo)) {
-        SkRectMemcpy(dstPixels, dstRB, srcPixels, srcRB, dstInfo.minRowBytes(), dstInfo.height());
-        return;
-    }
+    SkColorSpaceXformSteps steps{srcInfo.colorSpace(), srcInfo.alphaType(),
+                                 dstInfo.colorSpace(), dstInfo.alphaType()};
 
-    // Fast Path 2: Simple swizzles and premuls.
-    if (swizzle_and_multiply_color_type(srcInfo.colorType()) &&
-        swizzle_and_multiply_color_type(dstInfo.colorType()) && !dstInfo.colorSpace()) {
-        swizzle_and_multiply(dstInfo, dstPixels, dstRB, srcInfo, srcPixels, srcRB);
-        return;
+    for (auto fn : {rect_memcpy, swizzle_and_multiply, convert_to_alpha8}) {
+        if (fn(dstInfo, dstPixels, dstRB, srcInfo, srcPixels, srcRB, steps)) {
+            return;
+        }
     }
-
-    // Fast Path 3: Alpha 8 dsts.
-    if (kAlpha_8_SkColorType == dstInfo.colorType()) {
-        convert_to_alpha8((uint8_t*) dstPixels, dstRB, srcInfo, srcPixels, srcRB);
-        return;
-    }
-
-    // Default: Use the pipeline.
-    convert_with_pipeline(dstInfo, dstPixels, dstRB, srcInfo, srcPixels, srcRB);
+    convert_with_pipeline(dstInfo, dstPixels, dstRB, srcInfo, srcPixels, srcRB, steps);
 }
diff --git a/src/core/SkOpts.h b/src/core/SkOpts.h
index 7f4e066..862afac 100644
--- a/src/core/SkOpts.h
+++ b/src/core/SkOpts.h
@@ -33,17 +33,19 @@
     extern void (*blit_row_s32a_opaque)(SkPMColor*, const SkPMColor*, int, U8CPU);
 
     // Swizzle input into some sort of 8888 pixel, {premul,unpremul} x {rgba,bgra}.
-    typedef void (*Swizzle_8888)(uint32_t*, const void*, int);
-    extern Swizzle_8888 RGBA_to_BGRA,          // i.e. just swap RB
-                        RGBA_to_rgbA,          // i.e. just premultiply
-                        RGBA_to_bgrA,          // i.e. swap RB and premultiply
-                        RGB_to_RGB1,           // i.e. insert an opaque alpha
-                        RGB_to_BGR1,           // i.e. swap RB and insert an opaque alpha
-                        gray_to_RGB1,          // i.e. expand to color channels + an opaque alpha
-                        grayA_to_RGBA,         // i.e. expand to color channels
-                        grayA_to_rgbA,         // i.e. expand to color channels and premultiply
-                        inverted_CMYK_to_RGB1, // i.e. convert color space
-                        inverted_CMYK_to_BGR1; // i.e. convert color space
+    typedef void (*Swizzle_8888_u32)(uint32_t*, const uint32_t*, int);
+    extern Swizzle_8888_u32 RGBA_to_BGRA,          // i.e. just swap RB
+                            RGBA_to_rgbA,          // i.e. just premultiply
+                            RGBA_to_bgrA,          // i.e. swap RB and premultiply
+                            inverted_CMYK_to_RGB1, // i.e. convert color space
+                            inverted_CMYK_to_BGR1; // i.e. convert color space
+
+    typedef void (*Swizzle_8888_u8)(uint32_t*, const uint8_t*, int);
+    extern Swizzle_8888_u8 RGB_to_RGB1,     // i.e. insert an opaque alpha
+                           RGB_to_BGR1,     // i.e. swap RB and insert an opaque alpha
+                           gray_to_RGB1,    // i.e. expand to color channels + an opaque alpha
+                           grayA_to_RGBA,   // i.e. expand to color channels
+                           grayA_to_rgbA;   // i.e. expand to color channels and premultiply
 
     extern void (*memset16)(uint16_t[], uint16_t, int);
     extern void SK_API (*memset32)(uint32_t[], uint32_t, int);
diff --git a/src/gpu/GrDrawOpAtlas.cpp b/src/gpu/GrDrawOpAtlas.cpp
index d891c02..9c994a0 100644
--- a/src/gpu/GrDrawOpAtlas.cpp
+++ b/src/gpu/GrDrawOpAtlas.cpp
@@ -112,7 +112,7 @@
     // copy into the data buffer, swizzling as we go if this is ARGB data
     if (4 == fBytesPerPixel && kSkia8888_GrPixelConfig == kBGRA_8888_GrPixelConfig) {
         for (int i = 0; i < height; ++i) {
-            SkOpts::RGBA_to_BGRA(reinterpret_cast<uint32_t*>(dataPtr), imagePtr, width);
+            SkOpts::RGBA_to_BGRA((uint32_t*)dataPtr, (const uint32_t*)imagePtr, width);
             dataPtr += fBytesPerPixel * fWidth;
             imagePtr += rowBytes;
         }
diff --git a/src/opts/SkSwizzler_opts.h b/src/opts/SkSwizzler_opts.h
index 892dc31..82eb7b6 100644
--- a/src/opts/SkSwizzler_opts.h
+++ b/src/opts/SkSwizzler_opts.h
@@ -20,8 +20,7 @@
 
 namespace SK_OPTS_NS {
 
-static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
-    auto src = (const uint32_t*)vsrc;
+static void RGBA_to_rgbA_portable(uint32_t* dst, const uint32_t* src, int count) {
     for (int i = 0; i < count; i++) {
         uint8_t a = src[i] >> 24,
                 b = src[i] >> 16,
@@ -37,8 +36,7 @@
     }
 }
 
-static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {
-    auto src = (const uint32_t*)vsrc;
+static void RGBA_to_bgrA_portable(uint32_t* dst, const uint32_t* src, int count) {
     for (int i = 0; i < count; i++) {
         uint8_t a = src[i] >> 24,
                 b = src[i] >> 16,
@@ -54,8 +52,7 @@
     }
 }
 
-static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) {
-    auto src = (const uint32_t*)vsrc;
+static void RGBA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
     for (int i = 0; i < count; i++) {
         uint8_t a = src[i] >> 24,
                 b = src[i] >> 16,
@@ -68,8 +65,7 @@
     }
 }
 
-static void RGB_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
-    const uint8_t* src = (const uint8_t*)vsrc;
+static void RGB_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
     for (int i = 0; i < count; i++) {
         uint8_t r = src[0],
                 g = src[1],
@@ -82,8 +78,7 @@
     }
 }
 
-static void RGB_to_BGR1_portable(uint32_t dst[], const void* vsrc, int count) {
-    const uint8_t* src = (const uint8_t*)vsrc;
+static void RGB_to_BGR1_portable(uint32_t dst[], const uint8_t* src, int count) {
     for (int i = 0; i < count; i++) {
         uint8_t r = src[0],
                 g = src[1],
@@ -96,8 +91,7 @@
     }
 }
 
-static void gray_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
-    const uint8_t* src = (const uint8_t*)vsrc;
+static void gray_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
     for (int i = 0; i < count; i++) {
         dst[i] = (uint32_t)0xFF   << 24
                | (uint32_t)src[i] << 16
@@ -106,8 +100,7 @@
     }
 }
 
-static void grayA_to_RGBA_portable(uint32_t dst[], const void* vsrc, int count) {
-    const uint8_t* src = (const uint8_t*)vsrc;
+static void grayA_to_RGBA_portable(uint32_t dst[], const uint8_t* src, int count) {
     for (int i = 0; i < count; i++) {
         uint8_t g = src[0],
                 a = src[1];
@@ -119,8 +112,7 @@
     }
 }
 
-static void grayA_to_rgbA_portable(uint32_t dst[], const void* vsrc, int count) {
-    const uint8_t* src = (const uint8_t*)vsrc;
+static void grayA_to_rgbA_portable(uint32_t dst[], const uint8_t* src, int count) {
     for (int i = 0; i < count; i++) {
         uint8_t g = src[0],
                 a = src[1];
@@ -133,8 +125,7 @@
     }
 }
 
-static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const void* vsrc, int count) {
-    const uint32_t* src = (const uint32_t*)vsrc;
+static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const uint32_t* src, int count) {
     for (int i = 0; i < count; i++) {
         uint8_t k = src[i] >> 24,
                 y = src[i] >> 16,
@@ -151,8 +142,7 @@
     }
 }
 
-static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const void* vsrc, int count) {
-    const uint32_t* src = (const uint32_t*)vsrc;
+static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const uint32_t* src, int count) {
     for (int i = 0; i < count; i++) {
         uint8_t k = src[i] >> 24,
                 y = src[i] >> 16,
@@ -200,8 +190,7 @@
 }
 
 template <bool kSwapRB>
-static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
-    auto src = (const uint32_t*)vsrc;
+static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count) {
     while (count >= 8) {
         // Load 8 pixels.
         uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
@@ -237,17 +226,16 @@
     proc(dst, src, count);
 }
 
-/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
+/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
     premul_should_swapRB<false>(dst, src, count);
 }
 
-/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
+/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
     premul_should_swapRB<true>(dst, src, count);
 }
 
-/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
+/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
     using std::swap;
-    auto src = (const uint32_t*)vsrc;
     while (count >= 16) {
         // Load 16 pixels.
         uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);
@@ -280,8 +268,7 @@
 }
 
 template <bool kSwapRB>
-static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
-    const uint8_t* src = (const uint8_t*) vsrc;
+static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int count) {
     while (count >= 16) {
         // Load 16 pixels.
         uint8x16x3_t rgb = vld3q_u8(src);
@@ -333,16 +320,15 @@
     proc(dst, src, count);
 }
 
-/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
+/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
     insert_alpha_should_swaprb<false>(dst, src, count);
 }
 
-/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
+/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
     insert_alpha_should_swaprb<true>(dst, src, count);
 }
 
-/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
-    const uint8_t* src = (const uint8_t*) vsrc;
+/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
     while (count >= 16) {
         // Load 16 pixels.
         uint8x16_t gray = vld1q_u8(src);
@@ -383,8 +369,7 @@
 }
 
 template <bool kPremul>
-static void expand_grayA(uint32_t dst[], const void* vsrc, int count) {
-    const uint8_t* src = (const uint8_t*) vsrc;
+static void expand_grayA(uint32_t dst[], const uint8_t* src, int count) {
     while (count >= 16) {
         // Load 16 pixels.
         uint8x16x2_t ga = vld2q_u8(src);
@@ -437,18 +422,17 @@
     proc(dst, src, count);
 }
 
-/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
+/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
     expand_grayA<false>(dst, src, count);
 }
 
-/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
+/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
     expand_grayA<true>(dst, src, count);
 }
 
 enum Format { kRGB1, kBGR1 };
 template <Format format>
-static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
-    auto src = (const uint32_t*)vsrc;
+static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
     while (count >= 8) {
         // Load 8 cmyk pixels.
         uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);
@@ -485,11 +469,11 @@
     proc(dst, src, count);
 }
 
-/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
+/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
     inverted_cmyk_to<kRGB1>(dst, src, count);
 }
 
-/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
+/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
     inverted_cmyk_to<kBGR1>(dst, src, count);
 }
 
@@ -506,8 +490,7 @@
 }
 
 template <bool kSwapRB>
-static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
-    auto src = (const uint32_t*)vsrc;
+static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count) {
 
     auto premul8 = [](__m128i* lo, __m128i* hi) {
         const __m128i zeros = _mm_setzero_si128();
@@ -574,16 +557,15 @@
     proc(dst, src, count);
 }
 
-/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
+/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
     premul_should_swapRB<false>(dst, src, count);
 }
 
-/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
+/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
     premul_should_swapRB<true>(dst, src, count);
 }
 
-/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
-    auto src = (const uint32_t*)vsrc;
+/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
     const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
 
     while (count >= 4) {
@@ -600,9 +582,7 @@
 }
 
 template <bool kSwapRB>
-static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
-    const uint8_t* src = (const uint8_t*) vsrc;
-
+static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int count) {
     const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
     __m128i expand;
     const uint8_t X = 0xFF; // Used a placeholder.  The value of X is irrelevant.
@@ -634,17 +614,15 @@
     proc(dst, src, count);
 }
 
-/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
+/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
     insert_alpha_should_swaprb<false>(dst, src, count);
 }
 
-/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
+/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
     insert_alpha_should_swaprb<true>(dst, src, count);
 }
 
-/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
-    const uint8_t* src = (const uint8_t*) vsrc;
-
+/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
     const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
     while (count >= 16) {
         __m128i grays = _mm_loadu_si128((const __m128i*) src);
@@ -672,8 +650,7 @@
     gray_to_RGB1_portable(dst, src, count);
 }
 
-/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const void* vsrc, int count) {
-    const uint8_t* src = (const uint8_t*) vsrc;
+/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
     while (count >= 8) {
         __m128i ga = _mm_loadu_si128((const __m128i*) src);
 
@@ -694,8 +671,7 @@
     grayA_to_RGBA_portable(dst, src, count);
 }
 
-/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const void* vsrc, int count) {
-    const uint8_t* src = (const uint8_t*) vsrc;
+/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
     while (count >= 8) {
         __m128i grayA = _mm_loadu_si128((const __m128i*) src);
 
@@ -725,9 +701,7 @@
 
 enum Format { kRGB1, kBGR1 };
 template <Format format>
-static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
-    auto src = (const uint32_t*)vsrc;
-
+static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
     auto convert8 = [](__m128i* lo, __m128i* hi) {
         const __m128i zeros = _mm_setzero_si128();
         __m128i planar;
@@ -792,53 +766,53 @@
     proc(dst, src, count);
 }
 
-/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
+/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
     inverted_cmyk_to<kRGB1>(dst, src, count);
 }
 
-/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
+/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
     inverted_cmyk_to<kBGR1>(dst, src, count);
 }
 
 #else
 
-/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
+/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
     RGBA_to_rgbA_portable(dst, src, count);
 }
 
-/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
+/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
     RGBA_to_bgrA_portable(dst, src, count);
 }
 
-/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {
+/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
     RGBA_to_BGRA_portable(dst, src, count);
 }
 
-/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
+/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
     RGB_to_RGB1_portable(dst, src, count);
 }
 
-/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
+/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
     RGB_to_BGR1_portable(dst, src, count);
 }
 
-/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const void* src, int count) {
+/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
     gray_to_RGB1_portable(dst, src, count);
 }
 
-/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
+/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
     grayA_to_RGBA_portable(dst, src, count);
 }
 
-/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
+/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
     grayA_to_rgbA_portable(dst, src, count);
 }
 
-/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
+/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
     inverted_CMYK_to_RGB1_portable(dst, src, count);
 }
 
-/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
+/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
     inverted_CMYK_to_BGR1_portable(dst, src, count);
 }