Refactor swizzle names and types.

  - Plant a flag to say "pretend all the inputs are RGBA".
    This is how libpng thinks.
    This is the opposite of what the implementation had been doing
    (its variable names pretended the input was BGRA),
    so I've rearranged everything to reflect the new orientation.

  - Rewrite the names to be less mysterious-looking.  No more Xs:
    e.g. premul_xxxa is now RGBA_to_rgbA.

  - Make the src type uniformly const void*, to allow for 888 (RGB) srcs.

This should be performance- and pixel-neutral.  (Please revert if it's not.)

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1626463002
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review URL: https://codereview.chromium.org/1626463002
diff --git a/bench/SwizzleBench.cpp b/bench/SwizzleBench.cpp
index 13b2003..922c276 100644
--- a/bench/SwizzleBench.cpp
+++ b/bench/SwizzleBench.cpp
@@ -10,7 +10,7 @@
 
 class SwizzleBench : public Benchmark {
 public:
-    SwizzleBench(const char* name, SkOpts::Swizzle_8888_8888 fn) : fName(name), fFn(fn) {}
+    SwizzleBench(const char* name, SkOpts::Swizzle_8888 fn) : fName(name), fFn(fn) {}
 
     bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }
     const char* onGetName() override { return fName; }
@@ -23,10 +23,10 @@
     }
 private:
     const char* fName;
-    SkOpts::Swizzle_8888_8888 fFn;
+    SkOpts::Swizzle_8888 fFn;
 };
 
 
-DEF_BENCH(return new SwizzleBench("SkOpts::premul_xxxa", SkOpts::premul_xxxa));
-DEF_BENCH(return new SwizzleBench("SkOpts::swaprb_xxxa", SkOpts::swaprb_xxxa));
-DEF_BENCH(return new SwizzleBench("SkOpts::premul_swaprb_xxxa", SkOpts::premul_swaprb_xxxa));
+DEF_BENCH(return new SwizzleBench("SkOpts::RGBA_to_rgbA", SkOpts::RGBA_to_rgbA));
+DEF_BENCH(return new SwizzleBench("SkOpts::RGBA_to_bgrA", SkOpts::RGBA_to_bgrA));
+DEF_BENCH(return new SwizzleBench("SkOpts::RGBA_to_BGRA", SkOpts::RGBA_to_BGRA));
diff --git a/src/codec/SkSwizzler.cpp b/src/codec/SkSwizzler.cpp
index 24cb65f..f84b83e 100644
--- a/src/codec/SkSwizzler.cpp
+++ b/src/codec/SkSwizzler.cpp
@@ -333,7 +333,7 @@
 
     // These swizzles trust that the alpha value is already 0xFF.
 #ifdef SK_PMCOLOR_IS_RGBA
-    SkOpts::swaprb_xxxa((uint32_t*) dst, (const uint32_t*) (src + offset), width);
+    SkOpts::RGBA_to_BGRA((uint32_t*) dst, src + offset, width);
 #else
     memcpy(dst, src + offset, width * bpp);
 #endif
@@ -361,9 +361,9 @@
     SkASSERT(deltaSrc == bpp);
 
 #ifdef SK_PMCOLOR_IS_RGBA
-    SkOpts::premul_swaprb_xxxa((uint32_t*) dst, (const uint32_t*) (src + offset), width);
+    SkOpts::RGBA_to_bgrA((uint32_t*) dst, src + offset, width);
 #else
-    SkOpts::premul_xxxa((uint32_t*) dst, (const uint32_t*) (src + offset), width);
+    SkOpts::RGBA_to_rgbA((uint32_t*) dst, src + offset, width);
 #endif
 }
 
@@ -419,9 +419,9 @@
     SkASSERT(deltaSrc == bpp);
 
 #ifdef SK_PMCOLOR_IS_RGBA
-    SkOpts::premul_xxxa((uint32_t*) dst, (const uint32_t*) (src + offset), width);
+    SkOpts::RGBA_to_rgbA((uint32_t*) dst, src + offset, width);
 #else
-    SkOpts::premul_swaprb_xxxa((uint32_t*) dst, (const uint32_t*) (src + offset), width);
+    SkOpts::RGBA_to_bgrA((uint32_t*) dst, src + offset, width);
 #endif
 }
 
@@ -450,7 +450,7 @@
 #ifdef SK_PMCOLOR_IS_RGBA
     memcpy(dst, src + offset, width * bpp);
 #else
-    SkOpts::swaprb_xxxa((uint32_t*) dst, (const uint32_t*) (src + offset), width);
+    SkOpts::RGBA_to_BGRA((uint32_t*) dst, src + offset, width);
 #endif
 }
 
diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp
index 5f1a36c..c078d85 100644
--- a/src/core/SkOpts.cpp
+++ b/src/core/SkOpts.cpp
@@ -81,9 +81,9 @@
     decltype(matrix_scale_translate) matrix_scale_translate = sk_default::matrix_scale_translate;
     decltype(matrix_affine)          matrix_affine          = sk_default::matrix_affine;
 
-    decltype(       premul_xxxa)        premul_xxxa = sk_default::       premul_xxxa;
-    decltype(       swaprb_xxxa)        swaprb_xxxa = sk_default::       swaprb_xxxa;
-    decltype(premul_swaprb_xxxa) premul_swaprb_xxxa = sk_default::premul_swaprb_xxxa;
+    decltype(RGBA_to_BGRA) RGBA_to_BGRA = sk_default::RGBA_to_BGRA;
+    decltype(RGBA_to_rgbA) RGBA_to_rgbA = sk_default::RGBA_to_rgbA;
+    decltype(RGBA_to_bgrA) RGBA_to_bgrA = sk_default::RGBA_to_bgrA;
 
     // Each Init_foo() is defined in src/opts/SkOpts_foo.cpp.
     void Init_ssse3();
diff --git a/src/core/SkOpts.h b/src/core/SkOpts.h
index 85e38fe..a622c1a 100644
--- a/src/core/SkOpts.h
+++ b/src/core/SkOpts.h
@@ -58,10 +58,11 @@
 
     extern SkMatrix::MapPtsProc matrix_translate, matrix_scale_translate, matrix_affine;
 
-    typedef void (*Swizzle_8888_8888)(uint32_t[], const uint32_t[], int);
-    extern Swizzle_8888_8888 premul_xxxa,  // BGRA -> bgrA or RGBA -> rgbA
-                             swaprb_xxxa,  // BGRA -> RGBA or RGBA -> BGRA
-                      premul_swaprb_xxxa;  // BGRA -> rgbA or RGBA -> bgrA
+    // Swizzle input into some sort of 8888 pixel, {premul,unpremul} x {rgba,bgra}.
+    typedef void (*Swizzle_8888)(uint32_t*, const void*, int);
+    extern Swizzle_8888 RGBA_to_BGRA,  // i.e. just swap RB
+                        RGBA_to_rgbA,  // i.e. just premultiply
+                        RGBA_to_bgrA;  // i.e. swap RB and premultiply
 }
 
 #endif//SkOpts_DEFINED
diff --git a/src/opts/SkOpts_neon.cpp b/src/opts/SkOpts_neon.cpp
index 3a07ebb..97af416 100644
--- a/src/opts/SkOpts_neon.cpp
+++ b/src/opts/SkOpts_neon.cpp
@@ -49,8 +49,8 @@
         matrix_scale_translate = sk_neon::matrix_scale_translate;
         matrix_affine          = sk_neon::matrix_affine;
 
-        premul_xxxa        = sk_neon::premul_xxxa;
-        premul_swaprb_xxxa = sk_neon::premul_swaprb_xxxa;
-        swaprb_xxxa        = sk_neon::swaprb_xxxa;
+        RGBA_to_BGRA = sk_neon::RGBA_to_BGRA;
+        RGBA_to_rgbA = sk_neon::RGBA_to_rgbA;
+        RGBA_to_bgrA = sk_neon::RGBA_to_bgrA;
     }
 }
diff --git a/src/opts/SkOpts_ssse3.cpp b/src/opts/SkOpts_ssse3.cpp
index 5378377..96e8493 100644
--- a/src/opts/SkOpts_ssse3.cpp
+++ b/src/opts/SkOpts_ssse3.cpp
@@ -18,8 +18,8 @@
         blit_mask_d32_a8 = sk_ssse3::blit_mask_d32_a8;
         color_cube_filter_span = sk_ssse3::color_cube_filter_span;
 
-        premul_xxxa = sk_ssse3::premul_xxxa;
-        premul_swaprb_xxxa = sk_ssse3::premul_swaprb_xxxa;
-        swaprb_xxxa = sk_ssse3::swaprb_xxxa;
+        RGBA_to_BGRA = sk_ssse3::RGBA_to_BGRA;
+        RGBA_to_rgbA = sk_ssse3::RGBA_to_rgbA;
+        RGBA_to_bgrA = sk_ssse3::RGBA_to_bgrA;
     }
 }
diff --git a/src/opts/SkSwizzler_opts.h b/src/opts/SkSwizzler_opts.h
index b0cf4ca..8d1be84 100644
--- a/src/opts/SkSwizzler_opts.h
+++ b/src/opts/SkSwizzler_opts.h
@@ -12,18 +12,33 @@
 
 namespace SK_OPTS_NS {
 
-// These variable names in these functions just pretend the input is BGRA.
-// They work fine with both RGBA and BGRA.
-
-static void premul_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) {
+static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
+    auto src = (const uint32_t*)vsrc;
     for (int i = 0; i < count; i++) {
         uint8_t a = src[i] >> 24,
-                r = src[i] >> 16,
+                b = src[i] >> 16,
                 g = src[i] >>  8,
-                b = src[i] >>  0;
-        r = (r*a+127)/255;
-        g = (g*a+127)/255;
+                r = src[i] >>  0;
         b = (b*a+127)/255;
+        g = (g*a+127)/255;
+        r = (r*a+127)/255;
+        dst[i] = (uint32_t)a << 24
+               | (uint32_t)b << 16
+               | (uint32_t)g <<  8
+               | (uint32_t)r <<  0;
+    }
+}
+
+static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {
+    auto src = (const uint32_t*)vsrc;
+    for (int i = 0; i < count; i++) {
+        uint8_t a = src[i] >> 24,
+                b = src[i] >> 16,
+                g = src[i] >>  8,
+                r = src[i] >>  0;
+        b = (b*a+127)/255;
+        g = (g*a+127)/255;
+        r = (r*a+127)/255;
         dst[i] = (uint32_t)a << 24
                | (uint32_t)r << 16
                | (uint32_t)g <<  8
@@ -31,32 +46,17 @@
     }
 }
 
-static void premul_swaprb_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) {
+static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) {
+    auto src = (const uint32_t*)vsrc;
     for (int i = 0; i < count; i++) {
         uint8_t a = src[i] >> 24,
-                r = src[i] >> 16,
+                b = src[i] >> 16,
                 g = src[i] >>  8,
-                b = src[i] >>  0;
-        r = (r*a+127)/255;
-        g = (g*a+127)/255;
-        b = (b*a+127)/255;
+                r = src[i] >>  0;
         dst[i] = (uint32_t)a << 24
-               | (uint32_t)b << 16
+               | (uint32_t)r << 16
                | (uint32_t)g <<  8
-               | (uint32_t)r <<  0;
-    }
-}
-
-static void swaprb_xxxa_portable(uint32_t dst[], const uint32_t src[], int count) {
-    for (int i = 0; i < count; i++) {
-        uint8_t a = src[i] >> 24,
-                r = src[i] >> 16,
-                g = src[i] >>  8,
-                b = src[i] >>  0;
-        dst[i] = (uint32_t)a << 24
-               | (uint32_t)b << 16
-               | (uint32_t)g <<  8
-               | (uint32_t)r <<  0;
+               | (uint32_t)b <<  0;
     }
 }
 
@@ -92,30 +92,31 @@
 }
 
 template <bool kSwapRB>
-static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) {
+static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
+    auto src = (const uint32_t*)vsrc;
     while (count >= 8) {
         // Load 8 pixels.
         uint8x8x4_t bgra = vld4_u8((const uint8_t*) src);
 
         uint8x8_t a = bgra.val[3],
-                  r = bgra.val[2],
+                  b = bgra.val[2],
                   g = bgra.val[1],
-                  b = bgra.val[0];
+                  r = bgra.val[0];
 
         // Premultiply.
-        r = scale(r, a);
-        g = scale(g, a);
         b = scale(b, a);
+        g = scale(g, a);
+        r = scale(r, a);
 
         // Store 8 premultiplied pixels.
         if (kSwapRB) {
-            bgra.val[2] = b;
-            bgra.val[1] = g;
-            bgra.val[0] = r;
-        } else {
             bgra.val[2] = r;
             bgra.val[1] = g;
             bgra.val[0] = b;
+        } else {
+            bgra.val[2] = b;
+            bgra.val[1] = g;
+            bgra.val[0] = r;
         }
         vst4_u8((uint8_t*) dst, bgra);
         src += 8;
@@ -124,19 +125,20 @@
     }
 
     // Call portable code to finish up the tail of [0,8) pixels.
-    auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable;
+    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
     proc(dst, src, count);
 }
 
-static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {
-    premul_xxxa_should_swaprb<false>(dst, src, count);
+static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
+    premul_should_swapRB<false>(dst, src, count);
 }
 
-static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
-    premul_xxxa_should_swaprb<true>(dst, src, count);
+static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
+    premul_should_swapRB<true>(dst, src, count);
 }
 
-static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
+static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
+    auto src = (const uint32_t*)vsrc;
     while (count >= 16) {
         // Load 16 pixels.
         uint8x16x4_t bgra = vld4q_u8((const uint8_t*) src);
@@ -165,13 +167,14 @@
         count -= 8;
     }
 
-    swaprb_xxxa_portable(dst, src, count);
+    RGBA_to_BGRA_portable(dst, src, count);
 }
 
 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
 
 template <bool kSwapRB>
-static void premul_xxxa_should_swaprb(uint32_t dst[], const uint32_t src[], int count) {
+static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
+    auto src = (const uint32_t*)vsrc;
 
     auto premul8 = [](__m128i* lo, __m128i* hi) {
         const __m128i zeros = _mm_setzero_si128();
@@ -185,27 +188,27 @@
         }
 
         // Swizzle the pixels to 8-bit planar.
-        *lo = _mm_shuffle_epi8(*lo, planar);                      // bbbbgggg rrrraaaa
-        *hi = _mm_shuffle_epi8(*hi, planar);                      // BBBBGGGG RRRRAAAA
-        __m128i bg = _mm_unpacklo_epi32(*lo, *hi),                // bbbbBBBB ggggGGGG
-                ra = _mm_unpackhi_epi32(*lo, *hi);                // rrrrRRRR aaaaAAAA
+        *lo = _mm_shuffle_epi8(*lo, planar);                      // rrrrgggg bbbbaaaa
+        *hi = _mm_shuffle_epi8(*hi, planar);                      // RRRRGGGG BBBBAAAA
+        __m128i rg = _mm_unpacklo_epi32(*lo, *hi),                // rrrrRRRR ggggGGGG
+                ba = _mm_unpackhi_epi32(*lo, *hi);                // bbbbBBBB aaaaAAAA
 
         // Unpack to 16-bit planar.
-        __m128i b = _mm_unpacklo_epi8(bg, zeros),                 // b_b_b_b_ B_B_B_B_
-                g = _mm_unpackhi_epi8(bg, zeros),                 // g_g_g_g_ G_G_G_G_
-                r = _mm_unpacklo_epi8(ra, zeros),                 // r_r_r_r_ R_R_R_R_
-                a = _mm_unpackhi_epi8(ra, zeros);                 // a_a_a_a_ A_A_A_A_
+        __m128i r = _mm_unpacklo_epi8(rg, zeros),                 // r_r_r_r_ R_R_R_R_
+                g = _mm_unpackhi_epi8(rg, zeros),                 // g_g_g_g_ G_G_G_G_
+                b = _mm_unpacklo_epi8(ba, zeros),                 // b_b_b_b_ B_B_B_B_
+                a = _mm_unpackhi_epi8(ba, zeros);                 // a_a_a_a_ A_A_A_A_
 
         // Premultiply!  (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
-        b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257);
-        g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257);
         r = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(r, a), _128), _257);
+        g = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(g, a), _128), _257);
+        b = _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(b, a), _128), _257);
 
         // Repack into interlaced pixels.
-        bg = _mm_or_si128(b, _mm_slli_epi16(g, 8));               // bgbgbgbg BGBGBGBG
-        ra = _mm_or_si128(r, _mm_slli_epi16(a, 8));               // rararara RARARARA
-        *lo = _mm_unpacklo_epi16(bg, ra);                         // bgrabgra bgrabgra
-        *hi = _mm_unpackhi_epi16(bg, ra);                         // BRGABGRA BGRABGRA
+        rg = _mm_or_si128(r, _mm_slli_epi16(g, 8));               // rgrgrgrg RGRGRGRG
+        ba = _mm_or_si128(b, _mm_slli_epi16(a, 8));               // babababa BABABABA
+        *lo = _mm_unpacklo_epi16(rg, ba);                         // rgbargba rgbargba
+        *hi = _mm_unpackhi_epi16(rg, ba);                         // RGBARGBA RGBARGBA
     };
 
     while (count >= 8) {
@@ -236,46 +239,47 @@
     }
 
     // Call portable code to finish up the tail of [0,4) pixels.
-    auto proc = kSwapRB ? premul_swaprb_xxxa_portable : premul_xxxa_portable;
+    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
     proc(dst, src, count);
 }
 
-static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {
-    premul_xxxa_should_swaprb<false>(dst, src, count);
+static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
+    premul_should_swapRB<false>(dst, src, count);
 }
 
-static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
-    premul_xxxa_should_swaprb<true>(dst, src, count);
+static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
+    premul_should_swapRB<true>(dst, src, count);
 }
 
-static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
+static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
+    auto src = (const uint32_t*)vsrc;
     const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
 
     while (count >= 4) {
-        __m128i bgra = _mm_loadu_si128((const __m128i*) src);
-        __m128i rgba = _mm_shuffle_epi8(bgra, swapRB);
-        _mm_storeu_si128((__m128i*) dst, rgba);
+        __m128i rgba = _mm_loadu_si128((const __m128i*) src);
+        __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
+        _mm_storeu_si128((__m128i*) dst, bgra);
 
         src += 4;
         dst += 4;
         count -= 4;
     }
 
-    swaprb_xxxa_portable(dst, src, count);
+    RGBA_to_BGRA_portable(dst, src, count);
 }
 
 #else
 
-static void premul_xxxa(uint32_t dst[], const uint32_t src[], int count) {
-    premul_xxxa_portable(dst, src, count);
+static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
+    RGBA_to_rgbA_portable(dst, src, count);
 }
 
-static void premul_swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
-    premul_swaprb_xxxa_portable(dst, src, count);
+static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
+    RGBA_to_bgrA_portable(dst, src, count);
 }
 
-static void swaprb_xxxa(uint32_t dst[], const uint32_t src[], int count) {
-    swaprb_xxxa_portable(dst, src, count);
+static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {
+    RGBA_to_BGRA_portable(dst, src, count);
 }
 
 #endif
diff --git a/tests/SwizzlerTest.cpp b/tests/SwizzlerTest.cpp
index f67cfee..e1626d5 100644
--- a/tests/SwizzlerTest.cpp
+++ b/tests/SwizzlerTest.cpp
@@ -132,28 +132,28 @@
     // forall c, c*255 == c, c*0 == 0
     for (int c = 0; c <= 255; c++) {
         src = (255<<24) | c;
-        SkOpts::premul_xxxa(&dst, &src, 1);
+        SkOpts::RGBA_to_rgbA(&dst, &src, 1);
         REPORTER_ASSERT(r, dst == src);
-        SkOpts::premul_swaprb_xxxa(&dst, &src, 1);
+        SkOpts::RGBA_to_bgrA(&dst, &src, 1);
         REPORTER_ASSERT(r, dst == (uint32_t)((255<<24) | (c<<16)));
 
         src = (0<<24) | c;
-        SkOpts::premul_xxxa(&dst, &src, 1);
+        SkOpts::RGBA_to_rgbA(&dst, &src, 1);
         REPORTER_ASSERT(r, dst == 0);
-        SkOpts::premul_swaprb_xxxa(&dst, &src, 1);
+        SkOpts::RGBA_to_bgrA(&dst, &src, 1);
         REPORTER_ASSERT(r, dst == 0);
     }
 
     // check a totally arbitrary color
     src = 0xFACEB004;
-    SkOpts::premul_xxxa(&dst, &src, 1);
+    SkOpts::RGBA_to_rgbA(&dst, &src, 1);
     REPORTER_ASSERT(r, dst == 0xFACAAD04);
 
     // swap red and blue
-    SkOpts::swaprb_xxxa(&dst, &src, 1);
+    SkOpts::RGBA_to_BGRA(&dst, &src, 1);
     REPORTER_ASSERT(r, dst == 0xFA04B0CE);
 
     // all together now
-    SkOpts::premul_swaprb_xxxa(&dst, &src, 1);
+    SkOpts::RGBA_to_bgrA(&dst, &src, 1);
     REPORTER_ASSERT(r, dst == 0xFA04ADCA);
 }