small ABI + narrow/wide code updates

The only tangible effect this CL should have is to use __vectorcall on
all Windows builds, including scalar ones.  Code generation there is a
little better with __vectorcall than without it, so we might as well
use it.  This is a step towards vector stages with MSVC, but only a
very small one.

Mostly this refactors and regroups a bunch of logic to reflect my
current thoughts.  The BUILD.gn changes are essentially no-ops, but they
simplify things and make our flags more similar to how those targets are
built in Chromium.

(I also cleaned up the other uses of the /arch: flag so that this works.)

Change-Id: I73dd39d15cdc7b3d268231a707952bbbfd91496e
Reviewed-on: https://skia-review.googlesource.com/115644
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>
diff --git a/BUILD.gn b/BUILD.gn
index 7262034..2be010e 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -303,7 +303,7 @@
 opts("avx") {
   enabled = is_x86
   sources = skia_opts.avx_sources
-  if (!is_clang && is_win) {
+  if (is_win) {
     cflags = [ "/arch:AVX" ]
   } else {
     cflags = [ "-mavx" ]
@@ -313,14 +313,10 @@
 opts("hsw") {
   enabled = is_x86
   sources = skia_opts.hsw_sources
-  if (!is_clang && is_win) {
+  if (is_win) {
     cflags = [ "/arch:AVX2" ]
   } else {
-    cflags = [
-      "-mavx2",
-      "-mf16c",
-      "-mfma",
-    ]
+    cflags = [ "-march=haswell" ]
   }
 
   # Oddly, clang-cl doesn't recognize this as a valid flag.
diff --git a/gn/BUILD.gn b/gn/BUILD.gn
index 8b4f953..e7fffa2 100644
--- a/gn/BUILD.gn
+++ b/gn/BUILD.gn
@@ -112,9 +112,7 @@
       "-mfpu=neon",
       "-mthumb",
     ]
-  } else if (current_cpu == "x86" && is_win) {
-    cflags += [ "/arch:SSE2" ]
-  } else if (current_cpu == "x86") {
+  } else if (current_cpu == "x86" && !is_win) {
     asmflags += [ "-m32" ]
     cflags += [
       "-m32",
diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 73624d1..5a2c5b2 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -700,28 +700,32 @@
 //    tail != 0 ~~> work on only the first tail pixels
 // tail is always < N.
 
-// Any custom ABI to use for all non-externally-facing stage functions.
-#if defined(__ARM_NEON) && defined(__arm__)
+// Any custom ABI to use for all (non-externally-facing) stage functions?
+// Also decide here whether to use narrow (compromise) or wide (ideal) stages.
+#if defined(__arm__) && defined(__ARM_NEON)
     // This lets us pass vectors more efficiently on 32-bit ARM.
+    // We can still only pass 16 floats, so best as 4x {r,g,b,a}.
     #define ABI __attribute__((pcs("aapcs-vfp")))
-#elif 0 || defined(__clang__) && defined(_MSC_VER)
-    // TODO: can we use sysv_abi here instead?  It'd allow passing far more registers.
-    #define ABI __attribute__((vectorcall))
-#else
-    #define ABI
-#endif
-
-// On 32-bit x86 we've only got 8 xmm registers, so we keep the 4 hottest (r,g,b,a)
-// in registers and the d-registers on the stack (giving us 4 temporary registers).
-// General-purpose registers are also tight, so we put most of those on the stack too.
-//
-// On ARMv7, we do the same so that we can make the r,g,b,a vectors wider.
-//
-// Finally, this narrower stage calling convention also fits Windows' __vectorcall very well.
-#if 0 || defined(__i386__) || defined(_M_IX86) || defined(__arm__) || defined(_MSC_VER)
     #define JUMPER_NARROW_STAGES 1
-#else
+#elif 0 && defined(_MSC_VER) && defined(__clang__) && defined(__x86_64__)
+    // SysV ABI makes it very sensible to use wide stages with clang-cl.
+    // TODO: crashes during compilation  :(
+    #define ABI __attribute__((sysv_abi))
     #define JUMPER_NARROW_STAGES 0
+#elif defined(_MSC_VER)
+    // Even if not vectorized, this lets us pass {r,g,b,a} as registers,
+    // instead of {b,a} on the stack.  Narrow stages work best for __vectorcall.
+    #define ABI __vectorcall
+    #define JUMPER_NARROW_STAGES 1
+#elif defined(__x86_64__) || defined(__aarch64__)
+    // These platforms are ideal for wider stages, and their default ABI is ideal.
+    #define ABI
+    #define JUMPER_NARROW_STAGES 0
+#else
+    // 32-bit or unknown... shunt them down the narrow path.
+    // Odds are these have few registers and are better off there.
+    #define ABI
+    #define JUMPER_NARROW_STAGES 1
 #endif
 
 #if JUMPER_NARROW_STAGES
@@ -767,7 +771,7 @@
     #define STAGE(name, ...)                                                    \
         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,        \
                          F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);   \
-        static ABI void name(Params* params, void** program,                    \
+        static void ABI name(Params* params, void** program,                    \
                              F r, F g, F b, F a) {                              \
             name##_k(Ctx{program},params->dx,params->dy,params->tail, r,g,b,a,  \
                      params->dr, params->dg, params->db, params->da);           \
@@ -780,7 +784,7 @@
     #define STAGE(name, ...)                                                         \
         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,             \
                          F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);        \
-        static ABI void name(size_t tail, void** program, size_t dx, size_t dy,      \
+        static void ABI name(size_t tail, void** program, size_t dx, size_t dy,      \
                              F r, F g, F b, F a, F dr, F dg, F db, F da) {           \
             name##_k(Ctx{program},dx,dy,tail, r,g,b,a, dr,dg,db,da);                 \
             auto next = (Stage)load_and_inc(program);                                \
@@ -794,9 +798,9 @@
 // just_return() is a simple no-op stage that only exists to end the chain,
 // returning back up to start_pipeline(), and from there to the caller.
 #if JUMPER_NARROW_STAGES
-    static ABI void just_return(Params*, void**, F,F,F,F) {}
+    static void ABI just_return(Params*, void**, F,F,F,F) {}
 #else
-    static ABI void just_return(size_t, void**, size_t,size_t, F,F,F,F, F,F,F,F) {}
+    static void ABI just_return(size_t, void**, size_t,size_t, F,F,F,F, F,F,F,F) {}
 #endif
 
 
@@ -2375,9 +2379,9 @@
 }
 
 #if JUMPER_NARROW_STAGES
-    static ABI void just_return(Params*, void**, U16,U16,U16,U16) {}
+    static void ABI just_return(Params*, void**, U16,U16,U16,U16) {}
 #else
-    static ABI void just_return(size_t,void**,size_t,size_t, U16,U16,U16,U16, U16,U16,U16,U16) {}
+    static void ABI just_return(size_t,void**,size_t,size_t, U16,U16,U16,U16, U16,U16,U16,U16) {}
 #endif
 
 // All stages use the same function call ABI to chain into each other, but there are three types:
@@ -2393,7 +2397,7 @@
 #if JUMPER_NARROW_STAGES
     #define STAGE_GG(name, ...)                                                            \
         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y);      \
-        static ABI void name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
+        static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
             auto x = join<F>(r,g),                                                         \
                  y = join<F>(b,a);                                                         \
             name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y);               \
@@ -2408,7 +2412,7 @@
         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \
                          U16&  r, U16&  g, U16&  b, U16&  a,                               \
                          U16& dr, U16& dg, U16& db, U16& da);                              \
-        static ABI void name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
+        static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
             auto x = join<F>(r,g),                                                         \
                  y = join<F>(b,a);                                                         \
             name##_k(Ctx{program}, params->dx,params->dy,params->tail, x,y, r,g,b,a,       \
@@ -2424,7 +2428,7 @@
         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \
                          U16&  r, U16&  g, U16&  b, U16&  a,                               \
                          U16& dr, U16& dg, U16& db, U16& da);                              \
-        static ABI void name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
+        static void ABI name(Params* params, void** program, U16 r, U16 g, U16 b, U16 a) { \
             name##_k(Ctx{program}, params->dx,params->dy,params->tail, r,g,b,a,            \
                      params->dr,params->dg,params->db,params->da);                         \
             auto next = (Stage)load_and_inc(program);                                      \
@@ -2436,7 +2440,7 @@
 #else
     #define STAGE_GG(name, ...)                                                            \
         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F& x, F& y);      \
-        static ABI void name(size_t tail, void** program, size_t dx, size_t dy,            \
+        static void ABI name(size_t tail, void** program, size_t dx, size_t dy,            \
                              U16  r, U16  g, U16  b, U16  a,                               \
                              U16 dr, U16 dg, U16 db, U16 da) {                             \
             auto x = join<F>(r,g),                                                         \
@@ -2453,7 +2457,7 @@
         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail, F x, F y,         \
                          U16&  r, U16&  g, U16&  b, U16&  a,                               \
                          U16& dr, U16& dg, U16& db, U16& da);                              \
-        static ABI void name(size_t tail, void** program, size_t dx, size_t dy,            \
+        static void ABI name(size_t tail, void** program, size_t dx, size_t dy,            \
                              U16  r, U16  g, U16  b, U16  a,                               \
                              U16 dr, U16 dg, U16 db, U16 da) {                             \
             auto x = join<F>(r,g),                                                         \
@@ -2470,7 +2474,7 @@
         SI void name##_k(__VA_ARGS__, size_t dx, size_t dy, size_t tail,                   \
                          U16&  r, U16&  g, U16&  b, U16&  a,                               \
                          U16& dr, U16& dg, U16& db, U16& da);                              \
-        static ABI void name(size_t tail, void** program, size_t dx, size_t dy,            \
+        static void ABI name(size_t tail, void** program, size_t dx, size_t dy,            \
                              U16  r, U16  g, U16  b, U16  a,                               \
                              U16 dr, U16 dg, U16 db, U16 da) {                             \
             name##_k(Ctx{program}, dx,dy,tail, r,g,b,a, dr,dg,db,da);                      \