Add SSE4 optimization of S32A_Opaque_Blitrow

Adds optimization of Skia S32A_Opaque_Blitrow blitter using SSE4.2 SIMD
instruction set. Special case for when alpha is zero or opaque.

Performance increase of 10%-400% compared to the existing SSE2
optimization (measured on Silvermont architecture).
Noticeable in ~25 different skia bench subtests, especially in
bitmap_8888_*, repeatTile_*, and morph_*.

bitmap_8888_A - 100% faster
bitmap_8888_A_source_transparent - 250% faster
bitmap_8888_A_source_opaque - 25% faster
bitmap_8888_A_scale_bicubic - 75% faster

Signed-off-by: Henrik Smiding <henrik.smiding@intel.com>

Committed: https://skia.googlesource.com/skia/+/e2527b147679b0c43019fae7d59cc3777d2d097e

Committed: https://skia.googlesource.com/skia/+/b5c281e1e06af3be804309877de1dac6145686b9

R=reed@google.com, mtklein@google.com, tomhudson@google.com, djsollen@google.com, joakim.landberg@intel.com

Author: henrik.smiding@intel.com

Review URL: https://codereview.chromium.org/289473009
diff --git a/src/opts/opts_check_x86.cpp b/src/opts/opts_check_x86.cpp
index 6af4772..b949698 100644
--- a/src/opts/opts_check_x86.cpp
+++ b/src/opts/opts_check_x86.cpp
@@ -12,6 +12,7 @@
 #include "SkBlitRect_opts_SSE2.h"
 #include "SkBlitRow.h"
 #include "SkBlitRow_opts_SSE2.h"
+#include "SkBlitRow_opts_SSE4.h"
 #include "SkBlurImage_opts_SSE2.h"
 #include "SkMorphology_opts.h"
 #include "SkMorphology_opts_SSE2.h"
@@ -82,6 +83,8 @@
     getcpuid(1, cpu_info);
     if ((cpu_info[2] & (1<<20)) != 0) {
         return SK_CPU_SSE_LEVEL_SSE42;
+    } else if ((cpu_info[2] & (1<<19)) != 0) {
+        return SK_CPU_SSE_LEVEL_SSE41;
     } else if ((cpu_info[2] & (1<<9)) != 0) {
         return SK_CPU_SSE_LEVEL_SSSE3;
     } else if ((cpu_info[3] & (1<<26)) != 0) {
@@ -206,16 +209,30 @@
     }
 }
 
-static SkBlitRow::Proc32 platform_32_procs[] = {
+static SkBlitRow::Proc32 platform_32_procs_SSE2[] = {
     NULL,                               // S32_Opaque,
     S32_Blend_BlitRow32_SSE2,           // S32_Blend,
     S32A_Opaque_BlitRow32_SSE2,         // S32A_Opaque
     S32A_Blend_BlitRow32_SSE2,          // S32A_Blend,
 };
 
+#if defined(SK_ATT_ASM_SUPPORTED)
+static SkBlitRow::Proc32 platform_32_procs_SSE4[] = {
+    NULL,                               // S32_Opaque,
+    S32_Blend_BlitRow32_SSE2,           // S32_Blend,
+    S32A_Opaque_BlitRow32_SSE4_asm,     // S32A_Opaque
+    S32A_Blend_BlitRow32_SSE2,          // S32A_Blend,
+};
+#endif
+
 SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
+#if defined(SK_ATT_ASM_SUPPORTED)
+    if (supports_simd(SK_CPU_SSE_LEVEL_SSE41)) {
+        return platform_32_procs_SSE4[flags];
+    } else
+#endif
     if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
-        return platform_32_procs[flags];
+        return platform_32_procs_SSE2[flags];
     } else {
         return NULL;
     }