Add SSE4 version of BlurImage optimizations.

Adds an SSE4.1 version of the existing BlurImage optimizations.
Performance of blur_image_filter_* benchmarks show a 10-50%
improvement on Linux/Ubuntu Core i7.

Signed-off-by: Henrik Smiding <henrik.smiding@intel.com>

Committed: https://skia.googlesource.com/skia/+/2830632ce93c97ed7647b13348365ea92e4ea665

R=mtklein@google.com, reed@chromium.org

Author: henrik.smiding@intel.com

Review URL: https://codereview.chromium.org/366593004
diff --git a/src/opts/opts_check_x86.cpp b/src/opts/opts_check_x86.cpp
index 603458b..5bab17a 100644
--- a/src/opts/opts_check_x86.cpp
+++ b/src/opts/opts_check_x86.cpp
@@ -15,6 +15,7 @@
 #include "SkBlitRow_opts_SSE2.h"
 #include "SkBlitRow_opts_SSE4.h"
 #include "SkBlurImage_opts_SSE2.h"
+#include "SkBlurImage_opts_SSE4.h"
 #include "SkMorphology_opts.h"
 #include "SkMorphology_opts_SSE2.h"
 #include "SkRTConf.h"
@@ -358,10 +359,13 @@
 #ifdef SK_DISABLE_BLUR_DIVISION_OPTIMIZATION
     return false;
 #else
-    if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
-        return false;
+    if (supports_simd(SK_CPU_SSE_LEVEL_SSE41)) {
+        return SkBoxBlurGetPlatformProcs_SSE4(boxBlurX, boxBlurY, boxBlurXY, boxBlurYX);
     }
-    return SkBoxBlurGetPlatformProcs_SSE2(boxBlurX, boxBlurY, boxBlurXY, boxBlurYX);
+    else if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
+        return SkBoxBlurGetPlatformProcs_SSE2(boxBlurX, boxBlurY, boxBlurXY, boxBlurYX);
+    }
+    return false;
 #endif
 }