Add SSE4 version of BlurImage optimizations.

Adds an SSE4.1 version of the existing BlurImage optimizations.
The blur_image_filter_* benchmarks show a 10-50% performance
improvement on a Core i7 running Linux (Ubuntu).

Signed-off-by: Henrik Smiding <henrik.smiding@intel.com>

Committed: https://skia.googlesource.com/skia/+/2830632ce93c97ed7647b13348365ea92e4ea665

R=mtklein@google.com, reed@chromium.org

Author: henrik.smiding@intel.com

Review URL: https://codereview.chromium.org/366593004
diff --git a/gyp/opts.gyp b/gyp/opts.gyp
index 85c8c50..653006d 100644
--- a/gyp/opts.gyp
+++ b/gyp/opts.gyp
@@ -212,22 +212,39 @@
         '../src/core',
       ],
       'conditions': [
-        [ 'skia_os in ["linux", "freebsd", "openbsd", "solaris", "nacl", "chromeos", "android", "mac"] \
+        [ 'skia_os in ["linux", "freebsd", "openbsd", "solaris", "nacl", "chromeos", "android"] \
            and not skia_android_framework', {
           'cflags': [
             '-msse4',
           ],
         }],
-        [ 'skia_arch_width == 64 and skia_arch_type == "x86"', {
+        [ 'skia_os == "mac"', {
+          'xcode_settings': {
+            'OTHER_CPLUSPLUSFLAGS!': [
+              '-mssse3',
+            ],
+            'OTHER_CPLUSPLUSFLAGS': [
+              '-msse4',
+            ],
+          },
+        }],
+        [ 'skia_arch_type == "x86"', {
           'sources': [
-            '../src/opts/SkBlitRow_opts_SSE4_x64_asm.S',
+            '../src/opts/SkBlurImage_opts_SSE4.cpp',
+          ],
+          'conditions': [
+            [ 'skia_arch_width == 64', {
+              'sources': [
+                '../src/opts/SkBlitRow_opts_SSE4_x64_asm.S',
+              ],
+            }],
+            [ 'skia_arch_width == 32', {
+              'sources': [
+                '../src/opts/SkBlitRow_opts_SSE4_asm.S',
+              ],
+            }],
           ],
         }],
-        [ 'skia_arch_width == 32 and skia_arch_type == "x86"', {
-          'sources': [
-            '../src/opts/SkBlitRow_opts_SSE4_asm.S',
-          ],
-       }],
       ],
     },
     # NEON code must be compiled with -mfpu=neon which also affects scalar