Add SSE4 optimization of S32A_Opaque_Blitrow. Adds an optimized version of the Skia S32A_Opaque_Blitrow blitter using the SSE4.2 SIMD instruction set, with special cases for when alpha is zero or opaque. Performance increases by 10%-400% compared to the existing SSE2 optimization (measured on the Silvermont architecture). The gain is noticeable in ~25 different Skia bench subtests, especially in bitmap_8888_*, repeatTile_*, and morph_*: bitmap_8888_A - 100% faster; bitmap_8888_A_source_transparent - 250% faster; bitmap_8888_A_source_opaque - 25% faster; bitmap_8888_A_scale_bicubic - 75% faster. Signed-off-by: Henrik Smiding <henrik.smiding@intel.com> Committed: https://skia.googlesource.com/skia/+/e2527b147679b0c43019fae7d59cc3777d2d097e Committed: https://skia.googlesource.com/skia/+/b5c281e1e06af3be804309877de1dac6145686b9 R=reed@google.com, mtklein@google.com, tomhudson@google.com, djsollen@google.com, joakim.landberg@intel.com Author: henrik.smiding@intel.com Review URL: https://codereview.chromium.org/289473009

commit: 3bb195ef0d9691384027d7b61b0b8ef8379aaf5d [log] [tgz]
author: henrik.smiding <henrik.smiding@intel.com> Fri Jun 27 08:03:17 2014 -0700
committer: Commit bot <commit-bot@chromium.org> Fri Jun 27 08:03:17 2014 -0700
tree: f230b9c2bf5fb6b6309196c77f326a053110251e
parent: 982542dce8acbd2f3e7642268b21e76b93230daf [diff] [blame]
diff --git a/gyp/opts.gyp b/gyp/opts.gyp
index 69e3946..85c8c50 100644
--- a/gyp/opts.gyp
+++ b/gyp/opts.gyp

@@ -46,6 +46,7 @@
           ],
           'dependencies': [
             'opts_ssse3',
+            'opts_sse4',
           ],
           'sources': [
             '../src/opts/opts_check_x86.cpp',
@@ -194,10 +195,45 @@
         }],
       ],
     },
+    # For the same reasons as for skia_opts, we also have to create a
+    # separate target specifically for SSE4 code: we do not want to compile
+    # the SSE2 code with -msse4, which would potentially allow gcc to
+    # generate SSE4 instructions in it.
+    {
+      'target_name': 'opts_sse4',
+      'product_name': 'skia_opts_sse4',
+      'type': 'static_library',
+      'standalone_static_library': 1,
+      'dependencies': [
+        'core.gyp:*',
+        'effects.gyp:*'
+      ],
+      'include_dirs': [
+        '../src/core',
+      ],
+      'conditions': [
+        [ 'skia_os in ["linux", "freebsd", "openbsd", "solaris", "nacl", "chromeos", "android", "mac"] \
+           and not skia_android_framework', {
+          'cflags': [
+            '-msse4',
+          ],
+        }],
+        [ 'skia_arch_width == 64 and skia_arch_type == "x86"', {
+          'sources': [
+            '../src/opts/SkBlitRow_opts_SSE4_x64_asm.S',
+          ],
+        }],
+        [ 'skia_arch_width == 32 and skia_arch_type == "x86"', {
+          'sources': [
+            '../src/opts/SkBlitRow_opts_SSE4_asm.S',
+          ],
+       }],
+      ],
+    },
     # NEON code must be compiled with -mfpu=neon which also affects scalar
     # code. To support dynamic NEON code paths, we need to build all
     # NEON-specific sources in a separate static library. The situation
-    # is very similar to the SSSE3 one.
+    # is very similar to the SSSE3 and SSE4 ones.
     {
       'target_name': 'opts_neon',
       'product_name': 'skia_opts_neon',
commit	3bb195ef0d9691384027d7b61b0b8ef8379aaf5d	[log] [tgz]
author	henrik.smiding <henrik.smiding@intel.com>	Fri Jun 27 08:03:17 2014 -0700
committer	Commit bot <commit-bot@chromium.org>	Fri Jun 27 08:03:17 2014 -0700
tree	f230b9c2bf5fb6b6309196c77f326a053110251e
parent	982542dce8acbd2f3e7642268b21e76b93230daf [diff] [blame]