Add SSE4 optimization of S32A_Opaque_Blitrow
Adds optimization of Skia S32A_Opaque_Blitrow blitter using SSE4.2 SIMD
instruction set. Special case for when alpha is zero or opaque.
Performance increase of 10%-400% compared to the existing SSE2
optimization (measured on Silvermont architecture).
Noticeable in ~25 different skia bench subtests, especially in
bitmap_8888_*, repeatTile_*, and morph_*.
bitmap_8888_A - 100% faster
bitmap_8888_A_source_transparent - 250% faster
bitmap_8888_A_source_opaque - 25% faster
bitmap_8888_A_scale_bicubic - 75% faster
Signed-off-by: Henrik Smiding <henrik.smiding@intel.com>
Committed: https://skia.googlesource.com/skia/+/e2527b147679b0c43019fae7d59cc3777d2d097e
Committed: https://skia.googlesource.com/skia/+/b5c281e1e06af3be804309877de1dac6145686b9
R=reed@google.com, mtklein@google.com, tomhudson@google.com, djsollen@google.com, joakim.landberg@intel.com
Author: henrik.smiding@intel.com
Review URL: https://codereview.chromium.org/289473009
diff --git a/src/opts/opts_check_x86.cpp b/src/opts/opts_check_x86.cpp
index 6af4772..b949698 100644
--- a/src/opts/opts_check_x86.cpp
+++ b/src/opts/opts_check_x86.cpp
@@ -12,6 +12,7 @@
#include "SkBlitRect_opts_SSE2.h"
#include "SkBlitRow.h"
#include "SkBlitRow_opts_SSE2.h"
+#include "SkBlitRow_opts_SSE4.h"
#include "SkBlurImage_opts_SSE2.h"
#include "SkMorphology_opts.h"
#include "SkMorphology_opts_SSE2.h"
@@ -82,6 +83,8 @@
getcpuid(1, cpu_info);
if ((cpu_info[2] & (1<<20)) != 0) {
return SK_CPU_SSE_LEVEL_SSE42;
+ } else if ((cpu_info[2] & (1<<19)) != 0) {
+ return SK_CPU_SSE_LEVEL_SSE41;
} else if ((cpu_info[2] & (1<<9)) != 0) {
return SK_CPU_SSE_LEVEL_SSSE3;
} else if ((cpu_info[3] & (1<<26)) != 0) {
@@ -206,16 +209,30 @@
}
}
-static SkBlitRow::Proc32 platform_32_procs[] = {
+static SkBlitRow::Proc32 platform_32_procs_SSE2[] = {
NULL, // S32_Opaque,
S32_Blend_BlitRow32_SSE2, // S32_Blend,
S32A_Opaque_BlitRow32_SSE2, // S32A_Opaque
S32A_Blend_BlitRow32_SSE2, // S32A_Blend,
};
+#if defined(SK_ATT_ASM_SUPPORTED)
+static SkBlitRow::Proc32 platform_32_procs_SSE4[] = {
+ NULL, // S32_Opaque,
+ S32_Blend_BlitRow32_SSE2, // S32_Blend,
+ S32A_Opaque_BlitRow32_SSE4_asm, // S32A_Opaque
+ S32A_Blend_BlitRow32_SSE2, // S32A_Blend,
+};
+#endif
+
SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
+#if defined(SK_ATT_ASM_SUPPORTED)
+ if (supports_simd(SK_CPU_SSE_LEVEL_SSE41)) {
+ return platform_32_procs_SSE4[flags];
+ } else
+#endif
if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
- return platform_32_procs[flags];
+ return platform_32_procs_SSE2[flags];
} else {
return NULL;
}