Properly enable S32_D16_filter_DX_SSE2 optimization.
Currently, the S32_D16_filter_DX_SSE2 optimization is only used in
configurations where the maximum SSE level is SSE2.
This patch enables it for higher levels and fixes a color
conversion bug when the subpixels are converted into RGB565 format.
It also refactors SkBitmapProcState::platformProcs() a bit, to make
future modifications less error-prone.
Author: henrik.smiding@intel.com
Signed-off-by: Henrik Smiding <henrik.smiding@intel.com>
Committed: http://code.google.com/p/skia/source/detail?r=14333
R=reed@google.com, mtklein@google.com, tomhudson@google.com, djsollen@google.com, joakim.landberg@intel.com
Author: henrik.smiding@intel.com
Review URL: https://codereview.chromium.org/239453010
git-svn-id: http://skia.googlecode.com/svn/trunk@14403 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/src/opts/SkBitmapProcState_opts_SSE2.cpp b/src/opts/SkBitmapProcState_opts_SSE2.cpp
index 0b07997..54a2f2d 100644
--- a/src/opts/SkBitmapProcState_opts_SSE2.cpp
+++ b/src/opts/SkBitmapProcState_opts_SSE2.cpp
@@ -9,6 +9,7 @@
#include <emmintrin.h>
#include "SkBitmapProcState_opts_SSE2.h"
+#include "SkColorPriv.h"
#include "SkPaint.h"
#include "SkUtils.h"
@@ -639,8 +640,8 @@
* It combines S32_opaque_D32_filter_DX_SSE2 and SkPixel32ToPixel16
*/
void S32_D16_filter_DX_SSE2(const SkBitmapProcState& s,
- const uint32_t* xy,
- int count, uint16_t* colors) {
+ const uint32_t* xy,
+ int count, uint16_t* colors) {
SkASSERT(count > 0 && colors != NULL);
SkASSERT(s.fFilterLevel != SkPaint::kNone_FilterLevel);
SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
@@ -744,23 +745,6 @@
// Extract low int and store.
dstColor = _mm_cvtsi128_si32(sum);
- //*colors++ = SkPixel32ToPixel16(dstColor);
- // below is much faster than the above. It's tested for Android benchmark--Softweg
- __m128i _m_temp1 = _mm_set1_epi32(dstColor);
- __m128i _m_temp2 = _mm_srli_epi32(_m_temp1, 3);
-
- unsigned int r32 = _mm_cvtsi128_si32(_m_temp2);
- unsigned r = (r32 & ((1<<5) -1)) << 11;
-
- _m_temp2 = _mm_srli_epi32(_m_temp2, 7);
- unsigned int g32 = _mm_cvtsi128_si32(_m_temp2);
- unsigned g = (g32 & ((1<<6) -1)) << 5;
-
- _m_temp2 = _mm_srli_epi32(_m_temp2, 9);
- unsigned int b32 = _mm_cvtsi128_si32(_m_temp2);
- unsigned b = (b32 & ((1<<5) -1));
-
- *colors++ = r | g | b;
-
+ *colors++ = SkPixel32ToPixel16(dstColor);
} while (--count > 0);
}
diff --git a/src/opts/opts_check_SSE2.cpp b/src/opts/opts_check_SSE2.cpp
index 5adfb6b..6c684c2 100644
--- a/src/opts/opts_check_SSE2.cpp
+++ b/src/opts/opts_check_SSE2.cpp
@@ -133,46 +133,54 @@
}
void SkBitmapProcState::platformProcs() {
- if (cachedHasSSSE3()) {
- if (fSampleProc32 == S32_opaque_D32_filter_DX) {
- fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3;
- } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
- fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3;
- }
+ /* Every optimization in the function requires at least SSE2 */
+ if (!cachedHasSSE2()) {
+ return;
+ }
- if (fSampleProc32 == S32_opaque_D32_filter_DXDY) {
- fSampleProc32 = S32_opaque_D32_filter_DXDY_SSSE3;
- } else if (fSampleProc32 == S32_alpha_D32_filter_DXDY) {
- fSampleProc32 = S32_alpha_D32_filter_DXDY_SSSE3;
- }
- } else if (cachedHasSSE2()) {
- if (fSampleProc32 == S32_opaque_D32_filter_DX) {
+ /* Check fSampleProc32 */
+ if (fSampleProc32 == S32_opaque_D32_filter_DX) {
+ if (cachedHasSSSE3()) {
+ fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3;
+ } else {
fSampleProc32 = S32_opaque_D32_filter_DX_SSE2;
- } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
+ }
+ } else if (fSampleProc32 == S32_opaque_D32_filter_DXDY) {
+ if (cachedHasSSSE3()) {
+ fSampleProc32 = S32_opaque_D32_filter_DXDY_SSSE3;
+ }
+ } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
+ if (cachedHasSSSE3()) {
+ fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3;
+ } else {
fSampleProc32 = S32_alpha_D32_filter_DX_SSE2;
}
-
- if (fSampleProc16 == S32_D16_filter_DX) {
- fSampleProc16 = S32_D16_filter_DX_SSE2;
+ } else if (fSampleProc32 == S32_alpha_D32_filter_DXDY) {
+ if (cachedHasSSSE3()) {
+ fSampleProc32 = S32_alpha_D32_filter_DXDY_SSSE3;
}
}
- if (cachedHasSSSE3() || cachedHasSSE2()) {
- if (fMatrixProc == ClampX_ClampY_filter_scale) {
- fMatrixProc = ClampX_ClampY_filter_scale_SSE2;
- } else if (fMatrixProc == ClampX_ClampY_nofilter_scale) {
- fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2;
- }
+ /* Check fSampleProc16 */
+ if (fSampleProc16 == S32_D16_filter_DX) {
+ fSampleProc16 = S32_D16_filter_DX_SSE2;
+ }
- if (fMatrixProc == ClampX_ClampY_filter_affine) {
- fMatrixProc = ClampX_ClampY_filter_affine_SSE2;
- } else if (fMatrixProc == ClampX_ClampY_nofilter_affine) {
- fMatrixProc = ClampX_ClampY_nofilter_affine_SSE2;
- }
- if (c_hqfilter_sse) {
- if (fShaderProc32 == highQualityFilter32) {
- fShaderProc32 = highQualityFilter_SSE2;
- }
+ /* Check fMatrixProc */
+ if (fMatrixProc == ClampX_ClampY_filter_scale) {
+ fMatrixProc = ClampX_ClampY_filter_scale_SSE2;
+ } else if (fMatrixProc == ClampX_ClampY_nofilter_scale) {
+ fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2;
+ } else if (fMatrixProc == ClampX_ClampY_filter_affine) {
+ fMatrixProc = ClampX_ClampY_filter_affine_SSE2;
+ } else if (fMatrixProc == ClampX_ClampY_nofilter_affine) {
+ fMatrixProc = ClampX_ClampY_nofilter_affine_SSE2;
+ }
+
+ /* Check fShaderProc32 */
+ if (c_hqfilter_sse) {
+ if (fShaderProc32 == highQualityFilter32) {
+ fShaderProc32 = highQualityFilter_SSE2;
}
}
}