Port Blend SSSE3 to gcc
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/491001

git-svn-id: http://libyuv.googlecode.com/svn/trunk@239 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 14a3a0b..d7d174b 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -2029,8 +2029,10 @@
 #endif  // HAS_YUY2TOYROW_SSE2
 
 #ifdef HAS_ARGBBLENDROW_SSE2
-// Blend 8 pixels at a time
-// Destination aligned to 16 bytes, multiple of 4 pixels
+// Blend 8 pixels at a time.
+// src_argb0 unaligned.
+// src_argb1 and dst_argb aligned to 16 bytes.
+// width must be multiple of 4 pixels.
 void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                                uint8* dst_argb, int width) {
   asm volatile (
@@ -2045,7 +2047,7 @@
 
   // 8 pixel loop
   "1:                                          \n"
-    "movdqu    (%0),%%xmm3                     \n"  // first 4 pixels
+    "movdqu    (%0),%%xmm3                     \n"
     "movdqa    %%xmm3,%%xmm0                   \n"
     "pxor      %%xmm4,%%xmm3                   \n"
     "movdqu    (%1),%%xmm2                     \n"
@@ -2068,7 +2070,7 @@
     "sub       $0x4,%3                         \n"
     "movdqa    %%xmm0,(%2)                     \n"
     "jle       9f                              \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"  // next 4 pixels
+    "movdqa    %%xmm3,%%xmm0                   \n"
     "pxor      %%xmm4,%%xmm3                   \n"
     "movdqu    0x10(%1),%%xmm2                 \n"
     "psrlw     $0x8,%%xmm3                     \n"
@@ -2105,7 +2107,7 @@
 
 // Blend 1 pixel at a time, unaligned
 void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                         uint8* dst_argb, int width) {
+                        uint8* dst_argb, int width) {
   asm volatile (
     "pcmpeqb   %%xmm7,%%xmm7                   \n"
     "psrlw     $0xf,%%xmm7                     \n"
@@ -2130,7 +2132,7 @@
     "paddw     %%xmm7,%%xmm3                   \n"
     "pmullw    %%xmm3,%%xmm2                   \n"
     "movd      (%1),%%xmm1                     \n"
-    "lea       0x4(%1),%1                     \n"
+    "lea       0x4(%1),%1                      \n"
     "psrlw     $0x8,%%xmm1                     \n"
     "por       %%xmm4,%%xmm0                   \n"
     "pmullw    %%xmm3,%%xmm1                   \n"
@@ -2153,36 +2155,83 @@
 #endif
   );
 }
-
-void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                            uint8* dst_argb, int width) {
-  // Do 1 to 3 pixels to get destination aligned.
-  if ((uintptr_t)(dst_argb) & 15) {
-    int count = width;
-    if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
-      count = (-(intptr_t)(dst_argb) >> 2) & 3;
-    }
-    ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, count);
-    src_argb0 += count * 4;
-    src_argb1 += count * 4;
-    dst_argb += count * 4;
-    width -= count;
-  }
-  // Do multiple of 4 pixels
-  if (width & ~3) {
-    ARGBBlendRow_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3);
-  }
-  // Do remaining 1 to 3 pixels
-  if (width & 3) {
-    src_argb0 += (width & ~3) * 4;
-    src_argb1 += (width & ~3) * 4;
-    dst_argb += (width & ~3) * 4;
-    width &= 3;
-    ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, width);
-  }
-}
 #endif  // HAS_ARGBBLENDROW_SSE2
 
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha.
+CONST uvec8 kShuffleAlpha = {
+  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
+};
+void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+                                uint8* dst_argb, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"
+    "psrlw     $0xf,%%xmm7                     \n"
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "psrlw     $0x8,%%xmm6                     \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psllw     $0x8,%%xmm5                     \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "pslld     $0x18,%%xmm4                    \n"
+
+  // 8 pixel loop
+  "1:                                          \n"
+    "movdqu    (%0),%%xmm3                     \n"
+    "movdqa    %%xmm3,%%xmm0                   \n"
+    "pxor      %%xmm4,%%xmm3                   \n"
+    "pshufb    %4,%%xmm3                       \n"
+    "movdqu    (%1),%%xmm2                     \n"
+    "pand      %%xmm6,%%xmm2                   \n"
+    "paddw     %%xmm7,%%xmm3                   \n"
+    "pmullw    %%xmm3,%%xmm2                   \n"
+    "movdqu    (%1),%%xmm1                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "por       %%xmm4,%%xmm0                   \n"
+    "pmullw    %%xmm3,%%xmm1                   \n"
+    "movdqu    0x10(%0),%%xmm3                 \n"
+    "lea       0x20(%0),%0                     \n"
+    "psrlw     $0x8,%%xmm2                     \n"
+    "paddusb   %%xmm2,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "sub       $0x4,%3                         \n"
+    "movdqa    %%xmm0,(%2)                     \n"
+    "jle       9f                              \n"
+    "movdqa    %%xmm3,%%xmm0                   \n"
+    "pxor      %%xmm4,%%xmm3                   \n"
+    "movdqu    0x10(%1),%%xmm2                 \n"
+    "pshufb    %4,%%xmm3                       \n"
+    "pand      %%xmm6,%%xmm2                   \n"
+    "paddw     %%xmm7,%%xmm3                   \n"
+    "pmullw    %%xmm3,%%xmm2                   \n"
+    "movdqu    0x10(%1),%%xmm1                 \n"
+    "lea       0x20(%1),%1                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "por       %%xmm4,%%xmm0                   \n"
+    "pmullw    %%xmm3,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm2                     \n"
+    "paddusb   %%xmm2,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "sub       $0x4,%3                         \n"
+    "movdqa    %%xmm0,0x10(%2)                 \n"
+    "lea       0x20(%2),%2                     \n"
+    "jg        1b                              \n"
+  "9:                                          \n"
+  : "+r"(src_argb0),    // %0
+    "+r"(src_argb1),    // %1
+    "+r"(dst_argb),     // %2
+    "+r"(width)         // %3
+  : "m"(kShuffleAlpha)  // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+  );
+}
+#endif  // HAS_ARGBBLENDROW_SSSE3
+
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus