gcc port of alpha blend and add align to row_win loops
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/439006

git-svn-id: http://libyuv.googlecode.com/svn/trunk@207 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/source/row_posix.cc b/source/row_posix.cc
index ee2e779..e4533e3 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -1923,6 +1923,126 @@
 }
 #endif  // HAS_YUY2TOYROW_SSE2
 
+#ifdef HAS_ARGBBLENDROW_SSE2
+// Alpha-blends a row of width 32-bit ARGB pixels from src_argb onto dst_argb,
+// in place. The top byte of each source pixel, read as a little-endian 32-bit
+// value, is its alpha; each output byte is approximately
+// (src * alpha + dst * (255 - alpha)) / 255, computed in 16-bit fixed point.
+//
+// Source pixels are dispatched on their alpha:
+//   0x00        transparent: skipped, dst is left unchanged
+//   0xff        opaque: copied straight to dst
+//   otherwise   translucent: blended two pixels at a time
+// The counter in %2 is kept one below the number of pixels left so that the
+// two-pixel loop never starts with a single pixel; a leftover pixel is blended
+// by the tail at label 8. Rows with width <= 0 are ignored.
+void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
+  // The asm loads and classifies the first pixel before it looks at width, so
+  // an empty row has to be rejected here.
+  if (width <= 0) {
+    return;
+  }
+  uint32 pixel = 0;  // %3: the source pixel being classified.
+  asm volatile (
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // xmm4 = all ones
+    "sub       %0,%1                           \n"  // %1 = dst - src
+    "mov       (%0),%3                         \n"  // first pixel
+    "sub       $0x1,%2                         \n"  // %2 = pixels left - 1
+    "je        8f                              \n"  // last1
+    "cmp       $0xff000000,%3                  \n"
+    "jae       2f                              \n"  // opaqueloop
+    "cmp       $0xffffff,%3                    \n"
+    "ja        3f                              \n"  // translucentloop
+
+  // transparentloop: alpha is 0, so skip the pixel and leave dst unchanged.
+  "1:                                          \n"
+    "sub       $0x1,%2                         \n"
+    "lea       0x4(%0),%0                      \n"  // lea keeps the sub flags
+    "je        8f                              \n"  // last1
+    "mov       (%0),%3                         \n"
+    "cmp       $0xffffff,%3                    \n"
+    "jbe       1b                              \n"  // transparentloop
+    "cmp       $0xff000000,%3                  \n"
+    "jb        3f                              \n"  // translucentloop
+
+  // opaqueloop: alpha is 0xff, so the source pixel replaces the dst pixel.
+  "2:                                          \n"
+    "mov       %3,(%0,%1,1)                    \n"  // dst pixel = src pixel
+    "lea       0x4(%0),%0                      \n"
+    "sub       $0x1,%2                         \n"
+    "je        8f                              \n"  // last1
+    "mov       (%0),%3                         \n"
+    "cmp       $0xff000000,%3                  \n"
+    "jae       2b                              \n"  // opaqueloop
+    "cmp       $0xffffff,%3                    \n"
+    "jbe       1b                              \n"  // transparentloop
+    "nop                                       \n"  // falls through to 3
+
+  // translucentloop: blend two pixels. Each one uses its own alpha whatever
+  // its class, so only the pixel after the pair is classified again.
+  "3:                                          \n"
+    "movq      (%0),%%xmm0                     \n"  // 2 src pixels
+    "movq      (%0,%1,1),%%xmm1                \n"  // 2 dst pixels
+    "punpcklbw %%xmm0,%%xmm0                   \n"  // bytes -> b * 0x101
+    "punpcklbw %%xmm1,%%xmm1                   \n"
+    "pshuflw   $0xff,%%xmm0,%%xmm2             \n"  // alpha of pixel 0
+    "pshufhw   $0xff,%%xmm2,%%xmm2             \n"  // alpha of pixel 1
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "pxor      %%xmm4,%%xmm3                   \n"  // xmm3 = ~alpha
+    "pmulhuw   %%xmm2,%%xmm0                   \n"  // src * alpha
+    "pmulhuw   %%xmm3,%%xmm1                   \n"  // dst * ~alpha
+    "paddw     %%xmm1,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"  // back to 8 bits
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movq      %%xmm0,(%0,%1,1)                \n"  // store 2 pixels
+    "lea       0x8(%0),%0                      \n"
+    "sub       $0x2,%2                         \n"
+    "jbe       8f                              \n"  // last1: 0 or 1 left
+    "mov       (%0),%3                         \n"
+    "cmp       $0xffffff,%3                    \n"
+    "jbe       1b                              \n"  // transparentloop
+    "cmp       $0xff000000,%3                  \n"
+    "jb        3b                              \n"  // translucentloop
+    "jmp       2b                              \n"  // opaqueloop
+
+  // last1: %2 + 1 is the number of pixels left, 0 or 1. A leftover pixel is
+  // blended on its own. It is loaded from (%0) because every loop jumps here
+  // before refreshing %3, which may therefore still hold an earlier pixel.
+  "8:                                          \n"
+    "add       $0x1,%2                         \n"
+    "je        9f                              \n"  // done
+    "movd      (%0),%%xmm0                     \n"  // last src pixel
+    "mov       (%0,%1,1),%3                    \n"
+    "movd      %3,%%xmm1                       \n"  // last dst pixel
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    "punpcklbw %%xmm1,%%xmm1                   \n"
+    "pshuflw   $0xff,%%xmm0,%%xmm2             \n"
+    "pshufhw   $0xff,%%xmm2,%%xmm2             \n"
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "pxor      %%xmm4,%%xmm3                   \n"
+    "pmulhuw   %%xmm2,%%xmm0                   \n"
+    "pmulhuw   %%xmm3,%%xmm1                   \n"
+    "paddw     %%xmm1,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movd      %%xmm0,%3                       \n"
+    "mov       %3,(%0,%1,1)                    \n"  // store blended pixel
+
+  // done
+  "9:                                          \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width),       // %2
+    "+r"(pixel)        // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+  );
+}
+#endif  // HAS_ARGBBLENDROW_SSE2
+
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus