Fix a bug in the alpha blend 4 pixel loop, and make the blender C code match the SSE path so it is easier to test and can serve as reference code for future optimized versions.
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/645008

git-svn-id: http://libyuv.googlecode.com/svn/trunk@287 16f28f9a-4ce2-e073-06de-1de4eb20be90
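
For reference, below is a minimal C sketch of the per-pixel math the SSE "4 pixel loop" in this patch computes, as read off the instructions (invert the foreground alpha, add 1 so the multiplier is 256 - a, multiply the background channels, shift right by 8, saturating-add the foreground, and force the destination alpha to 255). The function names and exact layout here are illustrative assumptions, not the libyuv API.

    #include <stdint.h>

    /* One channel of the blend the SSE loop performs:
       dst = f + ((256 - a) * b >> 8), with a saturating add,
       matching paddw of the +1 constant, pmullw, psrlw $8, paddusb. */
    static inline uint8_t BlendChannelSketch(uint8_t f, uint8_t b, uint8_t a) {
      uint32_t v = f + (((uint32_t)(256 - a) * b) >> 8);
      return (uint8_t)(v > 255 ? 255 : v);
    }

    /* Sketch of a row blender over ARGB pixels stored as B,G,R,A bytes.
       The destination alpha is forced opaque, mirroring por %%xmm4,%%xmm0. */
    static void BlendRowSketch(const uint8_t* src0, const uint8_t* src1,
                               uint8_t* dst, int width) {
      for (int i = 0; i < width; ++i) {
        uint8_t a = src0[3];                               /* foreground alpha */
        dst[0] = BlendChannelSketch(src0[0], src1[0], a);  /* B */
        dst[1] = BlendChannelSketch(src0[1], src1[1], a);  /* G */
        dst[2] = BlendChannelSketch(src0[2], src1[2], a);  /* R */
        dst[3] = 255;                                      /* A forced to 255 */
        src0 += 4;
        src1 += 4;
        dst += 4;
      }
    }

A scalar loop like this handles any width, which is why the SSE path only needs the aligned 4 pixel loop plus a 1 pixel tail rather than the removed unrolled 8 pixel body.
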
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 4d071fc..e69779d 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -2375,60 +2375,39 @@
     "add       $1-4,%3                         \n"
     "jl        49f                             \n"
 
-    // 8 pixel loop.
+    // 4 pixel loop.
     ".p2align  2                               \n"
   "41:                                         \n"
     "movdqu    (%0),%%xmm3                     \n"
+    "lea       0x10(%0),%0                     \n"
     "movdqa    %%xmm3,%%xmm0                   \n"
     "pxor      %%xmm4,%%xmm3                   \n"
+    "movdqu    (%1),%%xmm2                     \n"
     "psrlw     $0x8,%%xmm3                     \n"
     "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
     "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
-    "movdqu    (%1),%%xmm2                     \n"
     "pand      %%xmm6,%%xmm2                   \n"
     "paddw     %%xmm7,%%xmm3                   \n"
     "pmullw    %%xmm3,%%xmm2                   \n"
     "movdqu    (%1),%%xmm1                     \n"
+    "lea       0x10(%1),%1                     \n"
     "psrlw     $0x8,%%xmm1                     \n"
     "por       %%xmm4,%%xmm0                   \n"
     "pmullw    %%xmm3,%%xmm1                   \n"
-    "movdqu    0x10(%0),%%xmm3                 \n"
-    "lea       0x20(%0),%0                     \n"
     "psrlw     $0x8,%%xmm2                     \n"
     "paddusb   %%xmm2,%%xmm0                   \n"
     "pand      %%xmm5,%%xmm1                   \n"
     "paddusb   %%xmm1,%%xmm0                   \n"
     "sub       $0x4,%3                         \n"
     "movdqa    %%xmm0,(%2)                     \n"
-    "jl        49f                             \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movdqu    0x10(%1),%%xmm2                 \n"
-    "psrlw     $0x8,%%xmm3                     \n"
-    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
-    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movdqu    0x10(%1),%%xmm1                 \n"
-    "lea       0x20(%1),%1                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%3                         \n"
-    "movdqa    %%xmm0,0x10(%2)                 \n"
-    "lea       0x20(%2),%2                     \n"
+    "lea       0x10(%2),%2                     \n"
     "jge       41b                             \n"
 
   "49:                                         \n"
     "add       $0x3,%3                         \n"
     "jl        99f                             \n"
 
-  // 1 pixel loop.
+    // 1 pixel loop.
   "91:                                         \n"
     "movd      (%0),%%xmm3                     \n"
     "lea       0x4(%0),%0                      \n"
@@ -2531,56 +2510,37 @@
     "add       $1-4,%3                         \n"
     "jl        49f                             \n"
 
-  // 8 pixel loop.
+    // 4 pixel loop.
     ".p2align  2                               \n"
   "41:                                         \n"
     "movdqu    (%0),%%xmm3                     \n"
+    "lea       0x10(%0),%0                     \n"
     "movdqa    %%xmm3,%%xmm0                   \n"
     "pxor      %%xmm4,%%xmm3                   \n"
-    "pshufb    %4,%%xmm3                       \n"
     "movdqu    (%1),%%xmm2                     \n"
+    "pshufb    %4,%%xmm3                       \n"
     "pand      %%xmm6,%%xmm2                   \n"
     "paddw     %%xmm7,%%xmm3                   \n"
     "pmullw    %%xmm3,%%xmm2                   \n"
     "movdqu    (%1),%%xmm1                     \n"
+    "lea       0x10(%1),%1                     \n"
     "psrlw     $0x8,%%xmm1                     \n"
     "por       %%xmm4,%%xmm0                   \n"
     "pmullw    %%xmm3,%%xmm1                   \n"
-    "movdqu    0x10(%0),%%xmm3                 \n"
-    "lea       0x20(%0),%0                     \n"
     "psrlw     $0x8,%%xmm2                     \n"
     "paddusb   %%xmm2,%%xmm0                   \n"
     "pand      %%xmm5,%%xmm1                   \n"
     "paddusb   %%xmm1,%%xmm0                   \n"
     "sub       $0x4,%3                         \n"
     "movdqa    %%xmm0,(%2)                     \n"
-    "jl        49f                             \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movdqu    0x10(%1),%%xmm2                 \n"
-    "pshufb    %4,%%xmm3                       \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movdqu    0x10(%1),%%xmm1                 \n"
-    "lea       0x20(%1),%1                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%3                         \n"
-    "movdqa    %%xmm0,0x10(%2)                 \n"
-    "lea       0x20(%2),%2                     \n"
+    "lea       0x10(%2),%2                     \n"
     "jge       41b                             \n"
 
   "49:                                         \n"
     "add       $0x3,%3                         \n"
     "jl        99f                             \n"
 
-  // 1 pixel loop.
+    // 1 pixel loop.
   "91:                                         \n"
     "movd      (%0),%%xmm3                     \n"
     "lea       0x4(%0),%0                      \n"
@@ -2629,7 +2589,7 @@
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "psrld     $0x8,%%xmm5                     \n"
 
-  // 4 pixel loop
+    // 4 pixel loop
     ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"