Alpha blend: test the alignment of both source pointers and use movdqa aligned fetches when they are 16-byte aligned.
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/714010

git-svn-id: http://libyuv.googlecode.com/svn/trunk@321 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/README.chromium b/README.chromium
index e14ba48..efd8c85 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv

 URL: http://code.google.com/p/libyuv/

-Version: 320

+Version: 321

 License: BSD

 License File: LICENSE

 

diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index ae992bf..0bbf261 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 320
+#define LIBYUV_VERSION 321
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/row_posix.cc b/source/row_posix.cc
index b70fcd0..b254356 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -2540,9 +2540,40 @@
   "19:                                         \n"
     "add       $1-4,%3                         \n"
     "jl        49f                             \n"
+    "test      $0xf,%0                         \n"
+    "jne       41f                             \n"
+    "test      $0xf,%1                         \n"
+    "jne       41f                             \n"
 
     // 4 pixel loop.
     ".p2align  2                               \n"
+  "40:                                         \n"  // %0 and %1 are 16-byte aligned
+    "movdqa    (%0),%%xmm3                     \n"  // src argb (aligned)
+    "lea       0x10(%0),%0                     \n"
+    "movdqa    %%xmm3,%%xmm0                   \n"  // src argb
+    "pxor      %%xmm4,%%xmm3                   \n"  // ~alpha
+    "movdqa    (%1),%%xmm2                     \n"  // _r_b (aligned)
+    "pshufb    %4,%%xmm3                       \n"  // alpha
+    "pand      %%xmm6,%%xmm2                   \n"  // _r_b
+    "paddw     %%xmm7,%%xmm3                   \n"  // 256 - alpha
+    "pmullw    %%xmm3,%%xmm2                   \n"  // _r_b * alpha
+    "movdqa    (%1),%%xmm1                     \n"  // _a_g (aligned)
+    "lea       0x10(%1),%1                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"  // _a_g
+    "por       %%xmm4,%%xmm0                   \n"  // set alpha to 255
+    "pmullw    %%xmm3,%%xmm1                   \n"  // _a_g * alpha
+    "psrlw     $0x8,%%xmm2                     \n"  // _r_b back to 8 bits
+    "paddusb   %%xmm2,%%xmm0                   \n"  // + src argb
+    "pand      %%xmm5,%%xmm1                   \n"  // a_g_ back to 8 bits
+    "paddusb   %%xmm1,%%xmm0                   \n"  // + src argb
+    "sub       $0x4,%3                         \n"  // sets flags for jge
+    "movdqa    %%xmm0,(%2)                     \n"  // flags untouched
+    "lea       0x10(%2),%2                     \n"  // flags untouched
+    "jge       40b                             \n"
+    "jmp       49f                             \n"  // bypass unaligned loop
+
+    // 4 pixel unaligned loop.
+    ".p2align  2                               \n"
   "41:                                         \n"
     "movdqu    (%0),%%xmm3                     \n"
     "lea       0x10(%0),%0                     \n"
diff --git a/source/row_win.cc b/source/row_win.cc
index 17ccfe9..b69f9a2 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -2636,8 +2636,39 @@
     add        ecx, 1 - 4
     jl         convertloop4b
 
+    test       eax, 15          // unaligned?
+    jne        convertuloop4
+    test       esi, 15          // unaligned?
+    jne        convertuloop4
+
     // 4 pixel loop.
   convertloop4:
+    movdqa     xmm3, [eax]      // src argb
+    lea        eax, [eax + 16]
+    movdqa     xmm0, xmm3       // src argb
+    pxor       xmm3, xmm4       // ~alpha
+    movdqa     xmm2, [esi]      // _r_b
+    pshufb     xmm3, kShuffleAlpha // alpha
+    pand       xmm2, xmm6       // _r_b
+    paddw      xmm3, xmm7       // 256 - alpha
+    pmullw     xmm2, xmm3       // _r_b * alpha
+    movdqa     xmm1, [esi]      // _a_g
+    lea        esi, [esi + 16]
+    psrlw      xmm1, 8          // _a_g
+    por        xmm0, xmm4       // set alpha to 255
+    pmullw     xmm1, xmm3       // _a_g * alpha
+    psrlw      xmm2, 8          // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2       // + src argb
+    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1       // + src argb
+    sub        ecx, 4
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jge        convertloop4
+    jmp        convertloop4b
+
+    // 4 pixel unaligned loop.
+  convertuloop4:
     movdqu     xmm3, [eax]      // src argb
     lea        eax, [eax + 16]
     movdqa     xmm0, xmm3       // src argb
@@ -2659,7 +2690,7 @@
     sub        ecx, 4
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
-    jge        convertloop4
+    jge        convertuloop4
 
   convertloop4b:
     add        ecx, 4 - 1