Alpha blend test alignment of source pointer and use movdqa aligned fetches.
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/714010
git-svn-id: http://libyuv.googlecode.com/svn/trunk@321 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/README.chromium b/README.chromium
index e14ba48..efd8c85 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 320
+Version: 321
License: BSD
License File: LICENSE
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index ae992bf..0bbf261 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 320
+#define LIBYUV_VERSION 321
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
diff --git a/source/row_posix.cc b/source/row_posix.cc
index b70fcd0..b254356 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -2540,9 +2540,40 @@
"19: \n"
"add $1-4,%3 \n"
"jl 49f \n"
+ "test $0xf,%0 \n"
+ "jne 41f \n"
+ "test $0xf,%1 \n"
+ "jne 41f \n"
// 4 pixel loop.
".p2align 2 \n"
+ "40: \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "jge 40b \n"
+ "jmp 49f \n"
+
+ // 4 pixel unaligned loop.
+ ".p2align 2 \n"
"41: \n"
"movdqu (%0),%%xmm3 \n"
"lea 0x10(%0),%0 \n"
diff --git a/source/row_win.cc b/source/row_win.cc
index 17ccfe9..b69f9a2 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -2636,8 +2636,39 @@
add ecx, 1 - 4
jl convertloop4b
+ test eax, 15 // unaligned?
+ jne convertuloop4
+ test esi, 15 // unaligned?
+ jne convertuloop4
+
// 4 pixel loop.
convertloop4:
+ movdqa xmm3, [eax] // src argb
+ lea eax, [eax + 16]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movdqa xmm2, [esi] // _r_b
+ pshufb xmm3, kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movdqa xmm1, [esi] // _a_g
+ lea esi, [esi + 16]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jge convertloop4
+ jmp convertloop4b
+
+ // 4 pixel unaligned loop.
+ convertuloop4:
movdqu xmm3, [eax] // src argb
lea eax, [eax + 16]
movdqa xmm0, xmm3 // src argb
@@ -2659,7 +2690,7 @@
sub ecx, 4
movdqa [edx], xmm0
lea edx, [edx + 16]
- jge convertloop4
+ jge convertuloop4
convertloop4b:
add ecx, 4 - 1