Fix alpha blend 4 pixel loop bug, and make the blender C code match the SSE code for better testability and as reference code for future optimized code.
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/645008
git-svn-id: http://libyuv.googlecode.com/svn/trunk@287 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 4d071fc..e69779d 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -2375,60 +2375,39 @@
"add $1-4,%3 \n"
"jl 49f \n"
- // 8 pixel loop.
+ // 4 pixel loop.
".p2align 2 \n"
"41: \n"
"movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
+ "movdqu (%1),%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"pshufhw $0xf5,%%xmm3,%%xmm3 \n"
"pshuflw $0xf5,%%xmm3,%%xmm3 \n"
- "movdqu (%1),%%xmm2 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
- "movdqu 0x10(%0),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
"psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqa %%xmm0,(%2) \n"
- "jl 49f \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movdqu 0x10(%1),%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
- "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movdqu 0x10(%1),%%xmm1 \n"
- "lea 0x20(%1),%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0,0x10(%2) \n"
- "lea 0x20(%2),%2 \n"
+ "lea 0x10(%2),%2 \n"
"jge 41b \n"
"49: \n"
"add $0x3,%3 \n"
"jl 99f \n"
- // 1 pixel loop.
+ // 1 pixel loop.
"91: \n"
"movd (%0),%%xmm3 \n"
"lea 0x4(%0),%0 \n"
@@ -2531,56 +2510,37 @@
"add $1-4,%3 \n"
"jl 49f \n"
- // 8 pixel loop.
+ // 4 pixel loop.
".p2align 2 \n"
"41: \n"
"movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
- "pshufb %4,%%xmm3 \n"
"movdqu (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
- "movdqu 0x10(%0),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
"psrlw $0x8,%%xmm2 \n"
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqa %%xmm0,(%2) \n"
- "jl 49f \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movdqu 0x10(%1),%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movdqu 0x10(%1),%%xmm1 \n"
- "lea 0x20(%1),%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0,0x10(%2) \n"
- "lea 0x20(%2),%2 \n"
+ "lea 0x10(%2),%2 \n"
"jge 41b \n"
"49: \n"
"add $0x3,%3 \n"
"jl 99f \n"
- // 1 pixel loop.
+ // 1 pixel loop.
"91: \n"
"movd (%0),%%xmm3 \n"
"lea 0x4(%0),%0 \n"
@@ -2629,7 +2589,7 @@
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrld $0x8,%%xmm5 \n"
- // 4 pixel loop
+ // 4 pixel loop
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"