gcc port of alpha blend and add align to row_win loops
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/439006
git-svn-id: http://libyuv.googlecode.com/svn/trunk@207 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/source/row_posix.cc b/source/row_posix.cc
index ee2e779..e4533e3 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -1923,6 +1923,106 @@
}
#endif // HAS_YUY2TOYROW_SSE2
+#ifdef HAS_ARGBBLENDROW_SSE2
+void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {  // Alpha-blend |width| 4-byte pixels of src_argb onto dst_argb, in place.
+ uint32 pixel = 0;  // Scratch register (%3): the source pixel being classified by its top (alpha) byte.
+ asm volatile (  // Requires width >= 1: the first pixel is read and classified before a zero count could be detected.
+ "pcmpeqb %%xmm4,%%xmm4 \n"  // xmm4 = all ones; xor with it turns alpha into its complement.
+ "sub %0,%1 \n"  // %1 = dst - src, so (%0,%1,1) addresses the dst pixel matching (%0).
+ "mov (%0),%3 \n"  // Load the first source pixel.
+ "sub $0x1,%2 \n"  // %2 now counts the pixels remaining minus one.
+ "je 8f \n" // last1: the row is a single pixel.
+ "cmp $0xff000000,%3 \n"
+ "jae 2f \n" // opaque loop: alpha byte is 0xff.
+ "cmp $0xffffff,%3 \n"
+ "ja 3f \n" // translucent loop: alpha byte is non-zero; otherwise fall into the transparent loop.
+
+ // Transparent loop: alpha byte is 0, so the dst pixel is left untouched.
+ "1: \n"
+ "sub $0x1,%2 \n"
+ "lea 0x4(%0),%0 \n"  // Advance one pixel; lea leaves the flags from the sub intact.
+ "je 8f \n" // last1: exactly one pixel left, not yet loaded into %3.
+ "mov (%0),%3 \n"
+ "cmp $0xffffff,%3 \n"
+ "jbe 1b \n" // transparent loop
+ "cmp $0xff000000,%3 \n"
+ "jb 3f \n" // translucent loop; otherwise fall into the opaque loop.
+
+ // Opaque loop: alpha byte is 0xff, so the source pixel is copied to dst unchanged.
+ "2: \n"
+ "mov %3,(%0,%1,1) \n"
+ "lea 0x4(%0),%0 \n"
+ "sub $0x1,%2 \n"
+ "je 8f \n" // last1: exactly one pixel left, not yet loaded into %3.
+ "mov (%0),%3 \n"
+ "cmp $0xff000000,%3 \n"
+ "jae 2b \n" // opaque loop
+ "cmp $0xffffff,%3 \n"
+ "jbe 1b \n" // transparent loop
+ "nop \n"  // NOTE(review): presumably padding to align the loop below - confirm.
+
+ // Translucent loop: blends two pixels per iteration, each with its own alpha byte.
+ "3: \n"
+ "movq (%0),%%xmm0 \n"  // Two source pixels.
+ "movq (%0,%1,1),%%xmm1 \n"  // Two destination pixels.
+ "punpcklbw %%xmm0,%%xmm0 \n"  // Widen: every byte b becomes the 16-bit word b * 0x101.
+ "punpcklbw %%xmm1,%%xmm1 \n"
+ "pshuflw $0xff,%%xmm0,%%xmm2 \n"  // Broadcast word 3 (alpha) across the first pixel's words.
+ "pshufhw $0xff,%%xmm2,%%xmm2 \n"  // Same for the second pixel.
+ "movdqa %%xmm2,%%xmm3 \n"
+ "pxor %%xmm4,%%xmm3 \n"  // xmm3 = complement of alpha.
+ "pmulhuw %%xmm2,%%xmm0 \n"  // High 16 bits of src * alpha.
+ "pmulhuw %%xmm3,%%xmm1 \n"  // High 16 bits of dst * ~alpha.
+ "paddw %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"  // Scale the sums back to the 8-bit range.
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%0,%1,1) \n"  // Store both blended pixels to dst.
+ "lea 0x8(%0),%0 \n"
+ "sub $0x2,%2 \n"
+ "jbe 8f \n" // last1: one pixel left (result 0) or none (result below 0).
+ "mov (%0),%3 \n"
+ "cmp $0xffffff,%3 \n"
+ "jbe 1b \n" // transparent loop
+ "cmp $0xff000000,%3 \n"
+ "jb 3b \n" // translucent loop
+ "jmp 2b \n" // opaque loop
+
+ // last1: %2 + 1 is the number of pixels still to do (0 or 1); a lone pixel is always blended.
+ "8: \n"
+ "add $0x1,%2 \n"
+ "je 9f \n" // done
+ "movd (%0),%%xmm0 \n"  // Read the pixel from memory: %3 is stale when 8: is reached from a loop, which exits before its next load.
+ "mov (%0,%1,1),%3 \n"  // %3 is reused as a temporary for the dst pixel.
+ "movd %3,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpcklbw %%xmm1,%%xmm1 \n"
+ "pshuflw $0xff,%%xmm0,%%xmm2 \n"
+ "pshufhw $0xff,%%xmm2,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm3,%%xmm1 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%3 \n"
+ "mov %3,(%0,%1,1) \n"  // Store the single blended pixel to dst.
+
+ // done
+ "9: \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "+r"(pixel) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+ );
+}
+#endif // HAS_ARGBBLENDROW_SSE2
+
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus