Alpha blend 2 sources and store to a destination. Useful for A under B blending.
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/472005
git-svn-id: http://libyuv.googlecode.com/svn/trunk@233 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/source/row_posix.cc b/source/row_posix.cc
index f839e20..e7cfb01 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -2176,6 +2176,176 @@
#endif // HAS_ARGBBLENDROW_SSE2
+
+
+
+
+
+
+
+#ifdef HAS_ARGBBLENDROW_SSE2
+// Blend 8 pixels at a time
+// Destination aligned to 16 bytes, multiple of 4 pixels
+void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+
+ // 8 pixel loop
+ "1: \n"
+ "movdqu (%0),%%xmm3 \n" // first 4 pixels
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqa (%1),%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
+ "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqa (%1),%%xmm1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "movdqu 0x10(%0),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0,(%2) \n"
+ "jle 9f \n"
+ "movdqa %%xmm3,%%xmm0 \n" // next 4 pixels
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqa 0x10(%1),%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
+ "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqa 0x10(%1),%%xmm1 \n"
+ "lea 0x20(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0,0x10(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "jg 1b \n"
+ "9: \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+
+// Blend 1 pixel at a time, unaligned
+void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+
+ // 1 pixel loop
+ "1: \n"
+ "movd (%0),%%xmm3 \n"
+ "lea 0x4(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd (%1),%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
+ "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd (%1),%%xmm1 \n"
+ "lea 0x4(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x1,%3 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ // Do 1 to 3 pixels to get destination aligned.
+ if ((uintptr_t)(dst_argb) & 15) {
+ int count = width;
+ if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
+ count = (-(intptr_t)(dst_argb) >> 2) & 3;
+ }
+ ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, count);
+ src_argb0 += count * 4;
+ src_argb1 += count * 4;
+ dst_argb += count * 4;
+ width -= count;
+ }
+ // Do multiple of 4 pixels
+ if (width & ~3) {
+ ARGBBlend2Row_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3);
+ }
+ // Do remaining 1 to 3 pixels
+ if (width & 3) {
+ src_argb0 += (width & ~3) * 4;
+ src_argb1 += (width & ~3) * 4;
+ dst_argb += (width & ~3) * 4;
+ width &= 3;
+ ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, width);
+ }
+}
+#endif // HAS_ARGBBLENDROW_SSE2
+
+
+
+
+
+
+
+
+
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus