Port Blend SSSE3 to gcc
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/491001
git-svn-id: http://libyuv.googlecode.com/svn/trunk@239 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 14a3a0b..d7d174b 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -2029,8 +2029,10 @@
#endif // HAS_YUY2TOYROW_SSE2
#ifdef HAS_ARGBBLENDROW_SSE2
-// Blend 8 pixels at a time
-// Destination aligned to 16 bytes, multiple of 4 pixels
+// Blend 8 pixels at a time.
+// src_argb0 unaligned.
+// src_argb1 and dst_argb aligned to 16 bytes.
+// width must be multiple of 4 pixels.
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
@@ -2045,7 +2047,7 @@
// 8 pixel loop
"1: \n"
- "movdqu (%0),%%xmm3 \n" // first 4 pixels
+ "movdqu (%0),%%xmm3 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
"movdqu (%1),%%xmm2 \n"
@@ -2068,7 +2070,7 @@
"sub $0x4,%3 \n"
"movdqa %%xmm0,(%2) \n"
"jle 9f \n"
- "movdqa %%xmm3,%%xmm0 \n" // next 4 pixels
+ "movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
"movdqu 0x10(%1),%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
@@ -2105,7 +2107,7 @@
// Blend 1 pixel at a time, unaligned
void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
+ uint8* dst_argb, int width) {
asm volatile (
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $0xf,%%xmm7 \n"
@@ -2130,7 +2132,7 @@
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movd (%1),%%xmm1 \n"
- "lea 0x4(%1),%1 \n"
+ "lea 0x4(%1),%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
@@ -2153,36 +2155,83 @@
#endif
);
}
-
-void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1,
- uint8* dst_argb, int width) {
- // Do 1 to 3 pixels to get destination aligned.
- if ((uintptr_t)(dst_argb) & 15) {
- int count = width;
- if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
- count = (-(intptr_t)(dst_argb) >> 2) & 3;
- }
- ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, count);
- src_argb0 += count * 4;
- src_argb1 += count * 4;
- dst_argb += count * 4;
- width -= count;
- }
- // Do multiple of 4 pixels
- if (width & ~3) {
- ARGBBlendRow_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3);
- }
- // Do remaining 1 to 3 pixels
- if (width & 3) {
- src_argb0 += (width & ~3) * 4;
- src_argb1 += (width & ~3) * 4;
- dst_argb += (width & ~3) * 4;
- width &= 3;
- ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, width);
- }
-}
#endif // HAS_ARGBBLENDROW_SSE2
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for replicating the alpha byte of each pixel into each word lane.
+CONST uvec8 kShuffleAlpha = {
+ 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+ 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
+};
+void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+
+ // 8 pixel loop
+ "1: \n"
+ "movdqu (%0),%%xmm3 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "pshufb %4,%%xmm3 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "movdqu 0x10(%0),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0,(%2) \n"
+ "jle 9f \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu 0x10(%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu 0x10(%1),%%xmm1 \n"
+ "lea 0x20(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0,0x10(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "jg 1b \n"
+ "9: \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : "m"(kShuffleAlpha) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+#endif // HAS_ARGBBLENDROW_SSSE3
+
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus