rewrite ARGBToI420 with SSSE3
TEST=talk unittests
BUG=none
Review URL: http://webrtc-codereview.appspot.com/251003
git-svn-id: http://libyuv.googlecode.com/svn/trunk@46 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 02ddc12..40e636c 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -12,6 +12,91 @@
extern "C" {
+#ifdef HAS_ARGBTOYROW_SSSE3
+
+// Multiplier table for converting ARGB to I400 (Y only): weights for B, G, R, A.
+extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = {
+ 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u
+};  // ARGBToYRow_SSSE3 feeds this to pmaddubsw, then shifts the sums right by 7.
+
+extern "C" TALIGN16(const uint8, kAdd16[16]) = {  // Holds 1s, not 16s.
+ 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u
+};  // ARGBToYRow_SSSE3 shifts a copy left by 4 to build the 0x10 bytes it adds.
+
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile(  // Each pass reads 8 ARGB pixels (32 bytes), writes 8 Y bytes.
+  "movdqa (%3),%%xmm7\n"  // xmm7 = per-byte B, G, R, A weights.
+  "movdqa (%4),%%xmm6\n"  // xmm6 = 0x01 in every byte.
+  "movdqa %%xmm6,%%xmm5\n"
+  "psllw $0x4,%%xmm5\n"  // 0x0101 words become 0x1010: 0x10 in every byte.
+"1:"
+  "movdqa (%0),%%xmm0\n"  // movdqa: src_argb must be 16-byte aligned.
+  "pmaddubsw %%xmm7,%%xmm0\n"  // Words: 13*B + 64*G and 33*R + 0*A.
+  "movdqa 0x10(%0),%%xmm1\n"
+  "psrlw $0x7,%%xmm0\n"  // Scale each partial sum down by 128.
+  "pmaddubsw %%xmm7,%%xmm1\n"
+  "lea 0x20(%0),%0\n"
+  "psrlw $0x7,%%xmm1\n"
+  "packuswb %%xmm1,%%xmm0\n"  // 16 bytes: two partial sums per pixel.
+  "pmaddubsw %%xmm6,%%xmm0\n"  // Times 1, pairs added: one word per pixel.
+  "packuswb %%xmm0,%%xmm0\n"
+  "paddb %%xmm5,%%xmm0\n"  // Add the 0x10 offset to every Y byte.
+  "movq %%xmm0,(%1)\n"  // Store the low 8 bytes.
+  "lea 0x8(%1),%1\n"
+  "sub $0x8,%2\n"
+  "ja 1b\n"  // Test is at the bottom: the body always runs at least once.
+  : "+r"(src_argb), "+r"(dst_y), "+r"(pix)  // %0, %1, %2
+  : "r"(kMultiplyMaskARGBToI400), "r"(kAdd16)  // %3, %4
+  : "memory", "cc"  // sub changes the flags; the xmm list guards live values.
+#if defined(__SSE2__)  // GCC rejects xmm register names when SSE is disabled.
+  , "xmm0", "xmm1", "xmm5", "xmm6", "xmm7"
+#endif
+);
+}
+#endif
+
+static inline int RGBToY(uint8 r, uint8 g, uint8 b) {  // Luma, offset by 16.
+  return (25 * b + 129 * g + 66 * r + 0x1080) >> 8;  // 0x1080 = 128 + (16 << 8)
+}
+
+static inline int RGBToU(uint8 r, uint8 g, uint8 b) {  // U, centered on 128.
+  return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;  // 0x8080 = 128 + (128 << 8)
+}  // The bias is added before the shift so a negative value is never shifted.
+static inline int RGBToV(uint8 r, uint8 g, uint8 b) {  // V, centered on 128.
+  return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;  // 0x8080 = 128 + (128 << 8)
+}  // The bias is added before the shift so a negative value is never shifted.
+
+void ARGBToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {
+  // Portable path: one Y byte per 4-byte pixel, read as B, G, R (byte 3 unused).
+  for (int i = 0; i < width; ++i, src_argb0 += 4) {
+    const uint8 blue = src_argb0[0], green = src_argb0[1], red = src_argb0[2];
+    dst_y[i] = RGBToY(red, green, blue);
+  }
+}
+
+void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+                   uint8* dst_u, uint8* dst_v, int width) {
+  // Writes one U and one V byte per pair of columns, averaging the B, G and R
+  // bytes of this row and of the row src_stride_argb bytes further on.
+  // Averages are truncated, not rounded.
+  const uint8* row0 = src_argb0;
+  const uint8* row1 = src_argb0 + src_stride_argb;
+  for (int x = 0; x < width - 1; x += 2, row0 += 8, row1 += 8) {
+    const uint8 blue = (row0[0] + row0[4] + row1[0] + row1[4]) >> 2;
+    const uint8 green = (row0[1] + row0[5] + row1[1] + row1[5]) >> 2;
+    const uint8 red = (row0[2] + row0[6] + row1[2] + row1[6]) >> 2;
+    *dst_u++ = RGBToU(red, green, blue);
+    *dst_v++ = RGBToV(red, green, blue);
+  }
+  if (width & 1) {  // Odd width: the last column has only two pixels to average.
+    const uint8 blue = (row0[0] + row1[0]) >> 1;
+    const uint8 green = (row0[1] + row1[1]) >> 1;
+    const uint8 red = (row0[2] + row1[2]) >> 1;
+    *dst_u = RGBToU(red, green, blue);
+    *dst_v = RGBToV(red, green, blue);
+  }
+}
+
#if defined(__x86_64__)
// 64 bit linux gcc version