Add I411ToARGB row functions that convert 2 UV values with 8 Y values per loop pass
BUG=40
TEST=planar_test
Review URL: https://webrtc-codereview.appspot.com/637005
git-svn-id: http://libyuv.googlecode.com/svn/trunk@277 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/source/row_posix.cc b/source/row_posix.cc
index a51207d..479ece0 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -1215,7 +1215,7 @@
#endif // HAS_ARGBTOYROW_SSSE3
-#ifdef HAS_I420TOARGBROW_SSSE3
+#ifdef HAS_I422TOARGBROW_SSSE3
#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0
@@ -1251,8 +1251,37 @@
{ YG, YG, YG, YG, YG, YG, YG, YG }
};
-// Convert 8 pixels
-#define YUVTORGB \
+// Convert 8 pixels: 8 UV and 8 Y. Asm fragment pasted into the row loops; callers in this file bind %0=y_buf, %1=u_buf, %2=v_buf-u_buf, %5=&kYuvConstants.kUVToB and keep xmm4 zeroed.
+#define YUV444TORGB /* result: 8 clamped bytes per channel in the low halves of xmm0, xmm1, xmm2 */ \
+ "movq (%1),%%xmm0 \n" /* load 8 U bytes */ \
+ "movq (%1,%2,1),%%xmm1 \n" /* load 8 bytes at U pointer + %2 (the V row, per the callers) */ \
+ "lea 0x8(%1),%1 \n" /* advance the U pointer by 8 bytes */ \
+ "punpcklbw %%xmm1,%%xmm0 \n" /* interleave into 8 U,V byte pairs */ \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "movdqa %%xmm0,%%xmm2 \n" /* xmm0/xmm1/xmm2 now each hold the UV pairs, one copy per channel */ \
+ "pmaddubsw (%5),%%xmm0 \n" /* multiply-add every UV pair with the byte table at %5 -> 8 words */ \
+ "pmaddubsw 16(%5),%%xmm1 \n" /* same with the table at +16 (presumably G; confirm struct layout) */ \
+ "pmaddubsw 32(%5),%%xmm2 \n" /* same with the table at +32 (presumably R; confirm struct layout) */ \
+ "psubw 48(%5),%%xmm0 \n" /* subtract the per-channel word tables at +48, +64, +80 */ \
+ "psubw 64(%5),%%xmm1 \n" \
+ "psubw 80(%5),%%xmm2 \n" \
+ "movq (%0),%%xmm3 \n" /* load 8 Y bytes */ \
+ "lea 0x8(%0),%0 \n" /* advance the Y pointer by 8 bytes */ \
+ "punpcklbw %%xmm4,%%xmm3 \n" /* widen Y to 8 words; relies on xmm4 being zero */ \
+ "psubsw 96(%5),%%xmm3 \n" /* saturating subtract of the word table at +96 */ \
+ "pmullw 112(%5),%%xmm3 \n" /* scale by the word table at +112 (presumably the YG row; confirm) */ \
+ "paddsw %%xmm3,%%xmm0 \n" /* add the scaled Y term to each channel, saturating */ \
+ "paddsw %%xmm3,%%xmm1 \n" \
+ "paddsw %%xmm3,%%xmm2 \n" \
+ "psraw $0x6,%%xmm0 \n" /* drop the 6 fractional bits (coefficients are scaled by 64) */ \
+ "psraw $0x6,%%xmm1 \n" \
+ "psraw $0x6,%%xmm2 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" /* clamp the words to unsigned bytes */ \
+ "packuswb %%xmm1,%%xmm1 \n" \
+ "packuswb %%xmm2,%%xmm2 \n"
+
+// Convert 8 pixels: 4 UV and 8 Y
+#define YUV422TORGB \
"movd (%1),%%xmm0 \n" \
"movd (%1,%2,1),%%xmm1 \n" \
"lea 0x4(%1),%1 \n" \
@@ -1281,10 +1310,41 @@
"packuswb %%xmm1,%%xmm1 \n" \
"packuswb %%xmm2,%%xmm2 \n"
-void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,
+// Convert 8 pixels: 2 UV and 8 Y. Asm fragment pasted into the row loops; callers in this file bind %0=y_buf, %1=u_buf, %2=v_buf-u_buf, %5=&kYuvConstants.kUVToB and keep xmm4 zeroed.
+#define YUV411TORGB /* result: 8 clamped bytes per channel in the low halves of xmm0, xmm1, xmm2 */ \
+ "pinsrw $0x0,(%1),%%xmm0 \n" /* load exactly 2 U bytes; a 4-byte movd would read 2 bytes beyond the chroma consumed per pass */ \
+ "pinsrw $0x0,(%1,%2,1),%%xmm1 \n" /* load exactly 2 V bytes; lanes above word 0 keep stale data that the unpacks below discard */ \
+ "lea 0x2(%1),%1 \n" /* advance the U pointer by 2 bytes */ \
+ "punpcklbw %%xmm1,%%xmm0 \n" /* low 4 bytes become U0 V0 U1 V1 */ \
+ "punpcklwd %%xmm0,%%xmm0 \n" /* duplicate each UV pair: UV0 UV0 UV1 UV1 ... */ \
+ "punpckldq %%xmm0,%%xmm0 \n" /* duplicate again: UV0 for pixels 0-3, UV1 for pixels 4-7 */ \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "movdqa %%xmm0,%%xmm2 \n" /* xmm0/xmm1/xmm2 now each hold the UV pairs, one copy per channel */ \
+ "pmaddubsw (%5),%%xmm0 \n" /* multiply-add every UV pair with the byte table at %5 -> 8 words */ \
+ "pmaddubsw 16(%5),%%xmm1 \n" /* same with the table at +16 (presumably G; confirm struct layout) */ \
+ "pmaddubsw 32(%5),%%xmm2 \n" /* same with the table at +32 (presumably R; confirm struct layout) */ \
+ "psubw 48(%5),%%xmm0 \n" /* subtract the per-channel word tables at +48, +64, +80 */ \
+ "psubw 64(%5),%%xmm1 \n" \
+ "psubw 80(%5),%%xmm2 \n" \
+ "movq (%0),%%xmm3 \n" /* load 8 Y bytes */ \
+ "lea 0x8(%0),%0 \n" /* advance the Y pointer by 8 bytes */ \
+ "punpcklbw %%xmm4,%%xmm3 \n" /* widen Y to 8 words; relies on xmm4 being zero */ \
+ "psubsw 96(%5),%%xmm3 \n" /* saturating subtract of the word table at +96 */ \
+ "pmullw 112(%5),%%xmm3 \n" /* scale by the word table at +112 (presumably the YG row; confirm) */ \
+ "paddsw %%xmm3,%%xmm0 \n" /* add the scaled Y term to each channel, saturating */ \
+ "paddsw %%xmm3,%%xmm1 \n" \
+ "paddsw %%xmm3,%%xmm2 \n" \
+ "psraw $0x6,%%xmm0 \n" /* drop the 6 fractional bits (coefficients are scaled by 64) */ \
+ "psraw $0x6,%%xmm1 \n" \
+ "psraw $0x6,%%xmm2 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" /* clamp the words to unsigned bytes */ \
+ "packuswb %%xmm1,%%xmm1 \n" \
+ "packuswb %%xmm2,%%xmm2 \n"
+
+void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
- uint8* rgb_buf,
+ uint8* argb_buf,
int width) {
asm volatile (
"sub %1,%2 \n"
@@ -1292,7 +1352,7 @@
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
- YUVTORGB
+ YUV444TORGB
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
@@ -1306,7 +1366,7 @@
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
+ "+r"(argb_buf), // %3
"+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5
: "memory", "cc"
@@ -1316,10 +1376,10 @@
);
}
-void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf,
+void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
- uint8* rgb_buf,
+ uint8* argb_buf,
int width) {
asm volatile (
"sub %1,%2 \n"
@@ -1327,7 +1387,182 @@
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
- YUVTORGB
+ YUV422TORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,(%3) \n"
+ "movdqa %%xmm1,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(u_buf), // %1
+ "+r"(v_buf), // %2
+ "+r"(argb_buf), // %3
+ "+rm"(width) // %4
+ : "r"(&kYuvConstants.kUVToB) // %5
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,  // Converts one row, 8 pixels per pass; y_buf: 8 Y bytes consumed per pass
+ const uint8* u_buf,  // U row; pointer advances 2 bytes per pass (2 UV pairs cover 8 pixels)
+ const uint8* v_buf,  // V row; read at the same running offset as u_buf
+ uint8* argb_buf,  // output; 32 bytes written per pass with movdqa, so it must be 16-byte aligned
+ int width) {  // pixel count; the loop repeats while the remaining count is > 0, in steps of 8
+ asm volatile (
+ "sub %1,%2 \n"  // %2 = v_buf - u_buf, so V is addressed as (%1,%2,1)
+ "pcmpeqb %%xmm5,%%xmm5 \n"  // xmm5 = all ones: supplies 0xFF as the 4th byte of every pixel
+ "pxor %%xmm4,%%xmm4 \n"  // xmm4 = 0: used by the macro to widen Y bytes to words
+ ".p2align 4 \n"
+ "1: \n"
+ YUV411TORGB  // leaves 8 bytes per channel in xmm0, xmm1, xmm2
+ "punpcklbw %%xmm1,%%xmm0 \n"  // interleave xmm0/xmm1 bytes -> first two bytes of each pixel
+ "punpcklbw %%xmm5,%%xmm2 \n"  // interleave xmm2 bytes with 0xFF -> last two bytes of each pixel
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"  // pixels 0-3 as 4-byte groups: xmm0, xmm1, xmm2 channel bytes, then 0xFF
+ "punpckhwd %%xmm2,%%xmm1 \n"  // pixels 4-7
+ "movdqa %%xmm0,(%3) \n"  // aligned 16-byte store
+ "movdqa %%xmm1,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"  // advance the output by 32 bytes
+ "sub $0x8,%4 \n"  // 8 pixels done; a final partial group still writes a full 8 pixels
+ "jg 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(u_buf), // %1
+ "+r"(v_buf), // %2
+ "+r"(argb_buf), // %3
+ "+rm"(width) // %4
+ : "r"(&kYuvConstants.kUVToB) // %5
+ : "memory", "cc"
+#if defined(__SSE2__)  // xmm clobbers are only declared when the compiler targets SSE2
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,  // Converts one row, 8 pixels per pass; y_buf: 8 Y bytes consumed per pass
+ const uint8* u_buf,  // U row; pointer advances 8 bytes per pass (one UV pair per pixel)
+ const uint8* v_buf,  // V row; read at the same running offset as u_buf
+ uint8* argb_buf,  // output; 32 bytes written per pass with movdqu (unaligned stores)
+ int width) {  // pixel count; the loop repeats while the remaining count is > 0, in steps of 8
+ asm volatile (
+ "sub %1,%2 \n"  // %2 = v_buf - u_buf, so V is addressed as (%1,%2,1)
+ "pcmpeqb %%xmm5,%%xmm5 \n"  // xmm5 = all ones: supplies 0xFF as the 4th byte of every pixel
+ "pxor %%xmm4,%%xmm4 \n"  // xmm4 = 0: used by the macro to widen Y bytes to words
+ ".p2align 4 \n"
+ "1: \n"
+ YUV444TORGB  // leaves 8 bytes per channel in xmm0, xmm1, xmm2
+ "punpcklbw %%xmm1,%%xmm0 \n"  // interleave xmm0/xmm1 bytes -> first two bytes of each pixel
+ "punpcklbw %%xmm5,%%xmm2 \n"  // interleave xmm2 bytes with 0xFF -> last two bytes of each pixel
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"  // pixels 0-3 as 4-byte groups: xmm0, xmm1, xmm2 channel bytes, then 0xFF
+ "punpckhwd %%xmm2,%%xmm1 \n"  // pixels 4-7
+ "movdqu %%xmm0,(%3) \n"  // unaligned 16-byte store
+ "movdqu %%xmm1,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"  // advance the output by 32 bytes
+ "sub $0x8,%4 \n"  // 8 pixels done; a final partial group still writes a full 8 pixels
+ "jg 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(u_buf), // %1
+ "+r"(v_buf), // %2
+ "+r"(argb_buf), // %3
+ "+rm"(width) // %4
+ : "r"(&kYuvConstants.kUVToB) // %5
+ : "memory", "cc"
+#if defined(__SSE2__)  // xmm clobbers are only declared when the compiler targets SSE2
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,  // Converts one row, 8 pixels per pass; y_buf: Y samples
+ const uint8* u_buf,  // U row; pointer advances 4 bytes per pass (4 UV pairs cover 8 pixels)
+ const uint8* v_buf,  // V row; read at the same running offset as u_buf
+ uint8* argb_buf,  // output; 32 bytes written per pass with movdqu (unaligned stores)
+ int width) {  // pixel count; the loop repeats while the remaining count is > 0, in steps of 8
+ asm volatile (
+ "sub %1,%2 \n"  // %2 = v_buf - u_buf, so V is addressed as (%1,%2,1)
+ "pcmpeqb %%xmm5,%%xmm5 \n"  // xmm5 = all ones: supplies 0xFF as the 4th byte of every pixel
+ "pxor %%xmm4,%%xmm4 \n"  // xmm4 = 0 (the shared macros use it to widen Y bytes to words)
+ ".p2align 4 \n"
+ "1: \n"
+ YUV422TORGB  // leaves 8 bytes per channel in xmm0, xmm1, xmm2
+ "punpcklbw %%xmm1,%%xmm0 \n"  // interleave xmm0/xmm1 bytes -> first two bytes of each pixel
+ "punpcklbw %%xmm5,%%xmm2 \n"  // interleave xmm2 bytes with 0xFF -> last two bytes of each pixel
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"  // pixels 0-3 as 4-byte groups: xmm0, xmm1, xmm2 channel bytes, then 0xFF
+ "punpckhwd %%xmm2,%%xmm1 \n"  // pixels 4-7
+ "movdqu %%xmm0,(%3) \n"  // unaligned 16-byte store
+ "movdqu %%xmm1,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"  // advance the output by 32 bytes
+ "sub $0x8,%4 \n"  // 8 pixels done; a final partial group still writes a full 8 pixels
+ "jg 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(u_buf), // %1
+ "+r"(v_buf), // %2
+ "+r"(argb_buf), // %3
+ "+rm"(width) // %4
+ : "r"(&kYuvConstants.kUVToB) // %5
+ : "memory", "cc"
+#if defined(__SSE2__)  // xmm clobbers are only declared when the compiler targets SSE2
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,  // Converts one row, 8 pixels per pass; y_buf: 8 Y bytes consumed per pass
+ const uint8* u_buf,  // U row; pointer advances 2 bytes per pass (2 UV pairs cover 8 pixels)
+ const uint8* v_buf,  // V row; read at the same running offset as u_buf
+ uint8* argb_buf,  // output; 32 bytes written per pass with movdqu (unaligned stores)
+ int width) {  // pixel count; the loop repeats while the remaining count is > 0, in steps of 8
+ asm volatile (
+ "sub %1,%2 \n"  // %2 = v_buf - u_buf, so V is addressed as (%1,%2,1)
+ "pcmpeqb %%xmm5,%%xmm5 \n"  // xmm5 = all ones: supplies 0xFF as the 4th byte of every pixel
+ "pxor %%xmm4,%%xmm4 \n"  // xmm4 = 0: used by the macro to widen Y bytes to words
+ ".p2align 4 \n"
+ "1: \n"
+ YUV411TORGB  // leaves 8 bytes per channel in xmm0, xmm1, xmm2
+ "punpcklbw %%xmm1,%%xmm0 \n"  // interleave xmm0/xmm1 bytes -> first two bytes of each pixel
+ "punpcklbw %%xmm5,%%xmm2 \n"  // interleave xmm2 bytes with 0xFF -> last two bytes of each pixel
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"  // pixels 0-3 as 4-byte groups: xmm0, xmm1, xmm2 channel bytes, then 0xFF
+ "punpckhwd %%xmm2,%%xmm1 \n"  // pixels 4-7
+ "movdqu %%xmm0,(%3) \n"  // unaligned 16-byte store
+ "movdqu %%xmm1,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"  // advance the output by 32 bytes
+ "sub $0x8,%4 \n"  // 8 pixels done; a final partial group still writes a full 8 pixels
+ "jg 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(u_buf), // %1
+ "+r"(v_buf), // %2
+ "+r"(argb_buf), // %3
+ "+rm"(width) // %4
+ : "r"(&kYuvConstants.kUVToB) // %5
+ : "memory", "cc"
+#if defined(__SSE2__)  // xmm clobbers are only declared when the compiler targets SSE2
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* bgra_buf,
+ int width) {
+ asm volatile (
+ "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ YUV422TORGB
"pcmpeqb %%xmm5,%%xmm5 \n"
"punpcklbw %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm5 \n"
@@ -1342,7 +1577,7 @@
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
+ "+r"(bgra_buf), // %3
"+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5
: "memory", "cc"
@@ -1352,10 +1587,10 @@
);
}
-void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
+void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
- uint8* rgb_buf,
+ uint8* abgr_buf,
int width) {
asm volatile (
"sub %1,%2 \n"
@@ -1363,7 +1598,7 @@
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
- YUVTORGB
+ YUV422TORGB
"punpcklbw %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
"movdqa %%xmm2,%%xmm1 \n"
@@ -1377,7 +1612,7 @@
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
+ "+r"(abgr_buf), // %3
"+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5
: "memory", "cc"
@@ -1387,10 +1622,10 @@
);
}
-void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
- uint8* rgb_buf,
+ uint8* bgra_buf,
int width) {
asm volatile (
"sub %1,%2 \n"
@@ -1398,42 +1633,7 @@
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%3) \n"
- "movdqu %%xmm1,0x10(%3) \n"
- "lea 0x20(%3),%3 \n"
- "sub $0x8,%4 \n"
- "jg 1b \n"
- : "+r"(y_buf), // %0
- "+r"(u_buf), // %1
- "+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
- "+rm"(width) // %4
- : "r"(&kYuvConstants.kUVToB) // %5
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
- asm volatile (
- "sub %1,%2 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- YUVTORGB
+ YUV422TORGB
"pcmpeqb %%xmm5,%%xmm5 \n"
"punpcklbw %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm5 \n"
@@ -1448,7 +1648,7 @@
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
+ "+r"(bgra_buf), // %3
"+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5
: "memory", "cc"
@@ -1458,10 +1658,10 @@
);
}
-void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
+void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
- uint8* rgb_buf,
+ uint8* abgr_buf,
int width) {
asm volatile (
"sub %1,%2 \n"
@@ -1469,7 +1669,7 @@
"pxor %%xmm4,%%xmm4 \n"
".p2align 4 \n"
"1: \n"
- YUVTORGB
+ YUV422TORGB
"punpcklbw %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
"movdqa %%xmm2,%%xmm1 \n"
@@ -1483,7 +1683,7 @@
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
+ "+r"(abgr_buf), // %3
"+rm"(width) // %4
: "r"(&kYuvConstants.kUVToB) // %5
: "memory", "cc"
@@ -1493,63 +1693,7 @@
);
}
-void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
- asm volatile (
- "sub %1,%2 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
- "1: \n"
- "movd (%1),%%xmm0 \n"
- "movd (%1,%2,1),%%xmm1 \n"
- "lea 0x4(%1),%1 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pmaddubsw (%5),%%xmm0 \n"
- "pmaddubsw 16(%5),%%xmm1 \n"
- "pmaddubsw 32(%5),%%xmm2 \n"
- "psubw 48(%5),%%xmm0 \n"
- "psubw 64(%5),%%xmm1 \n"
- "psubw 80(%5),%%xmm2 \n"
- "movd (%0),%%xmm3 \n"
- "lea 0x4(%0),%0 \n"
- "punpcklbw %%xmm4,%%xmm3 \n"
- "psubsw 96(%5),%%xmm3 \n"
- "pmullw 112(%5),%%xmm3 \n"
- "paddsw %%xmm3,%%xmm0 \n"
- "paddsw %%xmm3,%%xmm1 \n"
- "paddsw %%xmm3,%%xmm2 \n"
- "psraw $0x6,%%xmm0 \n"
- "psraw $0x6,%%xmm1 \n"
- "psraw $0x6,%%xmm2 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "packuswb %%xmm2,%%xmm2 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "sub $0x4,%4 \n"
- "movdqa %%xmm0,(%3) \n"
- "lea 0x10(%3),%3 \n"
- "jg 1b \n"
- : "+r"(y_buf), // %0
- "+r"(u_buf), // %1
- "+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
- "+rm"(width) // %4
- : "r"(&kYuvConstants.kUVToB) // %5
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif
+#endif // HAS_I422TOARGBROW_SSSE3
#ifdef HAS_YTOARGBROW_SSE2
void YToARGBRow_SSE2(const uint8* y_buf,