Add YUY2 and UYVY row converters, including Unaligned and Any versions.
Also renames the FastConvert* rows to I420*/I444*/YToARGB* and realigns the
inline-assembly whitespace in row_posix.cc.
TEST=none
BUG=none
Review URL: https://webrtc-codereview.appspot.com/379009
git-svn-id: http://libyuv.googlecode.com/svn/trunk@168 16f28f9a-4ce2-e073-06de-1de4eb20be90
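For context: YUY2 packs pixels as Y0 U0 Y1 V0 ..., and UYVY as U0 Y0 V0 Y1 ..., i.e. one luma byte per pixel and one shared U/V pair per two pixels. The new SSE2 rows in this patch copy the luma bytes into a Y plane and, using pavgb, average the chroma bytes of two source rows into half-width U and V rows. The aligned variants use movdqa and so expect 16-byte-aligned pointers and widths that are a multiple of 16 pixels; the _Unaligned variants substitute movdqu loads/stores. The scalar sketch below is a reference for the YUY2 case only, with hypothetical names (YUY2ToYRow_Ref, YUY2ToUVRow_Ref); it is not code from this change.

    #include <stdint.h>

    // YUY2 byte layout per pixel pair: Y0 U Y1 V (UYVY swaps luma and chroma
    // positions: U Y0 V Y1, so the offsets below would shift by one).
    static void YUY2ToYRow_Ref(const uint8_t* src_yuy2, uint8_t* dst_y, int pix) {
      for (int x = 0; x < pix; ++x) {
        dst_y[x] = src_yuy2[x * 2];  // even bytes carry luma
      }
    }

    // Chroma is averaged with the row stride_yuy2 bytes below, rounding up,
    // which is what pavgb computes; pix is assumed even (one U/V per 2 pixels).
    static void YUY2ToUVRow_Ref(const uint8_t* src_yuy2, int stride_yuy2,
                                uint8_t* dst_u, uint8_t* dst_v, int pix) {
      const uint8_t* next = src_yuy2 + stride_yuy2;
      for (int x = 0; x < pix; x += 2) {
        *dst_u++ = (uint8_t)((src_yuy2[1] + next[1] + 1) >> 1);  // U at offset 1
        *dst_v++ = (uint8_t)((src_yuy2[3] + next[3] + 1) >> 1);  // V at offset 3
        src_yuy2 += 4;
        next += 4;
      }
    }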
diff --git a/source/row_posix.cc b/source/row_posix.cc
index bb213c4..fe6f62d 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -71,22 +71,22 @@
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
-"1: \n"
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm1 \n"
- "por %%xmm5,%%xmm0 \n"
- "por %%xmm5,%%xmm1 \n"
- "movdqa %%xmm0,(%1) \n"
- "movdqa %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "ja 1b \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "ja 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
@@ -100,15 +100,15 @@
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
asm volatile (
- "movdqa %3,%%xmm5 \n"
-"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "ja 1b \n"
+ "movdqa %3,%%xmm5 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "ja 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
@@ -123,15 +123,15 @@
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
asm volatile (
- "movdqa %3,%%xmm5 \n"
-"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x4,%2 \n"
- "ja 1b \n"
+ "movdqa %3,%%xmm5 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "ja 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
@@ -145,33 +145,33 @@
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
- "pslld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
-"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm3 \n"
- "lea 0x30(%0),%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqa %%xmm2,0x20(%1) \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqa %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqa %%xmm1,0x10(%1) \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqa %%xmm3,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "ja 1b \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqa %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqa %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "ja 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
@@ -185,33 +185,33 @@
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
- "pslld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
-"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm3 \n"
- "lea 0x30(%0),%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqa %%xmm2,0x20(%1) \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqa %%xmm0,(%1) \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqa %%xmm1,0x10(%1) \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqa %%xmm3,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
- "sub $0x10,%2 \n"
- "ja 1b \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqa %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqa %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "ja 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
@@ -225,28 +225,28 @@
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
-"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "ja 1b \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "ja 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
@@ -262,28 +262,28 @@
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
-"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "ja 1b \n"
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "ja 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
@@ -302,9 +302,9 @@
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
:
: "m"(kARGBToU), // %0
"m"(kARGBToV), // %1
@@ -315,42 +315,42 @@
#endif
);
asm volatile (
- "sub %1,%2 \n"
-"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm6 \n"
- "pavgb (%0,%4,1),%%xmm0 \n"
- "pavgb 0x10(%0,%4,1),%%xmm1 \n"
- "pavgb 0x20(%0,%4,1),%%xmm2 \n"
- "pavgb 0x30(%0,%4,1),%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "ja 1b \n"
+ "sub %1,%2 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm6 \n"
+ "pavgb (%0,%4,1),%%xmm0 \n"
+ "pavgb 0x10(%0,%4,1),%%xmm1 \n"
+ "pavgb 0x20(%0,%4,1),%%xmm2 \n"
+ "pavgb 0x30(%0,%4,1),%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "ja 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -366,9 +366,9 @@
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
:
: "m"(kARGBToU), // %0
"m"(kARGBToV), // %1
@@ -379,46 +379,46 @@
#endif
);
asm volatile (
- "sub %1,%2 \n"
-"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu (%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x10,%3 \n"
- "ja 1b \n"
+ "sub %1,%2 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu (%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "ja 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -432,7 +432,7 @@
}
#endif
-#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
+#ifdef HAS_I420TOARGBROW_SSSE3
#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0
@@ -476,235 +476,235 @@
// Convert 8 pixels
#define YUVTORGB \
- "movd (%1),%%xmm0 \n" \
- "movd (%1,%2,1),%%xmm1 \n" \
- "lea 0x4(%1),%1 \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
- "movdqa %%xmm0,%%xmm1 \n" \
- "movdqa %%xmm0,%%xmm2 \n" \
- "pmaddubsw (%5),%%xmm0 \n" \
- "pmaddubsw 16(%5),%%xmm1 \n" \
- "pmaddubsw 32(%5),%%xmm2 \n" \
- "psubw 48(%5),%%xmm0 \n" \
- "psubw 64(%5),%%xmm1 \n" \
- "psubw 80(%5),%%xmm2 \n" \
- "movq (%0),%%xmm3 \n" \
- "lea 0x8(%0),%0 \n" \
- "punpcklbw %%xmm4,%%xmm3 \n" \
- "psubsw 96(%5),%%xmm3 \n" \
- "pmullw 112(%5),%%xmm3 \n" \
- "paddsw %%xmm3,%%xmm0 \n" \
- "paddsw %%xmm3,%%xmm1 \n" \
- "paddsw %%xmm3,%%xmm2 \n" \
- "psraw $0x6,%%xmm0 \n" \
- "psraw $0x6,%%xmm1 \n" \
- "psraw $0x6,%%xmm2 \n" \
- "packuswb %%xmm0,%%xmm0 \n" \
- "packuswb %%xmm1,%%xmm1 \n" \
- "packuswb %%xmm2,%%xmm2 \n"
+ "movd (%1),%%xmm0 \n" \
+ "movd (%1,%2,1),%%xmm1 \n" \
+ "lea 0x4(%1),%1 \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "movdqa %%xmm0,%%xmm2 \n" \
+ "pmaddubsw (%5),%%xmm0 \n" \
+ "pmaddubsw 16(%5),%%xmm1 \n" \
+ "pmaddubsw 32(%5),%%xmm2 \n" \
+ "psubw 48(%5),%%xmm0 \n" \
+ "psubw 64(%5),%%xmm1 \n" \
+ "psubw 80(%5),%%xmm2 \n" \
+ "movq (%0),%%xmm3 \n" \
+ "lea 0x8(%0),%0 \n" \
+ "punpcklbw %%xmm4,%%xmm3 \n" \
+ "psubsw 96(%5),%%xmm3 \n" \
+ "pmullw 112(%5),%%xmm3 \n" \
+ "paddsw %%xmm3,%%xmm0 \n" \
+ "paddsw %%xmm3,%%xmm1 \n" \
+ "paddsw %%xmm3,%%xmm2 \n" \
+ "psraw $0x6,%%xmm0 \n" \
+ "psraw $0x6,%%xmm1 \n" \
+ "psraw $0x6,%%xmm2 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" \
+ "packuswb %%xmm1,%%xmm1 \n" \
+ "packuswb %%xmm2,%%xmm2 \n"
-void OMITFP FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf, // rdi
- const uint8* u_buf, // rsi
- const uint8* v_buf, // rdx
- uint8* rgb_buf, // rcx
- int width) { // r8
- asm volatile (
- "sub %1,%2 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
-
- "1: \n"
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,(%3) \n"
- "movdqa %%xmm1,0x10(%3) \n"
- "lea 0x20(%3),%3 \n"
- "sub $0x8,%4 \n"
- "ja 1b \n"
- : "+r"(y_buf), // %0
- "+r"(u_buf), // %1
- "+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
- "+rm"(width) // %4
- : "r"(&kYuvConstants.kUVToB) // %5
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf, // rdi
- const uint8* u_buf, // rsi
- const uint8* v_buf, // rdx
- uint8* rgb_buf, // rcx
- int width) { // r8
- asm volatile (
- "sub %1,%2 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
-
- "1: \n"
- YUVTORGB
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm5 \n"
- "movdqa %%xmm5,%%xmm0 \n"
- "punpcklwd %%xmm1,%%xmm5 \n"
- "punpckhwd %%xmm1,%%xmm0 \n"
- "movdqa %%xmm5,(%3) \n"
- "movdqa %%xmm0,0x10(%3) \n"
- "lea 0x20(%3),%3 \n"
- "sub $0x8,%4 \n"
- "ja 1b \n"
- : "+r"(y_buf), // %0
- "+r"(u_buf), // %1
- "+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
- "+rm"(width) // %4
- : "r"(&kYuvConstants.kUVToB) // %5
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf, // rdi
- const uint8* u_buf, // rsi
- const uint8* v_buf, // rdx
- uint8* rgb_buf, // rcx
- int width) { // r8
- asm volatile (
- "sub %1,%2 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
-
- "1: \n"
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm2 \n"
- "punpckhwd %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,(%3) \n"
- "movdqa %%xmm1,0x10(%3) \n"
- "lea 0x20(%3),%3 \n"
- "sub $0x8,%4 \n"
- "ja 1b \n"
- : "+r"(y_buf), // %0
- "+r"(u_buf), // %1
- "+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
- "+rm"(width) // %4
- : "r"(&kYuvConstants.kUVToB) // %5
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf, // rdi
- const uint8* u_buf, // rsi
- const uint8* v_buf, // rdx
- uint8* rgb_buf, // rcx
- int width) { // r8
- asm volatile (
- "sub %1,%2 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
-
- "1: \n"
- "movd (%1),%%xmm0 \n"
- "movd (%1,%2,1),%%xmm1 \n"
- "lea 0x4(%1),%1 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pmaddubsw (%5),%%xmm0 \n"
- "pmaddubsw 16(%5),%%xmm1 \n"
- "pmaddubsw 32(%5),%%xmm2 \n"
- "psubw 48(%5),%%xmm0 \n"
- "psubw 64(%5),%%xmm1 \n"
- "psubw 80(%5),%%xmm2 \n"
- "movd (%0),%%xmm3 \n"
- "lea 0x4(%0),%0 \n"
- "punpcklbw %%xmm4,%%xmm3 \n"
- "psubsw 96(%5),%%xmm3 \n"
- "pmullw 112(%5),%%xmm3 \n"
- "paddsw %%xmm3,%%xmm0 \n"
- "paddsw %%xmm3,%%xmm1 \n"
- "paddsw %%xmm3,%%xmm2 \n"
- "psraw $0x6,%%xmm0 \n"
- "psraw $0x6,%%xmm1 \n"
- "psraw $0x6,%%xmm2 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "packuswb %%xmm2,%%xmm2 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "movdqa %%xmm0,(%3) \n"
- "lea 0x10(%3),%3 \n"
- "sub $0x4,%4 \n"
- "ja 1b \n"
- : "+r"(y_buf), // %0
- "+r"(u_buf), // %1
- "+r"(v_buf), // %2
- "+r"(rgb_buf), // %3
- "+rm"(width) // %4
- : "r"(&kYuvConstants.kUVToB) // %5
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif
-
-#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
-
-void FastConvertYToARGBRow_SSE2(const uint8* y_buf, // rdi
+void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf, // rdi
+ const uint8* u_buf, // rsi
+ const uint8* v_buf, // rdx
uint8* rgb_buf, // rcx
int width) { // r8
asm volatile (
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
- "mov $0x10001000,%%eax \n"
- "movd %%eax,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "mov $0x012a012a,%%eax \n"
- "movd %%eax,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+
+ "1: \n"
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,(%3) \n"
+ "movdqa %%xmm1,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x8,%4 \n"
+ "ja 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(u_buf), // %1
+ "+r"(v_buf), // %2
+ "+r"(rgb_buf), // %3
+ "+rm"(width) // %4
+ : "r"(&kYuvConstants.kUVToB) // %5
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf, // rdi
+ const uint8* u_buf, // rsi
+ const uint8* v_buf, // rdx
+ uint8* rgb_buf, // rcx
+ int width) { // r8
+ asm volatile (
+ "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+
+ "1: \n"
+ YUVTORGB
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm5 \n"
+ "movdqa %%xmm5,%%xmm0 \n"
+ "punpcklwd %%xmm1,%%xmm5 \n"
+ "punpckhwd %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm5,(%3) \n"
+ "movdqa %%xmm0,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x8,%4 \n"
+ "ja 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(u_buf), // %1
+ "+r"(v_buf), // %2
+ "+r"(rgb_buf), // %3
+ "+rm"(width) // %4
+ : "r"(&kYuvConstants.kUVToB) // %5
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf, // rdi
+ const uint8* u_buf, // rsi
+ const uint8* v_buf, // rdx
+ uint8* rgb_buf, // rcx
+ int width) { // r8
+ asm volatile (
+ "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+
+ "1: \n"
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm2 \n"
+ "punpckhwd %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,(%3) \n"
+ "movdqa %%xmm1,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x8,%4 \n"
+ "ja 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(u_buf), // %1
+ "+r"(v_buf), // %2
+ "+r"(rgb_buf), // %3
+ "+rm"(width) // %4
+ : "r"(&kYuvConstants.kUVToB) // %5
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, // rdi
+ const uint8* u_buf, // rsi
+ const uint8* v_buf, // rdx
+ uint8* rgb_buf, // rcx
+ int width) { // r8
+ asm volatile (
+ "sub %1,%2 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+
+ "1: \n"
+ "movd (%1),%%xmm0 \n"
+ "movd (%1,%2,1),%%xmm1 \n"
+ "lea 0x4(%1),%1 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pmaddubsw (%5),%%xmm0 \n"
+ "pmaddubsw 16(%5),%%xmm1 \n"
+ "pmaddubsw 32(%5),%%xmm2 \n"
+ "psubw 48(%5),%%xmm0 \n"
+ "psubw 64(%5),%%xmm1 \n"
+ "psubw 80(%5),%%xmm2 \n"
+ "movd (%0),%%xmm3 \n"
+ "lea 0x4(%0),%0 \n"
+ "punpcklbw %%xmm4,%%xmm3 \n"
+ "psubsw 96(%5),%%xmm3 \n"
+ "pmullw 112(%5),%%xmm3 \n"
+ "paddsw %%xmm3,%%xmm0 \n"
+ "paddsw %%xmm3,%%xmm1 \n"
+ "paddsw %%xmm3,%%xmm2 \n"
+ "psraw $0x6,%%xmm0 \n"
+ "psraw $0x6,%%xmm1 \n"
+ "psraw $0x6,%%xmm2 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm0,(%3) \n"
+ "lea 0x10(%3),%3 \n"
+ "sub $0x4,%4 \n"
+ "ja 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(u_buf), // %1
+ "+r"(v_buf), // %2
+ "+r"(rgb_buf), // %3
+ "+rm"(width) // %4
+ : "r"(&kYuvConstants.kUVToB) // %5
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif
+
+#ifdef HAS_YTOARGBROW_SSE2
+
+void YToARGBRow_SSE2(const uint8* y_buf, // rdi
+ uint8* rgb_buf, // rcx
+ int width) { // r8
+ asm volatile (
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "mov $0x10001000,%%eax \n"
+ "movd %%eax,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "mov $0x012a012a,%%eax \n"
+ "movd %%eax,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
"1: \n"
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "psubusw %%xmm3,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "psubusw %%xmm3,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
// Step 2: Weave into ARGB
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "por %%xmm4,%%xmm1 \n"
- "movdqa %%xmm0,(%1) \n"
- "movdqa %%xmm1,16(%1) \n"
- "lea 32(%1),%1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "por %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm1,16(%1) \n"
+ "lea 32(%1),%1 \n"
- "sub $0x8,%2 \n"
- "ja 1b \n"
+ "sub $0x8,%2 \n"
+ "ja 1b \n"
: "+r"(y_buf), // %0
"+r"(rgb_buf), // %1
"+rm"(width) // %2
@@ -787,15 +787,15 @@
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
- "movdqa %3,%%xmm5 \n"
- "lea -0x10(%0),%0 \n"
+ "movdqa %3,%%xmm5 \n"
+ "lea -0x10(%0),%0 \n"
"1: \n"
- "movdqa (%0,%2),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "ja 1b \n"
+ "movdqa (%0,%2),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "ja 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
@@ -813,20 +813,20 @@
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
- "lea -0x10(%0),%0 \n"
+ "lea -0x10(%0),%0 \n"
"1: \n"
- "movdqu (%0,%2),%%xmm0 \n"
- "movdqu %%xmm0,%%xmm1 \n"
- "psllw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
- "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
- "pshufd $0x4e,%%xmm0,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "ja 1b \n"
+ "movdqu (%0,%2),%%xmm0 \n"
+ "movdqu %%xmm0,%%xmm1 \n"
+ "psllw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
+ "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
+ "pshufd $0x4e,%%xmm0,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "ja 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
@@ -839,6 +839,269 @@
}
#endif
+#ifdef HAS_YUY2TOI420ROW_SSE2
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "ja 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_y, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa (%0,%4,1),%%xmm2 \n"
+ "movdqa 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "ja 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_y), // %2
+ "+r"(pix) // %3
+ : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
+ uint8* dst_y, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "ja 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u, uint8* dst_y,
+ int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu (%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "ja 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_y), // %2
+ "+r"(pix) // %3
+ : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );
+}
+
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
+ asm volatile (
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "ja 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_y, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa (%0,%4,1),%%xmm2 \n"
+ "movdqa 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "ja 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_y), // %2
+ "+r"(pix) // %3
+ : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
+ uint8* dst_y, int pix) {
+ asm volatile (
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "ja 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+
+void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_y, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu (%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "ja 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_y), // %2
+ "+r"(pix) // %3
+ : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+#endif // HAS_YUY2TOI420ROW_SSE2
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv