ARGB to and from I420 ported to x64
BUG=none
TEST=media_unittests
Review URL: http://webrtc-codereview.appspot.com/266003
git-svn-id: http://libyuv.googlecode.com/svn/trunk@61 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 88ce475..090c1a6 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -15,62 +15,128 @@
#ifdef HAS_ARGBTOYROW_SSSE3
// Constant multiplication table for converting ARGB to I400.
-extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = {
- 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u
+static const vec8 kARGBToY = {
+ 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};
-extern "C" TALIGN16(const uint8, kAdd16[16]) = {
- 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u
+static const uvec8 kAddY16 = {
+ 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+ 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
};
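+
+// Scalar sketch (reference only, not used by the SIMD path) of the Y
+// conversion these tables drive: the coefficients are scaled by 128, so
+// the weighted sum is shifted right by 7 before kAddY16's bias is added.
+static inline uint8 ARGBToY_C(uint8 b, uint8 g, uint8 r) {
+  return static_cast<uint8>(((13 * b + 65 * g + 33 * r) >> 7) + 16);
+}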
+#ifdef HAS_ARGBTOUVROW_SSSE3
+static const vec8 kARGBToU = {
+ 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
+};
+
+static const vec8 kARGBToV = {
+ -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
+};
+static const uvec8 kAddUV128 = {
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+#endif
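+
+// Scalar sketch (reference only) of the U and V conversions: coefficients
+// are scaled by 256, so the signed sum is shifted right by 8 before the
+// kAddUV128 bias recentres it at 128.
+static inline uint8 ARGBToU_C(uint8 b, uint8 g, uint8 r) {
+  return static_cast<uint8>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
+}
+static inline uint8 ARGBToV_C(uint8 b, uint8 g, uint8 r) {
+  return static_cast<uint8>(((112 * r - 94 * g - 18 * b) >> 8) + 128);
+}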
+
// Shuffle table for converting BG24 to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
+static const uvec8 kShuffleMaskBG24ToARGB = {
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};
// Shuffle table for converting RAW to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
+static const uvec8 kShuffleMaskRAWToARGB = {
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+// Shuffle table for converting ABGR to ARGB.
+static const uvec8 kShuffleMaskABGRToARGB = {
+ 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
+};
+
+// Shuffle table for converting BGRA to ARGB.
+static const uvec8 kShuffleMaskBGRAToARGB = {
+ 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
+};
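+
+// Scalar illustration of what pshufb does with the 16-byte masks above:
+// output byte i is taken from input byte mask[i]. kShuffleMaskABGRToARGB,
+// for example, swaps bytes 0 and 2 of every 4-byte pixel, exchanging
+// R and B. (Helper name here is illustrative only.)
+static inline void ShuffleBytes_C(const uint8* src, const uint8* mask,
+                                  uint8* dst) {
+  for (int i = 0; i < 16; ++i) {
+    dst[i] = src[mask[i]];
+  }
+}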
+
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
asm volatile(
- "movdqa (%3),%%xmm7\n"
- "movdqa (%4),%%xmm6\n"
- "movdqa %%xmm6,%%xmm5\n"
- "psllw $0x4,%%xmm5\n" // Generate a mask of 0x10 on each byte.
+ "pcmpeqb %%xmm5,%%xmm5\n"
+ "pslld $0x18,%%xmm5\n"
"1:"
- "movdqa (%0),%%xmm0\n"
- "pmaddubsw %%xmm7,%%xmm0\n"
- "movdqa 0x10(%0),%%xmm1\n"
- "psrlw $0x7,%%xmm0\n"
- "pmaddubsw %%xmm7,%%xmm1\n"
- "lea 0x20(%0),%0\n"
- "psrlw $0x7,%%xmm1\n"
- "packuswb %%xmm1,%%xmm0\n"
- "pmaddubsw %%xmm6,%%xmm0\n"
- "packuswb %%xmm0,%%xmm0\n"
- "paddb %%xmm5,%%xmm0\n"
- "movq %%xmm0,(%1)\n"
- "lea 0x8(%1),%1\n"
+ "movq (%0),%%xmm0\n"
+ "lea 0x8(%0),%0\n"
+ "punpcklbw %%xmm0,%%xmm0\n"
+ "movdqa %%xmm0,%%xmm1\n"
+ "punpcklwd %%xmm0,%%xmm0\n"
+ "punpckhwd %%xmm1,%%xmm1\n"
+ "por %%xmm5,%%xmm0\n"
+ "por %%xmm5,%%xmm1\n"
+ "movdqa %%xmm0,(%1)\n"
+ "movdqa %%xmm1,0x10(%1)\n"
+ "lea 0x20(%1),%1\n"
"sub $0x8,%2\n"
"ja 1b\n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "r"(kMultiplyMaskARGBToI400), // %3
- "r"(kAdd16) // %4
- : "memory"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
);
}
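+
+// Scalar sketch of the row above: each grey byte becomes B, G and R of
+// one ARGB pixel, and the 0xff000000 mask built with pcmpeqb/pslld
+// supplies the opaque alpha.
+static inline void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb,
+                                   int pix) {
+  for (int i = 0; i < pix; ++i) {
+    dst_argb[0] = dst_argb[1] = dst_argb[2] = src_y[i];
+    dst_argb[3] = 0xff;
+    dst_argb += 4;
+  }
+}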
+
+void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
+ asm volatile(
+ "movdqa %3,%%xmm5\n"
+"1:"
+ "movdqa (%0),%%xmm0\n"
+ "lea 0x10(%0),%0\n"
+ "pshufb %%xmm5,%%xmm0\n"
+ "movdqa %%xmm0,(%1)\n"
+ "lea 0x10(%1),%1\n"
+ "sub $0x4,%2\n"
+ "ja 1b\n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskABGRToARGB) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm5"
#endif
-#ifdef HAS_BG24TOARGBROW_SSSE3
+);
+}
+
+void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
+ asm volatile(
+ "movdqa %3,%%xmm5\n"
+"1:"
+ "movdqa (%0),%%xmm0\n"
+ "lea 0x10(%0),%0\n"
+ "pshufb %%xmm5,%%xmm0\n"
+ "movdqa %%xmm0,(%1)\n"
+ "lea 0x10(%1),%1\n"
+ "sub $0x4,%2\n"
+ "ja 1b\n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskBGRAToARGB) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm5"
+#endif
+);
+}
+
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
- "pslld $0x18,%%xmm7\n"
- "movdqa (%3),%%xmm6\n"
+ "pcmpeqb %%xmm5,%%xmm5\n" // generate mask 0xff000000
+ "pslld $0x18,%%xmm5\n"
+ "movdqa %3,%%xmm4\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
@@ -78,19 +144,19 @@
"lea 0x30(%0),%0\n"
"movdqa %%xmm3,%%xmm2\n"
"palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
- "pshufb %%xmm6,%%xmm2\n"
- "por %%xmm7,%%xmm2\n"
+ "pshufb %%xmm4,%%xmm2\n"
+ "por %%xmm5,%%xmm2\n"
"palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
- "pshufb %%xmm6,%%xmm0\n"
+ "pshufb %%xmm4,%%xmm0\n"
"movdqa %%xmm2,0x20(%1)\n"
- "por %%xmm7,%%xmm0\n"
- "pshufb %%xmm6,%%xmm1\n"
+ "por %%xmm5,%%xmm0\n"
+ "pshufb %%xmm4,%%xmm1\n"
"movdqa %%xmm0,(%1)\n"
- "por %%xmm7,%%xmm1\n"
+ "por %%xmm5,%%xmm1\n"
"palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
- "pshufb %%xmm6,%%xmm3\n"
+ "pshufb %%xmm4,%%xmm3\n"
"movdqa %%xmm1,0x10(%1)\n"
- "por %%xmm7,%%xmm3\n"
+ "por %%xmm5,%%xmm3\n"
"movdqa %%xmm3,0x30(%1)\n"
"lea 0x40(%1),%1\n"
"sub $0x10,%2\n"
@@ -98,16 +164,19 @@
: "+r"(src_bg24), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
- : "r"(kShuffleMaskBG24ToARGB) // %3
- : "memory"
+ : "m"(kShuffleMaskBG24ToARGB) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
);
}
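+
+// Scalar sketch of the expansion above (RAWToARGBRow below is identical
+// except its mask also swaps R and B): three packed bytes per pixel
+// become four, with alpha forced to 0xff.
+static inline void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb,
+                                   int pix) {
+  for (int i = 0; i < pix; ++i) {
+    dst_argb[0] = src_bg24[0];  // B
+    dst_argb[1] = src_bg24[1];  // G
+    dst_argb[2] = src_bg24[2];  // R
+    dst_argb[3] = 0xff;         // A
+    src_bg24 += 3;
+    dst_argb += 4;
+  }
+}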
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
asm volatile(
- "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
- "pslld $0x18,%%xmm7\n"
- "movdqa (%3),%%xmm6\n"
+ "pcmpeqb %%xmm5,%%xmm5\n" // generate mask 0xff000000
+ "pslld $0x18,%%xmm5\n"
+ "movdqa %3,%%xmm4\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
@@ -115,19 +184,19 @@
"lea 0x30(%0),%0\n"
"movdqa %%xmm3,%%xmm2\n"
"palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
- "pshufb %%xmm6,%%xmm2\n"
- "por %%xmm7,%%xmm2\n"
+ "pshufb %%xmm4,%%xmm2\n"
+ "por %%xmm5,%%xmm2\n"
"palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
- "pshufb %%xmm6,%%xmm0\n"
+ "pshufb %%xmm4,%%xmm0\n"
"movdqa %%xmm2,0x20(%1)\n"
- "por %%xmm7,%%xmm0\n"
- "pshufb %%xmm6,%%xmm1\n"
+ "por %%xmm5,%%xmm0\n"
+ "pshufb %%xmm4,%%xmm1\n"
"movdqa %%xmm0,(%1)\n"
- "por %%xmm7,%%xmm1\n"
+ "por %%xmm5,%%xmm1\n"
"palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
- "pshufb %%xmm6,%%xmm3\n"
+ "pshufb %%xmm4,%%xmm3\n"
"movdqa %%xmm1,0x10(%1)\n"
- "por %%xmm7,%%xmm3\n"
+ "por %%xmm5,%%xmm3\n"
"movdqa %%xmm3,0x30(%1)\n"
"lea 0x40(%1),%1\n"
"sub $0x10,%2\n"
@@ -135,147 +204,320 @@
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
- : "r"(kShuffleMaskRAWToARGB) // %3
- : "memory"
+ : "m"(kShuffleMaskRAWToARGB) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+);
+}
+
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile(
+ "movdqa %4,%%xmm5\n"
+ "movdqa %3,%%xmm4\n"
+"1:"
+ "movdqa (%0),%%xmm0\n"
+ "movdqa 0x10(%0),%%xmm1\n"
+ "movdqa 0x20(%0),%%xmm2\n"
+ "movdqa 0x30(%0),%%xmm3\n"
+ "pmaddubsw %%xmm4,%%xmm0\n"
+ "pmaddubsw %%xmm4,%%xmm1\n"
+ "pmaddubsw %%xmm4,%%xmm2\n"
+ "pmaddubsw %%xmm4,%%xmm3\n"
+ "lea 0x40(%0),%0\n"
+ "phaddw %%xmm1,%%xmm0\n"
+ "phaddw %%xmm3,%%xmm2\n"
+ "psrlw $0x7,%%xmm0\n"
+ "psrlw $0x7,%%xmm2\n"
+ "packuswb %%xmm2,%%xmm0\n"
+ "paddb %%xmm5,%%xmm0\n"
+ "movdqa %%xmm0,(%1)\n"
+ "lea 0x10(%1),%1\n"
+ "sub $0x10,%2\n"
+ "ja 1b\n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kARGBToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
);
}
#endif
+#ifdef HAS_ARGBTOUVROW_SSSE3
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile(
+ "movdqa %5,%%xmm7\n"
+ "movdqa %6,%%xmm6\n"
+ "movdqa %7,%%xmm5\n"
+ "sub %1,%2\n"
+"1:"
+ "movdqa (%0),%%xmm0\n"
+ "movdqa 0x10(%0),%%xmm1\n"
+ "movdqa 0x20(%0),%%xmm2\n"
+ "movdqa 0x30(%0),%%xmm3\n"
+ "pavgb (%0,%4,1),%%xmm0\n"
+ "pavgb 0x10(%0,%4,1),%%xmm1\n"
+ "pavgb 0x20(%0,%4,1),%%xmm2\n"
+ "pavgb 0x30(%0,%4,1),%%xmm3\n"
+ "lea 0x40(%0),%0\n"
+ "movdqa %%xmm0,%%xmm4\n"
+ "shufps $0x88,%%xmm1,%%xmm0\n"
+ "shufps $0xdd,%%xmm1,%%xmm4\n"
+ "pavgb %%xmm4,%%xmm0\n"
+ "movdqa %%xmm2,%%xmm4\n"
+ "shufps $0x88,%%xmm3,%%xmm2\n"
+ "shufps $0xdd,%%xmm3,%%xmm4\n"
+ "pavgb %%xmm4,%%xmm2\n"
+ "movdqa %%xmm0,%%xmm1\n"
+ "movdqa %%xmm2,%%xmm3\n"
+ "pmaddubsw %%xmm7,%%xmm0\n"
+ "pmaddubsw %%xmm7,%%xmm2\n"
+ "pmaddubsw %%xmm6,%%xmm1\n"
+ "pmaddubsw %%xmm6,%%xmm3\n"
+ "phaddw %%xmm2,%%xmm0\n"
+ "phaddw %%xmm3,%%xmm1\n"
+ "psraw $0x8,%%xmm0\n"
+ "psraw $0x8,%%xmm1\n"
+ "packsswb %%xmm1,%%xmm0\n"
+ "paddb %%xmm5,%%xmm0\n"
+ "movlps %%xmm0,(%1)\n"
+ "movhps %%xmm0,(%1,%2,1)\n"
+ "lea 0x8(%1),%1\n"
+ "sub $0x10,%3\n"
+ "ja 1b\n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"(static_cast<intptr_t>(src_stride_argb)), // %4
+ "m"(kARGBToU), // %5
+ "m"(kARGBToV), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+);
+}
+#endif
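+
+// Scalar sketch of the subsampling performed above: U and V come from the
+// average of each 2x2 block of pixels. Note pavgb rounds up while the
+// plain shifts here truncate, so results can differ by one.
+static inline void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
+                                 uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_argb1 = src_argb0 + src_stride_argb;
+  for (int x = 0; x < width; x += 2) {
+    uint8 b = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2;
+    uint8 g = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2;
+    uint8 r = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2;
+    *dst_u++ = ARGBToU_C(b, g, r);
+    *dst_v++ = ARGBToV_C(b, g, r);
+    src_argb0 += 8;
+    src_argb1 += 8;
+  }
+}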
+
+// The following code requires 6 registers and prefers 7 registers.
+// Using 7 registers requires -fpic to be off and -fomit-frame-pointer on.
+#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2
#if defined(__x86_64__)
+#define REG_a "rax"
+#define REG_d "rdx"
+#else
+#define REG_a "eax"
+#define REG_d "edx"
+#endif
+#if defined(__APPLE__) || defined(__x86_64__)
+#define OMITFP
+#else
+#define OMITFP __attribute__((optimize("omit-frame-pointer")))
+#endif
-// 64 bit linux gcc version
+#if defined(__APPLE__)
+// The REG6 version uses one fewer register but is slower.
+#define REG6
+#endif
-void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi
- const uint8* u_buf, // rsi
- const uint8* v_buf, // rdx
- uint8* rgb_buf, // rcx
- int width) { // r8
- asm volatile(
-"1:"
- "movzb (%1),%%r10\n"
- "lea 1(%1),%1\n"
- "movzb (%2),%%r11\n"
- "lea 1(%2),%2\n"
- "movq 2048(%5,%%r10,8),%%xmm0\n"
- "movzb (%0),%%r10\n"
- "movq 4096(%5,%%r11,8),%%xmm1\n"
- "movzb 0x1(%0),%%r11\n"
- "paddsw %%xmm1,%%xmm0\n"
- "movq (%5,%%r10,8),%%xmm2\n"
- "lea 2(%0),%0\n"
- "movq (%5,%%r11,8),%%xmm3\n"
- "paddsw %%xmm0,%%xmm2\n"
- "paddsw %%xmm0,%%xmm3\n"
- "shufps $0x44,%%xmm3,%%xmm2\n"
- "psraw $0x6,%%xmm2\n"
- "packuswb %%xmm2,%%xmm2\n"
- "movq %%xmm2,0x0(%3)\n"
- "lea 8(%3),%3\n"
- "sub $0x2,%4\n"
+#ifdef REG6
+// The 6-register version has only REG_a available as a temporary.
+#define CLOBBER "%"REG_a
+#define YUVTORGB \
+ "1:" \
+ "movzb (%1),%%"REG_a"\n" \
+ "lea 1(%1),%1\n" \
+ "movq 2048(%5,%%"REG_a",8),%%xmm0\n" \
+ "movzb (%2),%%"REG_a"\n" \
+ "lea 1(%2),%2\n" \
+ "movq 4096(%5,%%"REG_a",8),%%xmm1\n" \
+ "paddsw %%xmm1,%%xmm0\n" \
+ "movzb (%0),%%"REG_a"\n" \
+ "movq 0(%5,%%"REG_a",8),%%xmm2\n" \
+ "movzb 0x1(%0),%%"REG_a"\n" \
+ "movq 0(%5,%%"REG_a",8),%%xmm3\n" \
+ "lea 2(%0),%0\n" \
+ "paddsw %%xmm0,%%xmm2\n" \
+ "paddsw %%xmm0,%%xmm3\n" \
+ "shufps $0x44,%%xmm3,%%xmm2\n" \
+ "psraw $0x6,%%xmm2\n" \
+ "packuswb %%xmm2,%%xmm2\n" \
+ "movq %%xmm2,0x0(%3)\n" \
+ "lea 8(%3),%3\n" \
+ "sub $0x2,%4\n" \
"ja 1b\n"
+#else
+#define CLOBBER "%"REG_a, "%"REG_d
+// This version produces 2 pixels per loop iteration.
+#define YUVTORGB \
+"1:" \
+ "movzb (%1),%%"REG_a"\n" \
+ "lea 1(%1),%1\n" \
+ "movzb (%2),%%"REG_d"\n" \
+ "lea 1(%2),%2\n" \
+ "movq 2048(%5,%%"REG_a",8),%%xmm0\n" \
+ "movzb 0(%0),%%"REG_a"\n" \
+ "movq 4096(%5,%%"REG_d",8),%%xmm1\n" \
+ "paddsw %%xmm1,%%xmm0\n" \
+ "movzb 1(%0),%%"REG_d"\n" \
+ "punpcklqdq %%xmm0,%%xmm0\n" \
+ "lea 2(%0),%0\n" \
+ "movq 0(%5,%%"REG_a",8),%%xmm1\n" \
+ "movhps 0(%5,%%"REG_d",8),%%xmm1\n" \
+ "paddsw %%xmm0,%%xmm1\n" \
+ "psraw $6,%%xmm1\n" \
+ "packuswb %%xmm1,%%xmm1\n" \
+ "movq %%xmm1,0(%3)\n" \
+ "lea 8(%3),%3\n" \
+ "sub $0x2,%4\n" \
+ "ja 1b\n"
+// This version produces 4 pixels per loop iteration.
+#define YUVTORGB4 \
+"1:" \
+ "movzb 0(%1),%%"REG_a"\n" \
+ "movzb 0(%2),%%"REG_d"\n" \
+ "movq 2048(%5,%%"REG_a",8),%%xmm0\n" \
+ "movzb 0(%0),%%"REG_a"\n" \
+ "movq 4096(%5,%%"REG_d",8),%%xmm1\n" \
+ "paddsw %%xmm1,%%xmm0\n" \
+ "movzb 1(%0),%%"REG_d"\n" \
+ "punpcklqdq %%xmm0,%%xmm0\n" \
+ "movq 0(%5,%%"REG_a",8),%%xmm2\n" \
+ "movhps 0(%5,%%"REG_d",8),%%xmm2\n" \
+ "paddsw %%xmm0,%%xmm2\n" \
+ "psraw $6,%%xmm2\n" \
+ "movzb 1(%1),%%"REG_a"\n" \
+ "movzb 1(%2),%%"REG_d"\n" \
+ "movq 2048(%5,%%"REG_a",8),%%xmm0\n" \
+ "movzb 2(%0),%%"REG_a"\n" \
+ "movq 4096(%5,%%"REG_d",8),%%xmm1\n" \
+ "paddsw %%xmm1,%%xmm0\n" \
+ "movzb 3(%0),%%"REG_d"\n" \
+ "punpcklqdq %%xmm0,%%xmm0\n" \
+ "movq 0(%5,%%"REG_a",8),%%xmm3\n" \
+ "movhps 0(%5,%%"REG_d",8),%%xmm3\n" \
+ "paddsw %%xmm0,%%xmm3\n" \
+ "psraw $6,%%xmm3\n" \
+ "lea 2(%1),%1\n" \
+ "lea 2(%2),%2\n" \
+ "lea 4(%0),%0\n" \
+ "packuswb %%xmm3,%%xmm2\n" \
+ "movdqa %%xmm2,0(%3)\n" \
+ "lea 16(%3),%3\n" \
+ "sub $0x4,%4\n" \
+ "ja 1b\n"
+#endif
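+
+// What YUVTORGB computes for each pixel, as a scalar sketch (assuming the
+// table is viewed as int16[768][4]): kCoefficientsRgbY holds three
+// 256-entry tables of four int16 channel contributions scaled by 64:
+// Y at index 0, U at 256 (byte offset 2048), V at 512 (byte offset 4096).
+static inline void YuvPixel_C(const int16 coeff[768][4],
+                              uint8 y, uint8 u, uint8 v, uint8* rgb_buf) {
+  for (int i = 0; i < 4; ++i) {
+    // paddsw saturates each addition to int16; plain int arithmetic is
+    // close enough for a sketch.
+    int c = (coeff[y][i] + coeff[256 + u][i] + coeff[512 + v][i]) >> 6;
+    rgb_buf[i] = static_cast<uint8>(c < 0 ? 0 : (c > 255 ? 255 : c));  // packuswb
+  }
+}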
+
+// 6 or 7 registers
+void OMITFP FastConvertYUVToARGBRow_SSE2(const uint8* y_buf, // rdi
+ const uint8* u_buf, // rsi
+ const uint8* v_buf, // rdx
+ uint8* rgb_buf, // rcx
+ int width) { // r8
+ asm volatile(
+ YUVTORGB
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
- "+r"(width) // %4
- : "r" (_kCoefficientsRgbY) // %5
- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
+ "+rm"(width) // %4
+ : "r" (kCoefficientsRgbY) // %5
+ : "memory", "cc", CLOBBER
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
);
}
-void FastConvertYUVToBGRARow(const uint8* y_buf, // rdi
- const uint8* u_buf, // rsi
- const uint8* v_buf, // rdx
- uint8* rgb_buf, // rcx
- int width) { // r8
+// 6 or 7 registers
+void OMITFP FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf, // rdi
+ const uint8* u_buf, // rsi
+ const uint8* v_buf, // rdx
+ uint8* rgb_buf, // rcx
+ int width) { // r8
asm volatile(
-"1:"
- "movzb (%1),%%r10\n"
- "lea 1(%1),%1\n"
- "movzb (%2),%%r11\n"
- "lea 1(%2),%2\n"
- "movq 2048(%5,%%r10,8),%%xmm0\n"
- "movzb (%0),%%r10\n"
- "movq 4096(%5,%%r11,8),%%xmm1\n"
- "movzb 0x1(%0),%%r11\n"
- "paddsw %%xmm1,%%xmm0\n"
- "movq (%5,%%r10,8),%%xmm2\n"
- "lea 2(%0),%0\n"
- "movq (%5,%%r11,8),%%xmm3\n"
- "paddsw %%xmm0,%%xmm2\n"
- "paddsw %%xmm0,%%xmm3\n"
- "shufps $0x44,%%xmm3,%%xmm2\n"
- "psraw $0x6,%%xmm2\n"
- "packuswb %%xmm2,%%xmm2\n"
- "movq %%xmm2,0x0(%3)\n"
- "lea 8(%3),%3\n"
- "sub $0x2,%4\n"
- "ja 1b\n"
+ YUVTORGB4
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
- "+r"(width) // %4
- : "r" (_kCoefficientsBgraY) // %5
- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
+ "+rm"(width) // %4
+ : "r" (kCoefficientsRgbY) // %5
+ : "memory", "cc", CLOBBER
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
);
}
-void FastConvertYUVToABGRRow(const uint8* y_buf, // rdi
- const uint8* u_buf, // rsi
- const uint8* v_buf, // rdx
- uint8* rgb_buf, // rcx
- int width) { // r8
+void OMITFP FastConvertYUVToBGRARow_SSE2(const uint8* y_buf, // rdi
+ const uint8* u_buf, // rsi
+ const uint8* v_buf, // rdx
+ uint8* rgb_buf, // rcx
+ int width) { // r8
asm volatile(
-"1:"
- "movzb (%1),%%r10\n"
- "lea 1(%1),%1\n"
- "movzb (%2),%%r11\n"
- "lea 1(%2),%2\n"
- "movq 2048(%5,%%r10,8),%%xmm0\n"
- "movzb (%0),%%r10\n"
- "movq 4096(%5,%%r11,8),%%xmm1\n"
- "movzb 0x1(%0),%%r11\n"
- "paddsw %%xmm1,%%xmm0\n"
- "movq (%5,%%r10,8),%%xmm2\n"
- "lea 2(%0),%0\n"
- "movq (%5,%%r11,8),%%xmm3\n"
- "paddsw %%xmm0,%%xmm2\n"
- "paddsw %%xmm0,%%xmm3\n"
- "shufps $0x44,%%xmm3,%%xmm2\n"
- "psraw $0x6,%%xmm2\n"
- "packuswb %%xmm2,%%xmm2\n"
- "movq %%xmm2,0x0(%3)\n"
- "lea 8(%3),%3\n"
- "sub $0x2,%4\n"
- "ja 1b\n"
+ YUVTORGB
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
- "+r"(width) // %4
- : "r" (_kCoefficientsAbgrY) // %5
- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
+ "+rm"(width) // %4
+ : "r" (kCoefficientsBgraY) // %5
+ : "memory", "cc", CLOBBER
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
);
}
-void FastConvertYUV444ToRGB32Row(const uint8* y_buf, // rdi
- const uint8* u_buf, // rsi
- const uint8* v_buf, // rdx
- uint8* rgb_buf, // rcx
- int width) { // r8
+void OMITFP FastConvertYUVToABGRRow_SSE2(const uint8* y_buf, // rdi
+ const uint8* u_buf, // rsi
+ const uint8* v_buf, // rdx
+ uint8* rgb_buf, // rcx
+ int width) { // r8
+ asm volatile(
+ YUVTORGB
+ : "+r"(y_buf), // %0
+ "+r"(u_buf), // %1
+ "+r"(v_buf), // %2
+ "+r"(rgb_buf), // %3
+ "+rm"(width) // %4
+ : "r" (kCoefficientsAbgrY) // %5
+ : "memory", "cc", CLOBBER
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+);
+}
+
+// 6 registers
+void OMITFP FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf, // rdi
+ const uint8* u_buf, // rsi
+ const uint8* v_buf, // rdx
+ uint8* rgb_buf, // rcx
+ int width) { // r8
asm volatile(
"1:"
- "movzb (%1),%%r10\n"
+ "movzb (%1),%%"REG_a"\n"
"lea 1(%1),%1\n"
- "movzb (%2),%%r11\n"
+ "movq 2048(%5,%%"REG_a",8),%%xmm0\n"
+ "movzb (%2),%%"REG_a"\n"
"lea 1(%2),%2\n"
- "movq 2048(%5,%%r10,8),%%xmm0\n"
- "movzb (%0),%%r10\n"
- "movq 4096(%5,%%r11,8),%%xmm1\n"
+ "movq 4096(%5,%%"REG_a",8),%%xmm1\n"
"paddsw %%xmm1,%%xmm0\n"
- "movq (%5,%%r10,8),%%xmm2\n"
+ "movzb (%0),%%"REG_a"\n"
"lea 1(%0),%0\n"
+ "movq 0(%5,%%"REG_a",8),%%xmm2\n"
"paddsw %%xmm0,%%xmm2\n"
"shufps $0x44,%%xmm2,%%xmm2\n"
"psraw $0x6,%%xmm2\n"
@@ -288,23 +530,26 @@
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
- "+r"(width) // %4
- : "r" (_kCoefficientsRgbY) // %5
- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2"
+ "+rm"(width) // %4
+ : "r" (kCoefficientsRgbY) // %5
+ : "memory", "cc", "%"REG_a
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2"
+#endif
);
}
-void FastConvertYToRGB32Row(const uint8* y_buf, // rdi
- uint8* rgb_buf, // rcx
- int width) { // r8
+// 5 registers
+void FastConvertYToARGBRow_SSE2(const uint8* y_buf, // rdi
+ uint8* rgb_buf, // rcx
+ int width) { // r8
asm volatile(
"1:"
- "movzb (%0),%%r10\n"
- "movzb 0x1(%0),%%r11\n"
- "movq (%3,%%r10,8),%%xmm2\n"
+ "movzb (%0),%%"REG_a"\n"
+ "movzb 0x1(%0),%%"REG_d"\n"
+ "movq (%3,%%"REG_a",8),%%xmm2\n"
"lea 2(%0),%0\n"
- "movq (%3,%%r11,8),%%xmm3\n"
- "shufps $0x44,%%xmm3,%%xmm2\n"
+ "movhps (%3,%%"REG_d",8),%%xmm2\n"
"psraw $0x6,%%xmm2\n"
"packuswb %%xmm2,%%xmm2\n"
"movq %%xmm2,0x0(%1)\n"
@@ -313,154 +558,27 @@
"ja 1b\n"
: "+r"(y_buf), // %0
"+r"(rgb_buf), // %1
- "+r"(width) // %2
- : "r" (_kCoefficientsRgbY) // %3
- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
+ "+rm"(width) // %2
+ : "r" (kCoefficientsRgbY) // %3
+ : "memory", "cc", "%"REG_a, "%"REG_d
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2"
+#endif
);
}
-#elif defined(__i386__)
-// 32 bit gcc version
-
-void FastConvertYUVToRGB32Row(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
- asm(
- ".text\n"
-#if defined(OSX) || defined(IOS)
- ".globl _FastConvertYUVToRGB32Row\n"
-"_FastConvertYUVToRGB32Row:\n"
-#else
- ".global FastConvertYUVToRGB32Row\n"
-"FastConvertYUVToRGB32Row:\n"
#endif
- "pusha\n"
- "mov 0x24(%esp),%edx\n"
- "mov 0x28(%esp),%edi\n"
- "mov 0x2c(%esp),%esi\n"
- "mov 0x30(%esp),%ebp\n"
- "mov 0x34(%esp),%ecx\n"
-"1:"
- "movzbl (%edi),%eax\n"
- "lea 1(%edi),%edi\n"
- "movzbl (%esi),%ebx\n"
- "lea 1(%esi),%esi\n"
- "movq _kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
- "movzbl (%edx),%eax\n"
- "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
- "movzbl 0x1(%edx),%ebx\n"
- "movq _kCoefficientsRgbY(,%eax,8),%mm1\n"
- "lea 2(%edx),%edx\n"
- "movq _kCoefficientsRgbY(,%ebx,8),%mm2\n"
- "paddsw %mm0,%mm1\n"
- "paddsw %mm0,%mm2\n"
- "psraw $0x6,%mm1\n"
- "psraw $0x6,%mm2\n"
- "packuswb %mm2,%mm1\n"
- "movntq %mm1,0x0(%ebp)\n"
- "lea 8(%ebp),%ebp\n"
- "sub $0x2,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
+#ifdef HAS_FASTCONVERTYUVTOARGBROW_MMX
+// 32-bit MMX GCC version
-void FastConvertYUVToBGRARow(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
- asm(
- ".text\n"
-#if defined(OSX) || defined(IOS)
- ".globl _FastConvertYUVToBGRARow\n"
-"_FastConvertYUVToBGRARow:\n"
+#ifdef OSX
+#define UNDERSCORE "_"
#else
- ".global FastConvertYUVToBGRARow\n"
-"FastConvertYUVToBGRARow:\n"
+#define UNDERSCORE ""
#endif
- "pusha\n"
- "mov 0x24(%esp),%edx\n"
- "mov 0x28(%esp),%edi\n"
- "mov 0x2c(%esp),%esi\n"
- "mov 0x30(%esp),%ebp\n"
- "mov 0x34(%esp),%ecx\n"
-"1:"
- "movzbl (%edi),%eax\n"
- "lea 1(%edi),%edi\n"
- "movzbl (%esi),%ebx\n"
- "lea 1(%esi),%esi\n"
- "movq _kCoefficientsBgraY+2048(,%eax,8),%mm0\n"
- "movzbl (%edx),%eax\n"
- "paddsw _kCoefficientsBgraY+4096(,%ebx,8),%mm0\n"
- "movzbl 0x1(%edx),%ebx\n"
- "movq _kCoefficientsBgraY(,%eax,8),%mm1\n"
- "lea 2(%edx),%edx\n"
- "movq _kCoefficientsBgraY(,%ebx,8),%mm2\n"
- "paddsw %mm0,%mm1\n"
- "paddsw %mm0,%mm2\n"
- "psraw $0x6,%mm1\n"
- "psraw $0x6,%mm2\n"
- "packuswb %mm2,%mm1\n"
- "movntq %mm1,0x0(%ebp)\n"
- "lea 8(%ebp),%ebp\n"
- "sub $0x2,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-
-void FastConvertYUVToABGRRow(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width);
- asm(
- ".text\n"
-#if defined(OSX) || defined(IOS)
- ".globl _FastConvertYUVToABGRRow\n"
-"_FastConvertYUVToABGRRow:\n"
-#else
- ".global FastConvertYUVToABGRRow\n"
-"FastConvertYUVToABGRRow:\n"
-#endif
- "pusha\n"
- "mov 0x24(%esp),%edx\n"
- "mov 0x28(%esp),%edi\n"
- "mov 0x2c(%esp),%esi\n"
- "mov 0x30(%esp),%ebp\n"
- "mov 0x34(%esp),%ecx\n"
-
-"1:"
- "movzbl (%edi),%eax\n"
- "lea 1(%edi),%edi\n"
- "movzbl (%esi),%ebx\n"
- "lea 1(%esi),%esi\n"
- "movq _kCoefficientsAbgrY+2048(,%eax,8),%mm0\n"
- "movzbl (%edx),%eax\n"
- "paddsw _kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n"
- "movzbl 0x1(%edx),%ebx\n"
- "movq _kCoefficientsAbgrY(,%eax,8),%mm1\n"
- "lea 2(%edx),%edx\n"
- "movq _kCoefficientsAbgrY(,%ebx,8),%mm2\n"
- "paddsw %mm0,%mm1\n"
- "paddsw %mm0,%mm2\n"
- "psraw $0x6,%mm1\n"
- "psraw $0x6,%mm2\n"
- "packuswb %mm2,%mm1\n"
- "movntq %mm1,0x0(%ebp)\n"
- "lea 8(%ebp),%ebp\n"
- "sub $0x2,%ecx\n"
- "ja 1b\n"
- "popa\n"
- "ret\n"
-);
-
-void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
+void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
@@ -468,11 +586,11 @@
asm(
".text\n"
#if defined(OSX) || defined(IOS)
- ".globl _FastConvertYUV444ToRGB32Row\n"
-"_FastConvertYUV444ToRGB32Row:\n"
+ ".globl _FastConvertYUVToARGBRow_MMX\n"
+"_FastConvertYUVToARGBRow_MMX:\n"
#else
- ".global FastConvertYUV444ToRGB32Row\n"
-"FastConvertYUV444ToRGB32Row:\n"
+ ".global FastConvertYUVToARGBRow_MMX\n"
+"FastConvertYUVToARGBRow_MMX:\n"
#endif
"pusha\n"
"mov 0x24(%esp),%edx\n"
@@ -486,11 +604,149 @@
"lea 1(%edi),%edi\n"
"movzbl (%esi),%ebx\n"
"lea 1(%esi),%esi\n"
- "movq _kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+ "movq " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
"movzbl (%edx),%eax\n"
- "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
+ "paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
+ "movzbl 0x1(%edx),%ebx\n"
+ "movq " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm1\n"
+ "lea 2(%edx),%edx\n"
+ "movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm2\n"
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"
+ "movq %mm1,0x0(%ebp)\n"
+ "lea 8(%ebp),%ebp\n"
+ "sub $0x2,%ecx\n"
+ "ja 1b\n"
+ "popa\n"
+ "ret\n"
+);
+
+void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+ asm(
+ ".text\n"
+#if defined(OSX) || defined(IOS)
+ ".globl _FastConvertYUVToBGRARow_MMX\n"
+"_FastConvertYUVToBGRARow_MMX:\n"
+#else
+ ".global FastConvertYUVToBGRARow_MMX\n"
+"FastConvertYUVToBGRARow_MMX:\n"
+#endif
+ "pusha\n"
+ "mov 0x24(%esp),%edx\n"
+ "mov 0x28(%esp),%edi\n"
+ "mov 0x2c(%esp),%esi\n"
+ "mov 0x30(%esp),%ebp\n"
+ "mov 0x34(%esp),%ecx\n"
+
+"1:"
+ "movzbl (%edi),%eax\n"
+ "lea 1(%edi),%edi\n"
+ "movzbl (%esi),%ebx\n"
+ "lea 1(%esi),%esi\n"
+ "movq " UNDERSCORE "kCoefficientsBgraY+2048(,%eax,8),%mm0\n"
+ "movzbl (%edx),%eax\n"
+ "paddsw " UNDERSCORE "kCoefficientsBgraY+4096(,%ebx,8),%mm0\n"
+ "movzbl 0x1(%edx),%ebx\n"
+ "movq " UNDERSCORE "kCoefficientsBgraY(,%eax,8),%mm1\n"
+ "lea 2(%edx),%edx\n"
+ "movq " UNDERSCORE "kCoefficientsBgraY(,%ebx,8),%mm2\n"
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"
+ "movq %mm1,0x0(%ebp)\n"
+ "lea 8(%ebp),%ebp\n"
+ "sub $0x2,%ecx\n"
+ "ja 1b\n"
+ "popa\n"
+ "ret\n"
+);
+
+void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+ asm(
+ ".text\n"
+#if defined(OSX) || defined(IOS)
+ ".globl _FastConvertYUVToABGRRow_MMX\n"
+"_FastConvertYUVToABGRRow_MMX:\n"
+#else
+ ".global FastConvertYUVToABGRRow_MMX\n"
+"FastConvertYUVToABGRRow_MMX:\n"
+#endif
+ "pusha\n"
+ "mov 0x24(%esp),%edx\n"
+ "mov 0x28(%esp),%edi\n"
+ "mov 0x2c(%esp),%esi\n"
+ "mov 0x30(%esp),%ebp\n"
+ "mov 0x34(%esp),%ecx\n"
+
+"1:"
+ "movzbl (%edi),%eax\n"
+ "lea 1(%edi),%edi\n"
+ "movzbl (%esi),%ebx\n"
+ "lea 1(%esi),%esi\n"
+ "movq " UNDERSCORE "kCoefficientsAbgrY+2048(,%eax,8),%mm0\n"
+ "movzbl (%edx),%eax\n"
+ "paddsw " UNDERSCORE "kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n"
+ "movzbl 0x1(%edx),%ebx\n"
+ "movq " UNDERSCORE "kCoefficientsAbgrY(,%eax,8),%mm1\n"
+ "lea 2(%edx),%edx\n"
+ "movq " UNDERSCORE "kCoefficientsAbgrY(,%ebx,8),%mm2\n"
+ "paddsw %mm0,%mm1\n"
+ "paddsw %mm0,%mm2\n"
+ "psraw $0x6,%mm1\n"
+ "psraw $0x6,%mm2\n"
+ "packuswb %mm2,%mm1\n"
+ "movq %mm1,0x0(%ebp)\n"
+ "lea 8(%ebp),%ebp\n"
+ "sub $0x2,%ecx\n"
+ "ja 1b\n"
+ "popa\n"
+ "ret\n"
+);
+
+void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width);
+ asm(
+ ".text\n"
+#if defined(OSX) || defined(IOS)
+ ".globl _FastConvertYUV444ToARGBRow_MMX\n"
+"_FastConvertYUV444ToARGBRow_MMX:\n"
+#else
+ ".global FastConvertYUV444ToARGBRow_MMX\n"
+"FastConvertYUV444ToARGBRow_MMX:\n"
+#endif
+ "pusha\n"
+ "mov 0x24(%esp),%edx\n"
+ "mov 0x28(%esp),%edi\n"
+ "mov 0x2c(%esp),%esi\n"
+ "mov 0x30(%esp),%ebp\n"
+ "mov 0x34(%esp),%ecx\n"
+
+"1:"
+ "movzbl (%edi),%eax\n"
+ "lea 1(%edi),%edi\n"
+ "movzbl (%esi),%ebx\n"
+ "lea 1(%esi),%esi\n"
+ "movq " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+ "movzbl (%edx),%eax\n"
+ "paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
"lea 1(%edx),%edx\n"
- "paddsw _kCoefficientsRgbY(,%eax,8),%mm0\n"
+ "paddsw " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm0\n"
"psraw $0x6,%mm0\n"
"packuswb %mm0,%mm0\n"
"movd %mm0,0x0(%ebp)\n"
@@ -501,17 +757,17 @@
"ret\n"
);
-void FastConvertYToRGB32Row(const uint8* y_buf,
- uint8* rgb_buf,
- int width);
+void FastConvertYToARGBRow_MMX(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width);
asm(
".text\n"
#if defined(OSX) || defined(IOS)
- ".globl _FastConvertYToRGB32Row\n"
-"_FastConvertYToRGB32Row:\n"
+ ".globl _FastConvertYToARGBRow_MMX\n"
+"_FastConvertYToARGBRow_MMX:\n"
#else
- ".global FastConvertYToRGB32Row\n"
-"FastConvertYToRGB32Row:\n"
+ ".global FastConvertYToARGBRow_MMX\n"
+"FastConvertYToARGBRow_MMX:\n"
#endif
"push %ebx\n"
"mov 0x8(%esp),%eax\n"
@@ -520,10 +776,10 @@
"1:"
"movzbl (%eax),%ebx\n"
- "movq _kCoefficientsRgbY(,%ebx,8),%mm0\n"
+ "movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm0\n"
"psraw $0x6,%mm0\n"
"movzbl 0x1(%eax),%ebx\n"
- "movq _kCoefficientsRgbY(,%ebx,8),%mm1\n"
+ "movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm1\n"
"psraw $0x6,%mm1\n"
"packuswb %mm1,%mm0\n"
"lea 0x2(%eax),%eax\n"
@@ -535,125 +791,36 @@
"ret\n"
);
-#else
-// C reference code that mimic the YUV assembly.
-#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
-#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
- (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
+#endif
-static inline void YuvPixel(uint8 y,
- uint8 u,
- uint8 v,
- uint8* rgb_buf,
- int ashift,
- int rshift,
- int gshift,
- int bshift) {
-
- int b = _kCoefficientsRgbY[256+u][0];
- int g = _kCoefficientsRgbY[256+u][1];
- int r = _kCoefficientsRgbY[256+u][2];
- int a = _kCoefficientsRgbY[256+u][3];
-
- b = paddsw(b, _kCoefficientsRgbY[512+v][0]);
- g = paddsw(g, _kCoefficientsRgbY[512+v][1]);
- r = paddsw(r, _kCoefficientsRgbY[512+v][2]);
- a = paddsw(a, _kCoefficientsRgbY[512+v][3]);
-
- b = paddsw(b, _kCoefficientsRgbY[y][0]);
- g = paddsw(g, _kCoefficientsRgbY[y][1]);
- r = paddsw(r, _kCoefficientsRgbY[y][2]);
- a = paddsw(a, _kCoefficientsRgbY[y][3]);
-
- b >>= 6;
- g >>= 6;
- r >>= 6;
- a >>= 6;
-
- *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b) << bshift) |
- (packuswb(g) << gshift) |
- (packuswb(r) << rshift) |
- (packuswb(a) << ashift);
+void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ ABGRToARGBRow_SSSE3(src_argb, row, pix);
+ ARGBToYRow_SSSE3(row, dst_y, pix);
}
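+// The wrappers above and below reuse the ARGB kernels: each source row is
+// first shuffled into an ARGB scratch row (two rows for the UV versions,
+// spaced kMaxStride apart so the stride argument still applies).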
-void FastConvertYUVToRGB32Row(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
- for (int x = 0; x < width; x += 2) {
- uint8 u = u_buf[x >> 1];
- uint8 v = v_buf[x >> 1];
- uint8 y0 = y_buf[x];
- YuvPixel(y0, u, v, rgb_buf, 24, 16, 8, 0);
- if ((x + 1) < width) {
- uint8 y1 = y_buf[x + 1];
- YuvPixel(y1, u, v, rgb_buf + 4, 24, 16, 8, 0);
- }
- rgb_buf += 8; // Advance 2 pixels.
- }
+void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ BGRAToARGBRow_SSSE3(src_argb, row, pix);
+ ARGBToYRow_SSSE3(row, dst_y, pix);
}
-void FastConvertYUVToBGRARow(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
- for (int x = 0; x < width; x += 2) {
- uint8 u = u_buf[x >> 1];
- uint8 v = v_buf[x >> 1];
- uint8 y0 = y_buf[x];
- YuvPixel(y0, u, v, rgb_buf, 0, 8, 16, 24);
- if ((x + 1) < width) {
- uint8 y1 = y_buf[x + 1];
- YuvPixel(y1, u, v, rgb_buf + 4, 0, 8, 16, 24);
- }
- rgb_buf += 8; // Advance 2 pixels.
- }
+#ifdef HAS_ARGBTOUVROW_SSSE3
+void ABGRToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ ABGRToARGBRow_SSSE3(src_argb, row, pix);
+ ABGRToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
+ ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
}
-void FastConvertYUVToABGRRow(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
- for (int x = 0; x < width; x += 2) {
- uint8 u = u_buf[x >> 1];
- uint8 v = v_buf[x >> 1];
- uint8 y0 = y_buf[x];
- YuvPixel(y0, u, v, rgb_buf, 24, 0, 8, 16);
- if ((x + 1) < width) {
- uint8 y1 = y_buf[x + 1];
- YuvPixel(y1, u, v, rgb_buf + 4, 24, 0, 8, 16);
- }
- rgb_buf += 8; // Advance 2 pixels.
- }
+void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ BGRAToARGBRow_SSSE3(src_argb, row, pix);
+ BGRAToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
+ ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
}
-
-void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) {
- for (int x = 0; x < width; ++x) {
- uint8 u = u_buf[x];
- uint8 v = v_buf[x];
- uint8 y = y_buf[x];
- YuvPixel(y, u, v, rgb_buf, 24, 16, 8, 0);
- rgb_buf += 4; // Advance 1 pixel.
- }
-}
-
-void FastConvertYToRGB32Row(const uint8* y_buf,
- uint8* rgb_buf,
- int width) {
- for (int x = 0; x < width; ++x) {
- uint8 y = y_buf[x];
- YuvPixel(y, 128, 128, rgb_buf, 24, 16, 8, 0);
- rgb_buf += 4; // Advance 1 pixel.
- }
-}
-
#endif
} // extern "C"