ARGB to and from I420 ported to x64
BUG=none
TEST=media_unittests
Review URL: http://webrtc-codereview.appspot.com/266003

git-svn-id: http://libyuv.googlecode.com/svn/trunk@61 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 88ce475..090c1a6 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -15,62 +15,128 @@
 #ifdef HAS_ARGBTOYROW_SSSE3
 
 // Constant multiplication table for converting ARGB to I400.
-extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = {
-  13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u
+static const vec8 kARGBToY = {
+  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
 };
 
-extern "C" TALIGN16(const uint8, kAdd16[16]) = {
-  1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u
+static const uvec8 kAddY16 = {
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
 };
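+
+// Scalar reference for ARGBToYRow_SSSE3 below (illustrative only; the asm
+// processes 16 pixels per iteration). With ARGB stored as B,G,R,A bytes:
+//   uint8 y = ((13 * b + 65 * g + 33 * r) >> 7) + 16;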
 
+#ifdef HAS_ARGBTOUVROW_SSSE3
+static const vec8 kARGBToU = {
+  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
+};
+
+static const vec8 kARGBToV = {
+  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
+};
+
+static const uvec8 kAddUV128 = {
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+#endif
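+
+// Scalar reference for ARGBToUVRow_SSSE3 below (illustrative only). With
+// b, g, r taken as the average of a 2x2 block of pixels:
+//   uint8 u = ((112 * b - 74 * g - 38 * r) >> 8) + 128;
+//   uint8 v = ((112 * r - 94 * g - 18 * b) >> 8) + 128;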
+
 // Shuffle table for converting BG24 to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
+static const uvec8 kShuffleMaskBG24ToARGB = {
   0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
 };
 
 // Shuffle table for converting RAW to ARGB.
-extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
+static const uvec8 kShuffleMaskRAWToARGB = {
   2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
 };
 
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+// Shuffle table for converting ABGR to ARGB.
+static const uvec8 kShuffleMaskABGRToARGB = {
+  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
+};
+
+// Shuffle table for converting BGRA to ARGB.
+static const uvec8 kShuffleMaskBGRAToARGB = {
+  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
+};
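+
+// pshufb uses each mask byte as a source index, so per 16-byte block the
+// tables above behave roughly like:
+//   for (int i = 0; i < 16; ++i) dst[i] = src[mask[i]];
+// e.g. kShuffleMaskABGRToARGB reads bytes 2,1,0,3 of each pixel, turning
+// R,G,B,A storage order into B,G,R,A.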
+
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
   asm volatile(
-  "movdqa     (%3),%%xmm7\n"
-  "movdqa     (%4),%%xmm6\n"
-  "movdqa     %%xmm6,%%xmm5\n"
-  "psllw      $0x4,%%xmm5\n"  // Generate a mask of 0x10 on each byte.
+  "pcmpeqb    %%xmm5,%%xmm5\n"
+  "pslld      $0x18,%%xmm5\n"
 "1:"
-  "movdqa     (%0),%%xmm0\n"
-  "pmaddubsw  %%xmm7,%%xmm0\n"
-  "movdqa     0x10(%0),%%xmm1\n"
-  "psrlw      $0x7,%%xmm0\n"
-  "pmaddubsw  %%xmm7,%%xmm1\n"
-  "lea        0x20(%0),%0\n"
-  "psrlw      $0x7,%%xmm1\n"
-  "packuswb   %%xmm1,%%xmm0\n"
-  "pmaddubsw  %%xmm6,%%xmm0\n"
-  "packuswb   %%xmm0,%%xmm0\n"
-  "paddb      %%xmm5,%%xmm0\n"
-  "movq       %%xmm0,(%1)\n"
-  "lea        0x8(%1),%1\n"
+  "movq       (%0),%%xmm0\n"
+  "lea        0x8(%0),%0\n"
+  "punpcklbw  %%xmm0,%%xmm0\n"
+  "movdqa     %%xmm0,%%xmm1\n"
+  "punpcklwd  %%xmm0,%%xmm0\n"
+  "punpckhwd  %%xmm1,%%xmm1\n"
+  "por        %%xmm5,%%xmm0\n"
+  "por        %%xmm5,%%xmm1\n"
+  "movdqa     %%xmm0,(%1)\n"
+  "movdqa     %%xmm1,0x10(%1)\n"
+  "lea        0x20(%1),%1\n"
   "sub        $0x8,%2\n"
   "ja         1b\n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_y),      // %1
-    "+r"(pix)         // %2
-  : "r"(kMultiplyMaskARGBToI400),    // %3
-    "r"(kAdd16)   // %4
-  : "memory"
+  : "+r"(src_y),     // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
 );
 }
+
+void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
+  asm volatile(
+  "movdqa     %3,%%xmm5\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "lea        0x10(%0),%0\n"
+  "pshufb     %%xmm5,%%xmm0\n"
+  "movdqa     %%xmm0,(%1)\n"
+  "lea        0x10(%1),%1\n"
+  "sub        $0x4,%2\n"
+  "ja         1b\n"
+  : "+r"(src_abgr),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  : "m"(kShuffleMaskABGRToARGB)  // %3
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm5"
 #endif
 
-#ifdef  HAS_BG24TOARGBROW_SSSE3
+);
+}
+
+void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
+  asm volatile(
+  "movdqa     %3,%%xmm5\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "lea        0x10(%0),%0\n"
+  "pshufb     %%xmm5,%%xmm0\n"
+  "movdqa     %%xmm0,(%1)\n"
+  "lea        0x10(%1),%1\n"
+  "sub        $0x4,%2\n"
+  "ja         1b\n"
+  : "+r"(src_bgra),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  : "m"(kShuffleMaskBGRAToARGB)  // %3
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm5"
+#endif
+);
+}
+
 void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
   asm volatile(
-  "pcmpeqb    %%xmm7,%%xmm7\n"  // generate mask 0xff000000
-  "pslld      $0x18,%%xmm7\n"
-  "movdqa     (%3),%%xmm6\n"
+  "pcmpeqb    %%xmm5,%%xmm5\n"  // generate mask 0xff000000
+  "pslld      $0x18,%%xmm5\n"
+  "movdqa     %3,%%xmm4\n"
 "1:"
   "movdqa     (%0),%%xmm0\n"
   "movdqa     0x10(%0),%%xmm1\n"
@@ -78,19 +144,19 @@
   "lea        0x30(%0),%0\n"
   "movdqa     %%xmm3,%%xmm2\n"
   "palignr    $0x8,%%xmm1,%%xmm2\n"  // xmm2 = { xmm3[0:3] xmm1[8:15] }
-  "pshufb     %%xmm6,%%xmm2\n"
-  "por        %%xmm7,%%xmm2\n"
+  "pshufb     %%xmm4,%%xmm2\n"
+  "por        %%xmm5,%%xmm2\n"
   "palignr    $0xc,%%xmm0,%%xmm1\n"  // xmm1 = { xmm3[0:7] xmm0[12:15] }
-  "pshufb     %%xmm6,%%xmm0\n"
+  "pshufb     %%xmm4,%%xmm0\n"
   "movdqa     %%xmm2,0x20(%1)\n"
-  "por        %%xmm7,%%xmm0\n"
-  "pshufb     %%xmm6,%%xmm1\n"
+  "por        %%xmm5,%%xmm0\n"
+  "pshufb     %%xmm4,%%xmm1\n"
   "movdqa     %%xmm0,(%1)\n"
-  "por        %%xmm7,%%xmm1\n"
+  "por        %%xmm5,%%xmm1\n"
   "palignr    $0x4,%%xmm3,%%xmm3\n"  // xmm3 = { xmm3[4:15] }
-  "pshufb     %%xmm6,%%xmm3\n"
+  "pshufb     %%xmm4,%%xmm3\n"
   "movdqa     %%xmm1,0x10(%1)\n"
-  "por        %%xmm7,%%xmm3\n"
+  "por        %%xmm5,%%xmm3\n"
   "movdqa     %%xmm3,0x30(%1)\n"
   "lea        0x40(%1),%1\n"
   "sub        $0x10,%2\n"
@@ -98,16 +164,19 @@
   : "+r"(src_bg24),  // %0
     "+r"(dst_argb),  // %1
     "+r"(pix)        // %2
-  : "r"(kShuffleMaskBG24ToARGB)  // %3
-  : "memory"
+  : "m"(kShuffleMaskBG24ToARGB)  // %3
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
 );
 }
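+
+// Scalar equivalent of the repack above (illustrative only): each 3-byte
+// B,G,R pixel becomes a 4-byte B,G,R,A pixel with alpha forced to 0xff,
+// which is why 0x30 input bytes expand to 0x40 output bytes per iteration:
+//   dst[4 * i + 0] = src[3 * i + 0];  // B
+//   dst[4 * i + 1] = src[3 * i + 1];  // G
+//   dst[4 * i + 2] = src[3 * i + 2];  // R
+//   dst[4 * i + 3] = 255;             // A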
 
 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
   asm volatile(
-  "pcmpeqb    %%xmm7,%%xmm7\n"  // generate mask 0xff000000
-  "pslld      $0x18,%%xmm7\n"
-  "movdqa     (%3),%%xmm6\n"
+  "pcmpeqb    %%xmm5,%%xmm5\n"  // generate mask 0xff000000
+  "pslld      $0x18,%%xmm5\n"
+  "movdqa     %3,%%xmm4\n"
 "1:"
   "movdqa     (%0),%%xmm0\n"
   "movdqa     0x10(%0),%%xmm1\n"
@@ -115,19 +184,19 @@
   "lea        0x30(%0),%0\n"
   "movdqa     %%xmm3,%%xmm2\n"
   "palignr    $0x8,%%xmm1,%%xmm2\n"  // xmm2 = { xmm3[0:3] xmm1[8:15] }
-  "pshufb     %%xmm6,%%xmm2\n"
-  "por        %%xmm7,%%xmm2\n"
+  "pshufb     %%xmm4,%%xmm2\n"
+  "por        %%xmm5,%%xmm2\n"
   "palignr    $0xc,%%xmm0,%%xmm1\n"  // xmm1 = { xmm3[0:7] xmm0[12:15] }
-  "pshufb     %%xmm6,%%xmm0\n"
+  "pshufb     %%xmm4,%%xmm0\n"
   "movdqa     %%xmm2,0x20(%1)\n"
-  "por        %%xmm7,%%xmm0\n"
-  "pshufb     %%xmm6,%%xmm1\n"
+  "por        %%xmm5,%%xmm0\n"
+  "pshufb     %%xmm4,%%xmm1\n"
   "movdqa     %%xmm0,(%1)\n"
-  "por        %%xmm7,%%xmm1\n"
+  "por        %%xmm5,%%xmm1\n"
   "palignr    $0x4,%%xmm3,%%xmm3\n"  // xmm3 = { xmm3[4:15] }
-  "pshufb     %%xmm6,%%xmm3\n"
+  "pshufb     %%xmm4,%%xmm3\n"
   "movdqa     %%xmm1,0x10(%1)\n"
-  "por        %%xmm7,%%xmm3\n"
+  "por        %%xmm5,%%xmm3\n"
   "movdqa     %%xmm3,0x30(%1)\n"
   "lea        0x40(%1),%1\n"
   "sub        $0x10,%2\n"
@@ -135,147 +204,320 @@
   : "+r"(src_raw),   // %0
     "+r"(dst_argb),  // %1
     "+r"(pix)        // %2
-  : "r"(kShuffleMaskRAWToARGB)  // %3
-  : "memory"
+  : "m"(kShuffleMaskRAWToARGB)  // %3
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+);
+}
+
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile(
+  "movdqa     %4,%%xmm5\n"
+  "movdqa     %3,%%xmm4\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "movdqa     0x20(%0),%%xmm2\n"
+  "movdqa     0x30(%0),%%xmm3\n"
+  "pmaddubsw  %%xmm4,%%xmm0\n"
+  "pmaddubsw  %%xmm4,%%xmm1\n"
+  "pmaddubsw  %%xmm4,%%xmm2\n"
+  "pmaddubsw  %%xmm4,%%xmm3\n"
+  "lea        0x40(%0),%0\n"
+  "phaddw     %%xmm1,%%xmm0\n"
+  "phaddw     %%xmm3,%%xmm2\n"
+  "psrlw      $0x7,%%xmm0\n"
+  "psrlw      $0x7,%%xmm2\n"
+  "packuswb   %%xmm2,%%xmm0\n"
+  "paddb      %%xmm5,%%xmm0\n"
+  "movdqa     %%xmm0,(%1)\n"
+  "lea        0x10(%1),%1\n"
+  "sub        $0x10,%2\n"
+  "ja         1b\n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kARGBToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+
 );
 }
 #endif
 
+#ifdef HAS_ARGBTOUVROW_SSSE3
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile(
+  "movdqa     %5,%%xmm7\n"
+  "movdqa     %6,%%xmm6\n"
+  "movdqa     %7,%%xmm5\n"
+  "sub        %1,%2\n"
+"1:"
+  "movdqa     (%0),%%xmm0\n"
+  "movdqa     0x10(%0),%%xmm1\n"
+  "movdqa     0x20(%0),%%xmm2\n"
+  "movdqa     0x30(%0),%%xmm3\n"
+  "pavgb      (%0,%4,1),%%xmm0\n"
+  "pavgb      0x10(%0,%4,1),%%xmm1\n"
+  "pavgb      0x20(%0,%4,1),%%xmm2\n"
+  "pavgb      0x30(%0,%4,1),%%xmm3\n"
+  "lea        0x40(%0),%0\n"
+  "movdqa     %%xmm0,%%xmm4\n"
+  "shufps     $0x88,%%xmm1,%%xmm0\n"
+  "shufps     $0xdd,%%xmm1,%%xmm4\n"
+  "pavgb      %%xmm4,%%xmm0\n"
+  "movdqa     %%xmm2,%%xmm4\n"
+  "shufps     $0x88,%%xmm3,%%xmm2\n"
+  "shufps     $0xdd,%%xmm3,%%xmm4\n"
+  "pavgb      %%xmm4,%%xmm2\n"
+  "movdqa     %%xmm0,%%xmm1\n"
+  "movdqa     %%xmm2,%%xmm3\n"
+  "pmaddubsw  %%xmm7,%%xmm0\n"
+  "pmaddubsw  %%xmm7,%%xmm2\n"
+  "pmaddubsw  %%xmm6,%%xmm1\n"
+  "pmaddubsw  %%xmm6,%%xmm3\n"
+  "phaddw     %%xmm2,%%xmm0\n"
+  "phaddw     %%xmm3,%%xmm1\n"
+  "psraw      $0x8,%%xmm0\n"
+  "psraw      $0x8,%%xmm1\n"
+  "packsswb   %%xmm1,%%xmm0\n"
+  "paddb      %%xmm5,%%xmm0\n"
+  "movlps     %%xmm0,(%1)\n"
+  "movhps     %%xmm0,(%1,%2,1)\n"
+  "lea        0x8(%1),%1\n"
+  "sub        $0x10,%3\n"
+  "ja         1b\n"
+  : "+r"(src_argb0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"(static_cast<intptr_t>(src_stride_argb)), // %4
+    "m"(kARGBToU),         // %5
+    "m"(kARGBToV),         // %6
+    "m"(kAddUV128)         // %7
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+);
+}
+#endif
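+
+// ARGBToUVRow_SSSE3, sketched: pavgb against (%0,%4,1) averages each pixel
+// with the one in the next row, and the shufps/pavgb pair then averages
+// horizontally adjacent pixels, so kARGBToU/kARGBToV see a 2x2 box average
+// and one U,V pair is written per 2x2 block. After "sub %1,%2", %2 holds
+// dst_v - dst_u, so the movhps store to (%1,%2,1) lands in dst_v.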
+
+// The following code requires 6 registers and prefers 7 registers.
+// Using 7 registers requires -fpic to be off and -fomit-frame-pointer to be on.
+#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2
 #if defined(__x86_64__)
+#define REG_a "rax"
+#define REG_d "rdx"
+#else
+#define REG_a "eax"
+#define REG_d "edx"
+#endif
+#if defined(__APPLE__) || defined(__x86_64__)
+#define OMITFP
+#else
+#define OMITFP __attribute__((optimize("omit-frame-pointer")))
+#endif
 
-// 64 bit linux gcc version
+#if defined(__APPLE__)
+// The REG6 version uses one fewer register but is slower.
+#define REG6
+#endif
 
-void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
-                              const uint8* u_buf,  // rsi
-                              const uint8* v_buf,  // rdx
-                              uint8* rgb_buf,      // rcx
-                              int width) {         // r8
-  asm volatile(
-"1:"
-  "movzb  (%1),%%r10\n"
-  "lea    1(%1),%1\n"
-  "movzb  (%2),%%r11\n"
-  "lea    1(%2),%2\n"
-  "movq   2048(%5,%%r10,8),%%xmm0\n"
-  "movzb  (%0),%%r10\n"
-  "movq   4096(%5,%%r11,8),%%xmm1\n"
-  "movzb  0x1(%0),%%r11\n"
-  "paddsw %%xmm1,%%xmm0\n"
-  "movq   (%5,%%r10,8),%%xmm2\n"
-  "lea    2(%0),%0\n"
-  "movq   (%5,%%r11,8),%%xmm3\n"
-  "paddsw %%xmm0,%%xmm2\n"
-  "paddsw %%xmm0,%%xmm3\n"
-  "shufps $0x44,%%xmm3,%%xmm2\n"
-  "psraw  $0x6,%%xmm2\n"
-  "packuswb %%xmm2,%%xmm2\n"
-  "movq   %%xmm2,0x0(%3)\n"
-  "lea    8(%3),%3\n"
-  "sub    $0x2,%4\n"
+#ifdef REG6
+// The 6-register version has only REG_a available as a temporary.
+#define CLOBBER "%"REG_a
+#define YUVTORGB                                                               \
+ "1:"                                                                          \
+  "movzb  (%1),%%"REG_a"\n"                                                    \
+  "lea    1(%1),%1\n"                                                          \
+  "movq   2048(%5,%%"REG_a",8),%%xmm0\n"                                       \
+  "movzb  (%2),%%"REG_a"\n"                                                    \
+  "lea    1(%2),%2\n"                                                          \
+  "movq   4096(%5,%%"REG_a",8),%%xmm1\n"                                       \
+  "paddsw %%xmm1,%%xmm0\n"                                                     \
+  "movzb  (%0),%%"REG_a"\n"                                                    \
+  "movq   0(%5,%%"REG_a",8),%%xmm2\n"                                          \
+  "movzb  0x1(%0),%%"REG_a"\n"                                                 \
+  "movq   0(%5,%%"REG_a",8),%%xmm3\n"                                          \
+  "lea    2(%0),%0\n"                                                          \
+  "paddsw %%xmm0,%%xmm2\n"                                                     \
+  "paddsw %%xmm0,%%xmm3\n"                                                     \
+  "shufps $0x44,%%xmm3,%%xmm2\n"                                               \
+  "psraw  $0x6,%%xmm2\n"                                                       \
+  "packuswb %%xmm2,%%xmm2\n"                                                   \
+  "movq   %%xmm2,0x0(%3)\n"                                                    \
+  "lea    8(%3),%3\n"                                                          \
+  "sub    $0x2,%4\n"                                                           \
   "ja     1b\n"
+#else
+#define CLOBBER "%"REG_a, "%"REG_d
+// This version produces 2 pixels per loop iteration.
+#define YUVTORGB                                                               \
+"1:"                                                                           \
+  "movzb      (%1),%%"REG_a"\n"                                                \
+  "lea        1(%1),%1\n"                                                      \
+  "movzb      (%2),%%"REG_d"\n"                                                \
+  "lea        1(%2),%2\n"                                                      \
+  "movq       2048(%5,%%"REG_a",8),%%xmm0\n"                                   \
+  "movzb      0(%0),%%"REG_a"\n"                                               \
+  "movq       4096(%5,%%"REG_d",8),%%xmm1\n"                                   \
+  "paddsw     %%xmm1,%%xmm0\n"                                                 \
+  "movzb      1(%0),%%"REG_d"\n"                                               \
+  "punpcklqdq %%xmm0,%%xmm0\n"                                                 \
+  "lea        2(%0),%0\n"                                                      \
+  "movq       0(%5,%%"REG_a",8),%%xmm1\n"                                      \
+  "movhps     0(%5,%%"REG_d",8),%%xmm1\n"                                      \
+  "paddsw     %%xmm0,%%xmm1\n"                                                 \
+  "psraw      $6,%%xmm1\n"                                                     \
+  "packuswb   %%xmm1,%%xmm1\n"                                                 \
+  "movq       %%xmm1,0(%3)\n"                                                  \
+  "lea        8(%3),%3\n"                                                      \
+  "sub        $0x2,%4\n"                                                       \
+  "ja         1b\n"
+// This version produces 4 pixels per loop iteration.
+#define YUVTORGB4                                                              \
+"1:"                                                                           \
+  "movzb      0(%1),%%"REG_a"\n"                                               \
+  "movzb      0(%2),%%"REG_d"\n"                                               \
+  "movq       2048(%5,%%"REG_a",8),%%xmm0\n"                                   \
+  "movzb      0(%0),%%"REG_a"\n"                                               \
+  "movq       4096(%5,%%"REG_d",8),%%xmm1\n"                                   \
+  "paddsw     %%xmm1,%%xmm0\n"                                                 \
+  "movzb      1(%0),%%"REG_d"\n"                                               \
+  "punpcklqdq %%xmm0,%%xmm0\n"                                                 \
+  "movq       0(%5,%%"REG_a",8),%%xmm2\n"                                      \
+  "movhps     0(%5,%%"REG_d",8),%%xmm2\n"                                      \
+  "paddsw     %%xmm0,%%xmm2\n"                                                 \
+  "psraw      $6,%%xmm2\n"                                                     \
+  "movzb      1(%1),%%"REG_a"\n"                                               \
+  "movzb      1(%2),%%"REG_d"\n"                                               \
+  "movq       2048(%5,%%"REG_a",8),%%xmm0\n"                                   \
+  "movzb      2(%0),%%"REG_a"\n"                                               \
+  "movq       4096(%5,%%"REG_d",8),%%xmm1\n"                                   \
+  "paddsw     %%xmm1,%%xmm0\n"                                                 \
+  "movzb      3(%0),%%"REG_d"\n"                                               \
+  "punpcklqdq %%xmm0,%%xmm0\n"                                                 \
+  "movq       0(%5,%%"REG_a",8),%%xmm3\n"                                      \
+  "movhps     0(%5,%%"REG_d",8),%%xmm3\n"                                      \
+  "paddsw     %%xmm0,%%xmm3\n"                                                 \
+  "psraw      $6,%%xmm3\n"                                                     \
+  "lea        2(%1),%1\n"                                                      \
+  "lea        2(%2),%2\n"                                                      \
+  "lea        4(%0),%0\n"                                                      \
+  "packuswb   %%xmm3,%%xmm2\n"                                                 \
+  "movdqa     %%xmm2,0(%3)\n"                                                  \
+  "lea        16(%3),%3\n"                                                     \
+  "sub        $0x4,%4\n"                                                       \
+  "ja         1b\n"
+#endif
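+
+// The YUVTORGB macros are table driven: each kCoefficientsRgbY entry is
+// 4 x int16, with Y terms at rows [0..255], U terms at [256..511] (byte
+// offset 2048) and V terms at [512..767] (byte offset 4096). Per pixel,
+// roughly:
+//   int b = kCoefficientsRgbY[y][0] + kCoefficientsRgbY[256 + u][0]
+//         + kCoefficientsRgbY[512 + v][0];  // paddsw saturates in the asm
+//   dst[0] = clamp of (b >> 6) to [0, 255];  // likewise for g, r, a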
+
+// 6 or 7 registers
+void OMITFP FastConvertYUVToARGBRow_SSE2(const uint8* y_buf,  // rdi
+                                         const uint8* u_buf,  // rsi
+                                         const uint8* v_buf,  // rdx
+                                         uint8* rgb_buf,      // rcx
+                                         int width) {         // r8
+  asm volatile(
+    YUVTORGB
   : "+r"(y_buf),    // %0
     "+r"(u_buf),    // %1
     "+r"(v_buf),    // %2
     "+r"(rgb_buf),  // %3
-    "+r"(width)     // %4
-  : "r" (_kCoefficientsRgbY)  // %5
-  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
+    "+rm"(width)    // %4
+  : "r" (kCoefficientsRgbY)  // %5
+  : "memory", "cc", CLOBBER
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
 );
 }
 
-void FastConvertYUVToBGRARow(const uint8* y_buf,  // rdi
-                             const uint8* u_buf,  // rsi
-                             const uint8* v_buf,  // rdx
-                             uint8* rgb_buf,      // rcx
-                             int width) {         // r8
+// 6 or 7 registers
+void OMITFP FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf,  // rdi
+                                          const uint8* u_buf,  // rsi
+                                          const uint8* v_buf,  // rdx
+                                          uint8* rgb_buf,      // rcx
+                                          int width) {         // r8
   asm volatile(
-"1:"
-  "movzb  (%1),%%r10\n"
-  "lea    1(%1),%1\n"
-  "movzb  (%2),%%r11\n"
-  "lea    1(%2),%2\n"
-  "movq   2048(%5,%%r10,8),%%xmm0\n"
-  "movzb  (%0),%%r10\n"
-  "movq   4096(%5,%%r11,8),%%xmm1\n"
-  "movzb  0x1(%0),%%r11\n"
-  "paddsw %%xmm1,%%xmm0\n"
-  "movq   (%5,%%r10,8),%%xmm2\n"
-  "lea    2(%0),%0\n"
-  "movq   (%5,%%r11,8),%%xmm3\n"
-  "paddsw %%xmm0,%%xmm2\n"
-  "paddsw %%xmm0,%%xmm3\n"
-  "shufps $0x44,%%xmm3,%%xmm2\n"
-  "psraw  $0x6,%%xmm2\n"
-  "packuswb %%xmm2,%%xmm2\n"
-  "movq   %%xmm2,0x0(%3)\n"
-  "lea    8(%3),%3\n"
-  "sub    $0x2,%4\n"
-  "ja     1b\n"
+    YUVTORGB4
   : "+r"(y_buf),    // %0
     "+r"(u_buf),    // %1
     "+r"(v_buf),    // %2
     "+r"(rgb_buf),  // %3
-    "+r"(width)     // %4
-  : "r" (_kCoefficientsBgraY)  // %5
-  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
+    "+rm"(width)    // %4
+  : "r" (kCoefficientsRgbY)  // %5
+  : "memory", "cc", CLOBBER
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
 );
 }
 
-void FastConvertYUVToABGRRow(const uint8* y_buf,  // rdi
-                             const uint8* u_buf,  // rsi
-                             const uint8* v_buf,  // rdx
-                             uint8* rgb_buf,      // rcx
-                             int width) {         // r8
+void OMITFP FastConvertYUVToBGRARow_SSE2(const uint8* y_buf,  // rdi
+                                         const uint8* u_buf,  // rsi
+                                         const uint8* v_buf,  // rdx
+                                         uint8* rgb_buf,      // rcx
+                                         int width) {         // r8
   asm volatile(
-"1:"
-  "movzb  (%1),%%r10\n"
-  "lea    1(%1),%1\n"
-  "movzb  (%2),%%r11\n"
-  "lea    1(%2),%2\n"
-  "movq   2048(%5,%%r10,8),%%xmm0\n"
-  "movzb  (%0),%%r10\n"
-  "movq   4096(%5,%%r11,8),%%xmm1\n"
-  "movzb  0x1(%0),%%r11\n"
-  "paddsw %%xmm1,%%xmm0\n"
-  "movq   (%5,%%r10,8),%%xmm2\n"
-  "lea    2(%0),%0\n"
-  "movq   (%5,%%r11,8),%%xmm3\n"
-  "paddsw %%xmm0,%%xmm2\n"
-  "paddsw %%xmm0,%%xmm3\n"
-  "shufps $0x44,%%xmm3,%%xmm2\n"
-  "psraw  $0x6,%%xmm2\n"
-  "packuswb %%xmm2,%%xmm2\n"
-  "movq   %%xmm2,0x0(%3)\n"
-  "lea    8(%3),%3\n"
-  "sub    $0x2,%4\n"
-  "ja     1b\n"
+    YUVTORGB
   : "+r"(y_buf),    // %0
     "+r"(u_buf),    // %1
     "+r"(v_buf),    // %2
     "+r"(rgb_buf),  // %3
-    "+r"(width)     // %4
-  : "r" (_kCoefficientsAbgrY)  // %5
-  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
+    "+rm"(width)    // %4
+  : "r" (kCoefficientsBgraY)  // %5
+  : "memory", "cc", CLOBBER
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
 );
 }
 
-void FastConvertYUV444ToRGB32Row(const uint8* y_buf,  // rdi
-                                 const uint8* u_buf,  // rsi
-                                 const uint8* v_buf,  // rdx
-                                 uint8* rgb_buf,      // rcx
-                                 int width) {         // r8
+void OMITFP FastConvertYUVToABGRRow_SSE2(const uint8* y_buf,  // rdi
+                                         const uint8* u_buf,  // rsi
+                                         const uint8* v_buf,  // rdx
+                                         uint8* rgb_buf,      // rcx
+                                         int width) {         // r8
+  asm volatile(
+    YUVTORGB
+  : "+r"(y_buf),    // %0
+    "+r"(u_buf),    // %1
+    "+r"(v_buf),    // %2
+    "+r"(rgb_buf),  // %3
+    "+rm"(width)    // %4
+  : "r" (kCoefficientsAbgrY)  // %5
+  : "memory", "cc", CLOBBER
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+);
+}
+
+// 6 registers
+void OMITFP FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf,  // rdi
+                                            const uint8* u_buf,  // rsi
+                                            const uint8* v_buf,  // rdx
+                                            uint8* rgb_buf,      // rcx
+                                            int width) {         // r8
   asm volatile(
 "1:"
-  "movzb  (%1),%%r10\n"
+  "movzb  (%1),%%"REG_a"\n"
   "lea    1(%1),%1\n"
-  "movzb  (%2),%%r11\n"
+  "movq   2048(%5,%%"REG_a",8),%%xmm0\n"
+  "movzb  (%2),%%"REG_a"\n"
   "lea    1(%2),%2\n"
-  "movq   2048(%5,%%r10,8),%%xmm0\n"
-  "movzb  (%0),%%r10\n"
-  "movq   4096(%5,%%r11,8),%%xmm1\n"
+  "movq   4096(%5,%%"REG_a",8),%%xmm1\n"
   "paddsw %%xmm1,%%xmm0\n"
-  "movq   (%5,%%r10,8),%%xmm2\n"
+  "movzb  (%0),%%"REG_a"\n"
   "lea    1(%0),%0\n"
+  "movq   0(%5,%%"REG_a",8),%%xmm2\n"
   "paddsw %%xmm0,%%xmm2\n"
   "shufps $0x44,%%xmm2,%%xmm2\n"
   "psraw  $0x6,%%xmm2\n"
@@ -288,23 +530,26 @@
     "+r"(u_buf),    // %1
     "+r"(v_buf),    // %2
     "+r"(rgb_buf),  // %3
-    "+r"(width)     // %4
-  : "r" (_kCoefficientsRgbY)  // %5
-  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2"
+    "+rm"(width)    // %4
+  : "r" (kCoefficientsRgbY)  // %5
+  : "memory", "cc", "%"REG_a
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2"
+#endif
 );
 }
 
-void FastConvertYToRGB32Row(const uint8* y_buf,  // rdi
-                            uint8* rgb_buf,      // rcx
-                            int width) {         // r8
+// 5 registers
+void FastConvertYToARGBRow_SSE2(const uint8* y_buf,  // rdi
+                                uint8* rgb_buf,      // rcx
+                                int width) {         // r8
   asm volatile(
 "1:"
-  "movzb  (%0),%%r10\n"
-  "movzb  0x1(%0),%%r11\n"
-  "movq   (%3,%%r10,8),%%xmm2\n"
+  "movzb  (%0),%%"REG_a"\n"
+  "movzb  0x1(%0),%%"REG_d"\n"
+  "movq   (%3,%%"REG_a",8),%%xmm2\n"
   "lea    2(%0),%0\n"
-  "movq   (%3,%%r11,8),%%xmm3\n"
-  "shufps $0x44,%%xmm3,%%xmm2\n"
+  "movhps (%3,%%"REG_d",8),%%xmm2\n"
   "psraw  $0x6,%%xmm2\n"
   "packuswb %%xmm2,%%xmm2\n"
   "movq   %%xmm2,0x0(%1)\n"
@@ -313,154 +558,27 @@
   "ja     1b\n"
   : "+r"(y_buf),    // %0
     "+r"(rgb_buf),  // %1
-    "+r"(width)     // %2
-  : "r" (_kCoefficientsRgbY)  // %3
-  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
+    "+rm"(width)    // %2
+  : "r" (kCoefficientsRgbY)  // %3
+  : "memory", "cc", "%"REG_a, "%"REG_d
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2"
+#endif
 );
 }
 
-#elif defined(__i386__)
-// 32 bit gcc version
-
-void FastConvertYUVToRGB32Row(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              uint8* rgb_buf,
-                              int width);
-  asm(
-  ".text\n"
-#if defined(OSX) || defined(IOS)
-  ".globl _FastConvertYUVToRGB32Row\n"
-"_FastConvertYUVToRGB32Row:\n"
-#else
-  ".global FastConvertYUVToRGB32Row\n"
-"FastConvertYUVToRGB32Row:\n"
 #endif
-  "pusha\n"
-  "mov    0x24(%esp),%edx\n"
-  "mov    0x28(%esp),%edi\n"
-  "mov    0x2c(%esp),%esi\n"
-  "mov    0x30(%esp),%ebp\n"
-  "mov    0x34(%esp),%ecx\n"
 
-"1:"
-  "movzbl (%edi),%eax\n"
-  "lea    1(%edi),%edi\n"
-  "movzbl (%esi),%ebx\n"
-  "lea    1(%esi),%esi\n"
-  "movq   _kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
-  "movzbl (%edx),%eax\n"
-  "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
-  "movzbl 0x1(%edx),%ebx\n"
-  "movq   _kCoefficientsRgbY(,%eax,8),%mm1\n"
-  "lea    2(%edx),%edx\n"
-  "movq   _kCoefficientsRgbY(,%ebx,8),%mm2\n"
-  "paddsw %mm0,%mm1\n"
-  "paddsw %mm0,%mm2\n"
-  "psraw  $0x6,%mm1\n"
-  "psraw  $0x6,%mm2\n"
-  "packuswb %mm2,%mm1\n"
-  "movntq %mm1,0x0(%ebp)\n"
-  "lea    8(%ebp),%ebp\n"
-  "sub    $0x2,%ecx\n"
-  "ja     1b\n"
-  "popa\n"
-  "ret\n"
-);
+#ifdef HAS_FASTCONVERTYUVTOARGBROW_MMX
+// 32-bit MMX gcc version
 
-void FastConvertYUVToBGRARow(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              uint8* rgb_buf,
-                              int width);
-  asm(
-  ".text\n"
-#if defined(OSX) || defined(IOS)
-  ".globl _FastConvertYUVToBGRARow\n"
-"_FastConvertYUVToBGRARow:\n"
+#ifdef OSX
+#define UNDERSCORE "_"
 #else
-  ".global FastConvertYUVToBGRARow\n"
-"FastConvertYUVToBGRARow:\n"
+#define UNDERSCORE ""
 #endif
-  "pusha\n"
-  "mov    0x24(%esp),%edx\n"
-  "mov    0x28(%esp),%edi\n"
-  "mov    0x2c(%esp),%esi\n"
-  "mov    0x30(%esp),%ebp\n"
-  "mov    0x34(%esp),%ecx\n"
 
-"1:"
-  "movzbl (%edi),%eax\n"
-  "lea    1(%edi),%edi\n"
-  "movzbl (%esi),%ebx\n"
-  "lea    1(%esi),%esi\n"
-  "movq   _kCoefficientsBgraY+2048(,%eax,8),%mm0\n"
-  "movzbl (%edx),%eax\n"
-  "paddsw _kCoefficientsBgraY+4096(,%ebx,8),%mm0\n"
-  "movzbl 0x1(%edx),%ebx\n"
-  "movq   _kCoefficientsBgraY(,%eax,8),%mm1\n"
-  "lea    2(%edx),%edx\n"
-  "movq   _kCoefficientsBgraY(,%ebx,8),%mm2\n"
-  "paddsw %mm0,%mm1\n"
-  "paddsw %mm0,%mm2\n"
-  "psraw  $0x6,%mm1\n"
-  "psraw  $0x6,%mm2\n"
-  "packuswb %mm2,%mm1\n"
-  "movntq %mm1,0x0(%ebp)\n"
-  "lea    8(%ebp),%ebp\n"
-  "sub    $0x2,%ecx\n"
-  "ja     1b\n"
-  "popa\n"
-  "ret\n"
-);
-
-void FastConvertYUVToABGRRow(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              uint8* rgb_buf,
-                              int width);
-  asm(
-  ".text\n"
-#if defined(OSX) || defined(IOS)
-  ".globl _FastConvertYUVToABGRRow\n"
-"_FastConvertYUVToABGRRow:\n"
-#else
-  ".global FastConvertYUVToABGRRow\n"
-"FastConvertYUVToABGRRow:\n"
-#endif
-  "pusha\n"
-  "mov    0x24(%esp),%edx\n"
-  "mov    0x28(%esp),%edi\n"
-  "mov    0x2c(%esp),%esi\n"
-  "mov    0x30(%esp),%ebp\n"
-  "mov    0x34(%esp),%ecx\n"
-
-"1:"
-  "movzbl (%edi),%eax\n"
-  "lea    1(%edi),%edi\n"
-  "movzbl (%esi),%ebx\n"
-  "lea    1(%esi),%esi\n"
-  "movq   _kCoefficientsAbgrY+2048(,%eax,8),%mm0\n"
-  "movzbl (%edx),%eax\n"
-  "paddsw _kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n"
-  "movzbl 0x1(%edx),%ebx\n"
-  "movq   _kCoefficientsAbgrY(,%eax,8),%mm1\n"
-  "lea    2(%edx),%edx\n"
-  "movq   _kCoefficientsAbgrY(,%ebx,8),%mm2\n"
-  "paddsw %mm0,%mm1\n"
-  "paddsw %mm0,%mm2\n"
-  "psraw  $0x6,%mm1\n"
-  "psraw  $0x6,%mm2\n"
-  "packuswb %mm2,%mm1\n"
-  "movntq %mm1,0x0(%ebp)\n"
-  "lea    8(%ebp),%ebp\n"
-  "sub    $0x2,%ecx\n"
-  "ja     1b\n"
-  "popa\n"
-  "ret\n"
-);
-
-void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
+void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
                                  const uint8* u_buf,
                                  const uint8* v_buf,
                                  uint8* rgb_buf,
@@ -468,11 +586,11 @@
   asm(
   ".text\n"
 #if defined(OSX) || defined(IOS)
-  ".globl _FastConvertYUV444ToRGB32Row\n"
-"_FastConvertYUV444ToRGB32Row:\n"
+  ".globl _FastConvertYUVToARGBRow_MMX\n"
+"_FastConvertYUVToARGBRow_MMX:\n"
 #else
-  ".global FastConvertYUV444ToRGB32Row\n"
-"FastConvertYUV444ToRGB32Row:\n"
+  ".global FastConvertYUVToARGBRow_MMX\n"
+"FastConvertYUVToARGBRow_MMX:\n"
 #endif
   "pusha\n"
   "mov    0x24(%esp),%edx\n"
@@ -486,11 +604,149 @@
   "lea    1(%edi),%edi\n"
   "movzbl (%esi),%ebx\n"
   "lea    1(%esi),%esi\n"
-  "movq   _kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+  "movq   " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
   "movzbl (%edx),%eax\n"
-  "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
+  "paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
+  "movzbl 0x1(%edx),%ebx\n"
+  "movq   " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm1\n"
+  "lea    2(%edx),%edx\n"
+  "movq   " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm2\n"
+  "paddsw %mm0,%mm1\n"
+  "paddsw %mm0,%mm2\n"
+  "psraw  $0x6,%mm1\n"
+  "psraw  $0x6,%mm2\n"
+  "packuswb %mm2,%mm1\n"
+  "movq   %mm1,0x0(%ebp)\n"
+  "lea    8(%ebp),%ebp\n"
+  "sub    $0x2,%ecx\n"
+  "ja     1b\n"
+  "popa\n"
+  "ret\n"
+);
+
+void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
+                                 const uint8* u_buf,
+                                 const uint8* v_buf,
+                                 uint8* rgb_buf,
+                                 int width);
+  asm(
+  ".text\n"
+#if defined(OSX) || defined(IOS)
+  ".globl _FastConvertYUVToBGRARow_MMX\n"
+"_FastConvertYUVToBGRARow_MMX:\n"
+#else
+  ".global FastConvertYUVToBGRARow_MMX\n"
+"FastConvertYUVToBGRARow_MMX:\n"
+#endif
+  "pusha\n"
+  "mov    0x24(%esp),%edx\n"
+  "mov    0x28(%esp),%edi\n"
+  "mov    0x2c(%esp),%esi\n"
+  "mov    0x30(%esp),%ebp\n"
+  "mov    0x34(%esp),%ecx\n"
+
+"1:"
+  "movzbl (%edi),%eax\n"
+  "lea    1(%edi),%edi\n"
+  "movzbl (%esi),%ebx\n"
+  "lea    1(%esi),%esi\n"
+  "movq   " UNDERSCORE "kCoefficientsBgraY+2048(,%eax,8),%mm0\n"
+  "movzbl (%edx),%eax\n"
+  "paddsw " UNDERSCORE "kCoefficientsBgraY+4096(,%ebx,8),%mm0\n"
+  "movzbl 0x1(%edx),%ebx\n"
+  "movq   " UNDERSCORE "kCoefficientsBgraY(,%eax,8),%mm1\n"
+  "lea    2(%edx),%edx\n"
+  "movq   " UNDERSCORE "kCoefficientsBgraY(,%ebx,8),%mm2\n"
+  "paddsw %mm0,%mm1\n"
+  "paddsw %mm0,%mm2\n"
+  "psraw  $0x6,%mm1\n"
+  "psraw  $0x6,%mm2\n"
+  "packuswb %mm2,%mm1\n"
+  "movq   %mm1,0x0(%ebp)\n"
+  "lea    8(%ebp),%ebp\n"
+  "sub    $0x2,%ecx\n"
+  "ja     1b\n"
+  "popa\n"
+  "ret\n"
+);
+
+void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
+                                 const uint8* u_buf,
+                                 const uint8* v_buf,
+                                 uint8* rgb_buf,
+                                 int width);
+  asm(
+  ".text\n"
+#if defined(OSX) || defined(IOS)
+  ".globl _FastConvertYUVToABGRRow_MMX\n"
+"_FastConvertYUVToABGRRow_MMX:\n"
+#else
+  ".global FastConvertYUVToABGRRow_MMX\n"
+"FastConvertYUVToABGRRow_MMX:\n"
+#endif
+  "pusha\n"
+  "mov    0x24(%esp),%edx\n"
+  "mov    0x28(%esp),%edi\n"
+  "mov    0x2c(%esp),%esi\n"
+  "mov    0x30(%esp),%ebp\n"
+  "mov    0x34(%esp),%ecx\n"
+
+"1:"
+  "movzbl (%edi),%eax\n"
+  "lea    1(%edi),%edi\n"
+  "movzbl (%esi),%ebx\n"
+  "lea    1(%esi),%esi\n"
+  "movq   " UNDERSCORE "kCoefficientsAbgrY+2048(,%eax,8),%mm0\n"
+  "movzbl (%edx),%eax\n"
+  "paddsw " UNDERSCORE "kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n"
+  "movzbl 0x1(%edx),%ebx\n"
+  "movq   " UNDERSCORE "kCoefficientsAbgrY(,%eax,8),%mm1\n"
+  "lea    2(%edx),%edx\n"
+  "movq   " UNDERSCORE "kCoefficientsAbgrY(,%ebx,8),%mm2\n"
+  "paddsw %mm0,%mm1\n"
+  "paddsw %mm0,%mm2\n"
+  "psraw  $0x6,%mm1\n"
+  "psraw  $0x6,%mm2\n"
+  "packuswb %mm2,%mm1\n"
+  "movq   %mm1,0x0(%ebp)\n"
+  "lea    8(%ebp),%ebp\n"
+  "sub    $0x2,%ecx\n"
+  "ja     1b\n"
+  "popa\n"
+  "ret\n"
+);
+
+void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
+                                    const uint8* u_buf,
+                                    const uint8* v_buf,
+                                    uint8* rgb_buf,
+                                    int width);
+  asm(
+  ".text\n"
+#if defined(OSX) || defined(IOS)
+  ".globl _FastConvertYUV444ToARGBRow_MMX\n"
+"_FastConvertYUV444ToARGBRow_MMX:\n"
+#else
+  ".global FastConvertYUV444ToARGBRow_MMX\n"
+"FastConvertYUV444ToARGBRow_MMX:\n"
+#endif
+  "pusha\n"
+  "mov    0x24(%esp),%edx\n"
+  "mov    0x28(%esp),%edi\n"
+  "mov    0x2c(%esp),%esi\n"
+  "mov    0x30(%esp),%ebp\n"
+  "mov    0x34(%esp),%ecx\n"
+
+"1:"
+  "movzbl (%edi),%eax\n"
+  "lea    1(%edi),%edi\n"
+  "movzbl (%esi),%ebx\n"
+  "lea    1(%esi),%esi\n"
+  "movq   " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
+  "movzbl (%edx),%eax\n"
+  "paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
   "lea    1(%edx),%edx\n"
-  "paddsw _kCoefficientsRgbY(,%eax,8),%mm0\n"
+  "paddsw " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm0\n"
   "psraw  $0x6,%mm0\n"
   "packuswb %mm0,%mm0\n"
   "movd   %mm0,0x0(%ebp)\n"
@@ -501,17 +757,17 @@
   "ret\n"
 );
 
-void FastConvertYToRGB32Row(const uint8* y_buf,
-                            uint8* rgb_buf,
-                            int width);
+void FastConvertYToARGBRow_MMX(const uint8* y_buf,
+                               uint8* rgb_buf,
+                               int width);
   asm(
   ".text\n"
 #if defined(OSX) || defined(IOS)
-  ".globl _FastConvertYToRGB32Row\n"
-"_FastConvertYToRGB32Row:\n"
+  ".globl _FastConvertYToARGBRow_MMX\n"
+"_FastConvertYToARGBRow_MMX:\n"
 #else
-  ".global FastConvertYToRGB32Row\n"
-"FastConvertYToRGB32Row:\n"
+  ".global FastConvertYToARGBRow_MMX\n"
+"FastConvertYToARGBRow_MMX:\n"
 #endif
   "push   %ebx\n"
   "mov    0x8(%esp),%eax\n"
@@ -520,10 +776,10 @@
 
 "1:"
   "movzbl (%eax),%ebx\n"
-  "movq   _kCoefficientsRgbY(,%ebx,8),%mm0\n"
+  "movq   " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm0\n"
   "psraw  $0x6,%mm0\n"
   "movzbl 0x1(%eax),%ebx\n"
-  "movq   _kCoefficientsRgbY(,%ebx,8),%mm1\n"
+  "movq   " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm1\n"
   "psraw  $0x6,%mm1\n"
   "packuswb %mm1,%mm0\n"
   "lea    0x2(%eax),%eax\n"
@@ -535,125 +791,36 @@
   "ret\n"
 );
 
-#else
-// C reference code that mimic the YUV assembly.
-#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
-#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
-    (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
+#endif
 
-static inline void YuvPixel(uint8 y,
-                            uint8 u,
-                            uint8 v,
-                            uint8* rgb_buf,
-                            int ashift,
-                            int rshift,
-                            int gshift,
-                            int bshift) {
-
-  int b = _kCoefficientsRgbY[256+u][0];
-  int g = _kCoefficientsRgbY[256+u][1];
-  int r = _kCoefficientsRgbY[256+u][2];
-  int a = _kCoefficientsRgbY[256+u][3];
-
-  b = paddsw(b, _kCoefficientsRgbY[512+v][0]);
-  g = paddsw(g, _kCoefficientsRgbY[512+v][1]);
-  r = paddsw(r, _kCoefficientsRgbY[512+v][2]);
-  a = paddsw(a, _kCoefficientsRgbY[512+v][3]);
-
-  b = paddsw(b, _kCoefficientsRgbY[y][0]);
-  g = paddsw(g, _kCoefficientsRgbY[y][1]);
-  r = paddsw(r, _kCoefficientsRgbY[y][2]);
-  a = paddsw(a, _kCoefficientsRgbY[y][3]);
-
-  b >>= 6;
-  g >>= 6;
-  r >>= 6;
-  a >>= 6;
-
-  *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b) << bshift) |
-                                        (packuswb(g) << gshift) |
-                                        (packuswb(r) << rshift) |
-                                        (packuswb(a) << ashift);
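+// The ABGR/BGRA helpers below reuse the ARGB kernels: each input row is
+// shuffled into ARGB in an aligned scratch row first, then fed through
+// ARGBToYRow_SSSE3 / ARGBToUVRow_SSSE3.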
+void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  SIMD_ALIGNED(uint8 row[kMaxStride]);
+  ABGRToARGBRow_SSSE3(src_argb, row, pix);
+  ARGBToYRow_SSSE3(row, dst_y, pix);
 }
 
-void FastConvertYUVToRGB32Row(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              uint8* rgb_buf,
-                              int width) {
-  for (int x = 0; x < width; x += 2) {
-    uint8 u = u_buf[x >> 1];
-    uint8 v = v_buf[x >> 1];
-    uint8 y0 = y_buf[x];
-    YuvPixel(y0, u, v, rgb_buf, 24, 16, 8, 0);
-    if ((x + 1) < width) {
-      uint8 y1 = y_buf[x + 1];
-      YuvPixel(y1, u, v, rgb_buf + 4, 24, 16, 8, 0);
-    }
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
+void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  SIMD_ALIGNED(uint8 row[kMaxStride]);
+  BGRAToARGBRow_SSSE3(src_argb, row, pix);
+  ARGBToYRow_SSSE3(row, dst_y, pix);
 }
 
-void FastConvertYUVToBGRARow(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             uint8* rgb_buf,
-                             int width) {
-  for (int x = 0; x < width; x += 2) {
-    uint8 u = u_buf[x >> 1];
-    uint8 v = v_buf[x >> 1];
-    uint8 y0 = y_buf[x];
-    YuvPixel(y0, u, v, rgb_buf, 0, 8, 16, 24);
-    if ((x + 1) < width) {
-      uint8 y1 = y_buf[x + 1];
-      YuvPixel(y1, u, v, rgb_buf + 4, 0, 8, 16, 24);
-    }
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
+#ifdef HAS_ARGBTOUVROW_SSSE3
+void ABGRToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int pix) {
+  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+  ABGRToARGBRow_SSSE3(src_argb, row, pix);
+  ABGRToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
+  ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
 }
 
-void FastConvertYUVToABGRRow(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             uint8* rgb_buf,
-                             int width) {
-  for (int x = 0; x < width; x += 2) {
-    uint8 u = u_buf[x >> 1];
-    uint8 v = v_buf[x >> 1];
-    uint8 y0 = y_buf[x];
-    YuvPixel(y0, u, v, rgb_buf, 24, 0, 8, 16);
-    if ((x + 1) < width) {
-      uint8 y1 = y_buf[x + 1];
-      YuvPixel(y1, u, v, rgb_buf + 4, 24, 0, 8, 16);
-    }
-    rgb_buf += 8;  // Advance 2 pixels.
-  }
+void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int pix) {
+  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+  BGRAToARGBRow_SSSE3(src_argb, row, pix);
+  BGRAToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
+  ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
 }
-
-void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* rgb_buf,
-                                 int width) {
-  for (int x = 0; x < width; ++x) {
-    uint8 u = u_buf[x];
-    uint8 v = v_buf[x];
-    uint8 y = y_buf[x];
-    YuvPixel(y, u, v, rgb_buf, 24, 16, 8, 0);
-    rgb_buf += 4;  // Advance 1 pixel.
-  }
-}
-
-void FastConvertYToRGB32Row(const uint8* y_buf,
-                            uint8* rgb_buf,
-                            int width) {
-  for (int x = 0; x < width; ++x) {
-    uint8 y = y_buf[x];
-    YuvPixel(y, 128, 128, rgb_buf, 24, 16, 8, 0);
-    rgb_buf += 4;  // Advance 1 pixel.
-  }
-}
-
 #endif
 
 }  // extern "C"