Add direct conversion from NV12 and NV21 to ARGB: new NV12ToARGBRow_SSSE3 / NV21ToARGBRow_SSSE3 row functions (aligned and unaligned variants), and switch the YUV-to-RGB row functions to named inline-asm operands.
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/645004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@281 16f28f9a-4ce2-e073-06de-1de4eb20be90
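
An illustrative sketch (not part of this patch) of driving the new row function over a whole NV12 frame. It assumes width is a multiple of 8 and that the ARGB pointer and stride are 16-byte aligned, since the non-_Unaligned variant stores with movdqa; the wrapper name NV12ToARGBFrame_SSSE3 and the include path are hypothetical.

#include "row.h"  // declares NV12ToARGBRow_SSSE3 and uint8 (path per this tree's layout)

// Hypothetical helper: walk an NV12 frame row by row with the
// NV12ToARGBRow_SSSE3 kernel added in this change.
static void NV12ToARGBFrame_SSSE3(const uint8* src_y, int src_stride_y,
                                  const uint8* src_uv, int src_stride_uv,
                                  uint8* dst_argb, int dst_stride_argb,
                                  int width, int height) {
  for (int y = 0; y < height; ++y) {
    NV12ToARGBRow_SSSE3(src_y, src_uv, dst_argb, width);
    src_y += src_stride_y;
    dst_argb += dst_stride_argb;
    // NV12 stores one interleaved UV row per two Y rows, so advance the
    // UV pointer only after odd rows.
    if (y & 1) {
      src_uv += src_stride_uv;
    }
  }
}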
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 28f10c0..6b4af08 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -1231,14 +1231,17 @@
 #define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
 
 struct {
-  vec8 kUVToB;
-  vec8 kUVToG;
-  vec8 kUVToR;
-  vec16 kUVBiasB;
-  vec16 kUVBiasG;
-  vec16 kUVBiasR;
-  vec16 kYSub16;
-  vec16 kYToRgb;
+  vec8 kUVToB;  // 0
+  vec8 kUVToG;  // 16
+  vec8 kUVToR;  // 32
+  vec16 kUVBiasB;  // 48
+  vec16 kUVBiasG;  // 64
+  vec16 kUVBiasR;  // 80
+  vec16 kYSub16;  // 96
+  vec16 kYToRgb;  // 112
+  vec8 kVUToB;  // 128
+  vec8 kVUToG;  // 144
+  vec8 kVUToR;  // 160
 } CONST SIMD_ALIGNED(kYuvConstants) = {
   { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
   { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
@@ -1247,48 +1250,58 @@
   { BG, BG, BG, BG, BG, BG, BG, BG },
   { BR, BR, BR, BR, BR, BR, BR, BR },
   { 16, 16, 16, 16, 16, 16, 16, 16 },
-  { YG, YG, YG, YG, YG, YG, YG, YG }
+  { YG, YG, YG, YG, YG, YG, YG, YG },
+  { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
+  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+  { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
 };
 
+
 // Read 8 UV from 444
 #define READYUV444                                                             \
-    "movq       (%1),%%xmm0                    \n"                             \
-    "movq       (%1,%2,1),%%xmm1               \n"                             \
-    "lea        0x8(%1),%1                     \n"                             \
+    "movq       (%[u_buf]),%%xmm0              \n"                             \
+    "movq       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
+    "lea        0x8(%[u_buf]),%[u_buf]         \n"                             \
     "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
 
 // Read 4 UV from 422, upsample to 8 UV
 #define READYUV422                                                             \
-    "movd       (%1),%%xmm0                    \n"                             \
-    "movd       (%1,%2,1),%%xmm1               \n"                             \
-    "lea        0x4(%1),%1                     \n"                             \
+    "movd       (%[u_buf]),%%xmm0              \n"                             \
+    "movd       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
+    "lea        0x4(%[u_buf]),%[u_buf]         \n"                             \
     "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
     "punpcklwd  %%xmm0,%%xmm0                  \n"                             \
 
 // Read 2 UV from 411, upsample to 8 UV
 #define READYUV411                                                             \
-    "movd       (%1),%%xmm0                    \n"                             \
-    "movd       (%1,%2,1),%%xmm1               \n"                             \
-    "lea        0x2(%1),%1                     \n"                             \
+    "movd       (%[u_buf]),%%xmm0              \n"                             \
+    "movd       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
+    "lea        0x2(%[u_buf]),%[u_buf]         \n"                             \
     "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
     "punpcklwd  %%xmm0,%%xmm0                  \n"                             \
     "punpckldq  %%xmm0,%%xmm0                  \n"                             \
 
+// Read 4 UV from NV12, upsample to 8 UV
+#define READNV12                                                               \
+    "movq       (%[uv_buf]),%%xmm0             \n"                             \
+    "lea        0x8(%[uv_buf]),%[uv_buf]       \n"                             \
+    "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
+
 // Convert 8 pixels: 8 UV and 8 Y
 #define YUVTORGB                                                               \
     "movdqa     %%xmm0,%%xmm1                  \n"                             \
     "movdqa     %%xmm0,%%xmm2                  \n"                             \
-    "pmaddubsw  (%5),%%xmm0                    \n"                             \
-    "pmaddubsw  16(%5),%%xmm1                  \n"                             \
-    "pmaddubsw  32(%5),%%xmm2                  \n"                             \
-    "psubw      48(%5),%%xmm0                  \n"                             \
-    "psubw      64(%5),%%xmm1                  \n"                             \
-    "psubw      80(%5),%%xmm2                  \n"                             \
-    "movq       (%0),%%xmm3                    \n"                             \
-    "lea        0x8(%0),%0                     \n"                             \
+    "pmaddubsw  (%[kYuvConstants]),%%xmm0      \n"                             \
+    "pmaddubsw  16(%[kYuvConstants]),%%xmm1    \n"                             \
+    "pmaddubsw  32(%[kYuvConstants]),%%xmm2    \n"                             \
+    "psubw      48(%[kYuvConstants]),%%xmm0    \n"                             \
+    "psubw      64(%[kYuvConstants]),%%xmm1    \n"                             \
+    "psubw      80(%[kYuvConstants]),%%xmm2    \n"                             \
+    "movq       (%[y_buf]),%%xmm3              \n"                             \
+    "lea        0x8(%[y_buf]),%[y_buf]         \n"                             \
     "punpcklbw  %%xmm4,%%xmm3                  \n"                             \
-    "psubsw     96(%5),%%xmm3                  \n"                             \
-    "pmullw     112(%5),%%xmm3                 \n"                             \
+    "psubsw     96(%[kYuvConstants]),%%xmm3    \n"                             \
+    "pmullw     112(%[kYuvConstants]),%%xmm3   \n"                             \
     "paddsw     %%xmm3,%%xmm0                  \n"                             \
     "paddsw     %%xmm3,%%xmm1                  \n"                             \
     "paddsw     %%xmm3,%%xmm2                  \n"                             \
@@ -1297,7 +1310,32 @@
     "psraw      $0x6,%%xmm2                    \n"                             \
     "packuswb   %%xmm0,%%xmm0                  \n"                             \
     "packuswb   %%xmm1,%%xmm1                  \n"                             \
-    "packuswb   %%xmm2,%%xmm2                  \n"
+    "packuswb   %%xmm2,%%xmm2                  \n"                             \
+
+// Convert 8 pixels: 8 VU and 8 Y
+#define YVUTORGB                                                               \
+    "movdqa     %%xmm0,%%xmm1                  \n"                             \
+    "movdqa     %%xmm0,%%xmm2                  \n"                             \
+    "pmaddubsw  128(%[kYuvConstants]),%%xmm0   \n"                             \
+    "pmaddubsw  144(%[kYuvConstants]),%%xmm1   \n"                             \
+    "pmaddubsw  160(%[kYuvConstants]),%%xmm2   \n"                             \
+    "psubw      48(%[kYuvConstants]),%%xmm0    \n"                             \
+    "psubw      64(%[kYuvConstants]),%%xmm1    \n"                             \
+    "psubw      80(%[kYuvConstants]),%%xmm2    \n"                             \
+    "movq       (%[y_buf]),%%xmm3              \n"                             \
+    "lea        0x8(%[y_buf]),%[y_buf]         \n"                             \
+    "punpcklbw  %%xmm4,%%xmm3                  \n"                             \
+    "psubsw     96(%[kYuvConstants]),%%xmm3    \n"                             \
+    "pmullw     112(%[kYuvConstants]),%%xmm3   \n"                             \
+    "paddsw     %%xmm3,%%xmm0                  \n"                             \
+    "paddsw     %%xmm3,%%xmm1                  \n"                             \
+    "paddsw     %%xmm3,%%xmm2                  \n"                             \
+    "psraw      $0x6,%%xmm0                    \n"                             \
+    "psraw      $0x6,%%xmm1                    \n"                             \
+    "psraw      $0x6,%%xmm2                    \n"                             \
+    "packuswb   %%xmm0,%%xmm0                  \n"                             \
+    "packuswb   %%xmm1,%%xmm1                  \n"                             \
+    "packuswb   %%xmm2,%%xmm2                  \n"                             \
 
 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                 const uint8* u_buf,
@@ -1305,7 +1343,7 @@
                                 uint8* argb_buf,
                                 int width) {
   asm volatile (
-    "sub       %1,%2                           \n"
+    "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "pxor      %%xmm4,%%xmm4                   \n"
     ".p2align  4                               \n"
@@ -1317,17 +1355,17 @@
     "movdqa    %%xmm0,%%xmm1                   \n"
     "punpcklwd %%xmm2,%%xmm0                   \n"
     "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0,(%3)                     \n"
-    "movdqa    %%xmm1,0x10(%3)                 \n"
-    "lea       0x20(%3),%3                     \n"
-    "sub       $0x8,%4                         \n"
+    "movdqa    %%xmm0,(%[argb_buf])            \n"
+    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
+    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
+    "sub       $0x8,%[width]                   \n"
     "jg        1b                              \n"
-  : "+r"(y_buf),    // %0
-    "+r"(u_buf),    // %1
-    "+r"(v_buf),    // %2
-    "+r"(argb_buf),  // %3
-    "+rm"(width)    // %4
-  : "r"(&kYuvConstants.kUVToB) // %5
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -1341,7 +1379,7 @@
                                 uint8* argb_buf,
                                 int width) {
   asm volatile (
-    "sub       %1,%2                           \n"
+    "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "pxor      %%xmm4,%%xmm4                   \n"
     ".p2align  4                               \n"
@@ -1353,17 +1391,17 @@
     "movdqa    %%xmm0,%%xmm1                   \n"
     "punpcklwd %%xmm2,%%xmm0                   \n"
     "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0,(%3)                     \n"
-    "movdqa    %%xmm1,0x10(%3)                 \n"
-    "lea       0x20(%3),%3                     \n"
-    "sub       $0x8,%4                         \n"
+    "movdqa    %%xmm0,(%[argb_buf])            \n"
+    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
+    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
+    "sub       $0x8,%[width]                   \n"
     "jg        1b                              \n"
-  : "+r"(y_buf),    // %0
-    "+r"(u_buf),    // %1
-    "+r"(v_buf),    // %2
-    "+r"(argb_buf),  // %3
-    "+rm"(width)    // %4
-  : "r"(&kYuvConstants.kUVToB) // %5
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -1377,7 +1415,7 @@
                                 uint8* argb_buf,
                                 int width) {
   asm volatile (
-    "sub       %1,%2                           \n"
+    "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "pxor      %%xmm4,%%xmm4                   \n"
     ".p2align  4                               \n"
@@ -1389,17 +1427,83 @@
     "movdqa    %%xmm0,%%xmm1                   \n"
     "punpcklwd %%xmm2,%%xmm0                   \n"
     "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0,(%3)                     \n"
-    "movdqa    %%xmm1,0x10(%3)                 \n"
-    "lea       0x20(%3),%3                     \n"
-    "sub       $0x8,%4                         \n"
+    "movdqa    %%xmm0,(%[argb_buf])            \n"
+    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
+    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
+    "sub       $0x8,%[width]                   \n"
     "jg        1b                              \n"
-  : "+r"(y_buf),    // %0
-    "+r"(u_buf),    // %1
-    "+r"(v_buf),    // %2
-    "+r"(argb_buf),  // %3
-    "+rm"(width)    // %4
-  : "r"(&kYuvConstants.kUVToB) // %5
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* uv_buf,
+                                uint8* argb_buf,
+                                int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    READNV12
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqa    %%xmm0,(%[argb_buf])            \n"
+    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
+    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
+    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* vu_buf,
+                                uint8* argb_buf,
+                                int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    READNV12
+    YVUTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqa    %%xmm0,(%[argb_buf])            \n"
+    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
+    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [uv_buf]"+r"(vu_buf),    // %[uv_buf]
+    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -1413,7 +1517,7 @@
                                           uint8* argb_buf,
                                           int width) {
   asm volatile (
-    "sub       %1,%2                           \n"
+    "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "pxor      %%xmm4,%%xmm4                   \n"
     ".p2align  4                               \n"
@@ -1425,17 +1529,17 @@
     "movdqa    %%xmm0,%%xmm1                   \n"
     "punpcklwd %%xmm2,%%xmm0                   \n"
     "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqu    %%xmm0,(%3)                     \n"
-    "movdqu    %%xmm1,0x10(%3)                 \n"
-    "lea       0x20(%3),%3                     \n"
-    "sub       $0x8,%4                         \n"
+    "movdqu    %%xmm0,(%[argb_buf])            \n"
+    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
+    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
+    "sub       $0x8,%[width]                   \n"
     "jg        1b                              \n"
-  : "+r"(y_buf),    // %0
-    "+r"(u_buf),    // %1
-    "+r"(v_buf),    // %2
-    "+r"(argb_buf),  // %3
-    "+rm"(width)    // %4
-  : "r"(&kYuvConstants.kUVToB) // %5
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -1449,7 +1553,7 @@
                                           uint8* argb_buf,
                                           int width) {
   asm volatile (
-    "sub       %1,%2                           \n"
+    "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "pxor      %%xmm4,%%xmm4                   \n"
     ".p2align  4                               \n"
@@ -1461,17 +1565,17 @@
     "movdqa    %%xmm0,%%xmm1                   \n"
     "punpcklwd %%xmm2,%%xmm0                   \n"
     "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqu    %%xmm0,(%3)                     \n"
-    "movdqu    %%xmm1,0x10(%3)                 \n"
-    "lea       0x20(%3),%3                     \n"
-    "sub       $0x8,%4                         \n"
+    "movdqu    %%xmm0,(%[argb_buf])            \n"
+    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
+    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
+    "sub       $0x8,%[width]                   \n"
     "jg        1b                              \n"
-  : "+r"(y_buf),    // %0
-    "+r"(u_buf),    // %1
-    "+r"(v_buf),    // %2
-    "+r"(argb_buf),  // %3
-    "+rm"(width)    // %4
-  : "r"(&kYuvConstants.kUVToB) // %5
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -1485,7 +1589,7 @@
                                           uint8* argb_buf,
                                           int width) {
   asm volatile (
-    "sub       %1,%2                           \n"
+    "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "pxor      %%xmm4,%%xmm4                   \n"
     ".p2align  4                               \n"
@@ -1497,17 +1601,83 @@
     "movdqa    %%xmm0,%%xmm1                   \n"
     "punpcklwd %%xmm2,%%xmm0                   \n"
     "punpckhwd %%xmm2,%%xmm1                   \n"
-    "movdqu    %%xmm0,(%3)                     \n"
-    "movdqu    %%xmm1,0x10(%3)                 \n"
-    "lea       0x20(%3),%3                     \n"
-    "sub       $0x8,%4                         \n"
+    "movdqu    %%xmm0,(%[argb_buf])            \n"
+    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
+    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
+    "sub       $0x8,%[width]                   \n"
     "jg        1b                              \n"
-  : "+r"(y_buf),    // %0
-    "+r"(u_buf),    // %1
-    "+r"(v_buf),    // %2
-    "+r"(argb_buf),  // %3
-    "+rm"(width)    // %4
-  : "r"(&kYuvConstants.kUVToB) // %5
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                          const uint8* uv_buf,
+                                          uint8* argb_buf,
+                                          int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    READNV12
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqu    %%xmm0,(%[argb_buf])            \n"
+    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
+    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
+    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                          const uint8* vu_buf,
+                                          uint8* argb_buf,
+                                          int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    READNV12
+    YVUTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqu    %%xmm0,(%[argb_buf])            \n"
+    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
+    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [uv_buf]"+r"(vu_buf),    // %[uv_buf]
+    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -1521,7 +1691,7 @@
                                 uint8* bgra_buf,
                                 int width) {
   asm volatile (
-    "sub       %1,%2                           \n"
+    "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "pxor      %%xmm4,%%xmm4                   \n"
     ".p2align  4                               \n"
@@ -1534,17 +1704,17 @@
     "movdqa    %%xmm5,%%xmm0                   \n"
     "punpcklwd %%xmm1,%%xmm5                   \n"
     "punpckhwd %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm5,(%3)                     \n"
-    "movdqa    %%xmm0,0x10(%3)                 \n"
-    "lea       0x20(%3),%3                     \n"
-    "sub       $0x8,%4                         \n"
+    "movdqa    %%xmm5,(%[argb_buf])            \n"
+    "movdqa    %%xmm0,0x10(%[argb_buf])        \n"
+    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
+    "sub       $0x8,%[width]                   \n"
     "jg        1b                              \n"
-  : "+r"(y_buf),    // %0
-    "+r"(u_buf),    // %1
-    "+r"(v_buf),    // %2
-    "+r"(bgra_buf),  // %3
-    "+rm"(width)    // %4
-  : "r"(&kYuvConstants.kUVToB) // %5
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [argb_buf]"+r"(bgra_buf),  // %[argb_buf]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -1558,7 +1728,7 @@
                                 uint8* abgr_buf,
                                 int width) {
   asm volatile (
-    "sub       %1,%2                           \n"
+    "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "pxor      %%xmm4,%%xmm4                   \n"
     ".p2align  4                               \n"
@@ -1570,17 +1740,17 @@
     "movdqa    %%xmm2,%%xmm1                   \n"
     "punpcklwd %%xmm0,%%xmm2                   \n"
     "punpckhwd %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,(%3)                     \n"
-    "movdqa    %%xmm1,0x10(%3)                 \n"
-    "lea       0x20(%3),%3                     \n"
-    "sub       $0x8,%4                         \n"
+    "movdqa    %%xmm2,(%[argb_buf])            \n"
+    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
+    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
+    "sub       $0x8,%[width]                   \n"
     "jg        1b                              \n"
-  : "+r"(y_buf),    // %0
-    "+r"(u_buf),    // %1
-    "+r"(v_buf),    // %2
-    "+r"(abgr_buf),  // %3
-    "+rm"(width)    // %4
-  : "r"(&kYuvConstants.kUVToB) // %5
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [argb_buf]"+r"(abgr_buf),  // %[argb_buf]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -1594,7 +1764,7 @@
                                           uint8* bgra_buf,
                                           int width) {
   asm volatile (
-    "sub       %1,%2                           \n"
+    "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "pxor      %%xmm4,%%xmm4                   \n"
     ".p2align  4                               \n"
@@ -1607,17 +1777,17 @@
     "movdqa    %%xmm5,%%xmm0                   \n"
     "punpcklwd %%xmm1,%%xmm5                   \n"
     "punpckhwd %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm5,(%3)                     \n"
-    "movdqu    %%xmm0,0x10(%3)                 \n"
-    "lea       0x20(%3),%3                     \n"
-    "sub       $0x8,%4                         \n"
+    "movdqu    %%xmm5,(%[argb_buf])            \n"
+    "movdqu    %%xmm0,0x10(%[argb_buf])        \n"
+    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
+    "sub       $0x8,%[width]                   \n"
     "jg        1b                              \n"
-  : "+r"(y_buf),    // %0
-    "+r"(u_buf),    // %1
-    "+r"(v_buf),    // %2
-    "+r"(bgra_buf),  // %3
-    "+rm"(width)    // %4
-  : "r"(&kYuvConstants.kUVToB) // %5
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [argb_buf]"+r"(bgra_buf),  // %[argb_buf]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -1631,7 +1801,7 @@
                                           uint8* abgr_buf,
                                           int width) {
   asm volatile (
-    "sub       %1,%2                           \n"
+    "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "pxor      %%xmm4,%%xmm4                   \n"
     ".p2align  4                               \n"
@@ -1643,24 +1813,23 @@
     "movdqa    %%xmm2,%%xmm1                   \n"
     "punpcklwd %%xmm0,%%xmm2                   \n"
     "punpckhwd %%xmm0,%%xmm1                   \n"
-    "movdqu    %%xmm2,(%3)                     \n"
-    "movdqu    %%xmm1,0x10(%3)                 \n"
-    "lea       0x20(%3),%3                     \n"
-    "sub       $0x8,%4                         \n"
+    "movdqu    %%xmm2,(%[argb_buf])            \n"
+    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
+    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
+    "sub       $0x8,%[width]                   \n"
     "jg        1b                              \n"
-  : "+r"(y_buf),    // %0
-    "+r"(u_buf),    // %1
-    "+r"(v_buf),    // %2
-    "+r"(abgr_buf),  // %3
-    "+rm"(width)    // %4
-  : "r"(&kYuvConstants.kUVToB) // %5
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [argb_buf]"+r"(abgr_buf),  // %[argb_buf]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 #endif
   );
 }
-
 #endif  // HAS_I422TOARGBROW_SSSE3
 
 #ifdef HAS_YTOARGBROW_SSE2