Fix I444ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_Unaligned_SSSE3 and I411ToARGBRow_Unaligned_SSSE3 on Windows by using movdqu instead of movdqa.  Break YUVTORGB into 2 macros - one to fetch pixels, another to do the YUV conversion.  This reduces duplicated source and lends itself to future YUV formats.
BUG=none
TEST=WebRtcVideoFrameTest.ConvertToARGBBufferStride
Review URL: https://webrtc-codereview.appspot.com/644004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@279 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 479ece0..28f10c0 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -1212,7 +1212,6 @@
 #endif
   );
 }
-
 #endif  // HAS_ARGBTOYROW_SSSE3
 
 #ifdef HAS_I422TOARGBROW_SSSE3
@@ -1251,73 +1250,32 @@
   { YG, YG, YG, YG, YG, YG, YG, YG }
 };
 
-// Convert 8 pixels: 8 UV and 8 Y
-#define YUV444TORGB                                                            \
+// Read 8 UV from 444
+#define READYUV444                                                             \
     "movq       (%1),%%xmm0                    \n"                             \
     "movq       (%1,%2,1),%%xmm1               \n"                             \
     "lea        0x8(%1),%1                     \n"                             \
     "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
-    "movdqa     %%xmm0,%%xmm1                  \n"                             \
-    "movdqa     %%xmm0,%%xmm2                  \n"                             \
-    "pmaddubsw  (%5),%%xmm0                    \n"                             \
-    "pmaddubsw  16(%5),%%xmm1                  \n"                             \
-    "pmaddubsw  32(%5),%%xmm2                  \n"                             \
-    "psubw      48(%5),%%xmm0                  \n"                             \
-    "psubw      64(%5),%%xmm1                  \n"                             \
-    "psubw      80(%5),%%xmm2                  \n"                             \
-    "movq       (%0),%%xmm3                    \n"                             \
-    "lea        0x8(%0),%0                     \n"                             \
-    "punpcklbw  %%xmm4,%%xmm3                  \n"                             \
-    "psubsw     96(%5),%%xmm3                  \n"                             \
-    "pmullw     112(%5),%%xmm3                 \n"                             \
-    "paddsw     %%xmm3,%%xmm0                  \n"                             \
-    "paddsw     %%xmm3,%%xmm1                  \n"                             \
-    "paddsw     %%xmm3,%%xmm2                  \n"                             \
-    "psraw      $0x6,%%xmm0                    \n"                             \
-    "psraw      $0x6,%%xmm1                    \n"                             \
-    "psraw      $0x6,%%xmm2                    \n"                             \
-    "packuswb   %%xmm0,%%xmm0                  \n"                             \
-    "packuswb   %%xmm1,%%xmm1                  \n"                             \
-    "packuswb   %%xmm2,%%xmm2                  \n"
 
-// Convert 8 pixels: 4 UV and 8 Y
-#define YUV422TORGB                                                            \
+// Read 4 UV from 422, upsample to 8 UV
+#define READYUV422                                                             \
     "movd       (%1),%%xmm0                    \n"                             \
     "movd       (%1,%2,1),%%xmm1               \n"                             \
     "lea        0x4(%1),%1                     \n"                             \
     "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
     "punpcklwd  %%xmm0,%%xmm0                  \n"                             \
-    "movdqa     %%xmm0,%%xmm1                  \n"                             \
-    "movdqa     %%xmm0,%%xmm2                  \n"                             \
-    "pmaddubsw  (%5),%%xmm0                    \n"                             \
-    "pmaddubsw  16(%5),%%xmm1                  \n"                             \
-    "pmaddubsw  32(%5),%%xmm2                  \n"                             \
-    "psubw      48(%5),%%xmm0                  \n"                             \
-    "psubw      64(%5),%%xmm1                  \n"                             \
-    "psubw      80(%5),%%xmm2                  \n"                             \
-    "movq       (%0),%%xmm3                    \n"                             \
-    "lea        0x8(%0),%0                     \n"                             \
-    "punpcklbw  %%xmm4,%%xmm3                  \n"                             \
-    "psubsw     96(%5),%%xmm3                  \n"                             \
-    "pmullw     112(%5),%%xmm3                 \n"                             \
-    "paddsw     %%xmm3,%%xmm0                  \n"                             \
-    "paddsw     %%xmm3,%%xmm1                  \n"                             \
-    "paddsw     %%xmm3,%%xmm2                  \n"                             \
-    "psraw      $0x6,%%xmm0                    \n"                             \
-    "psraw      $0x6,%%xmm1                    \n"                             \
-    "psraw      $0x6,%%xmm2                    \n"                             \
-    "packuswb   %%xmm0,%%xmm0                  \n"                             \
-    "packuswb   %%xmm1,%%xmm1                  \n"                             \
-    "packuswb   %%xmm2,%%xmm2                  \n"
 
-// Convert 8 pixels: 2 UV and 8 Y
-#define YUV411TORGB                                                            \
+// Read 2 UV from 411, upsample to 8 UV
+#define READYUV411                                                             \
     "movd       (%1),%%xmm0                    \n"                             \
     "movd       (%1,%2,1),%%xmm1               \n"                             \
     "lea        0x2(%1),%1                     \n"                             \
     "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
     "punpcklwd  %%xmm0,%%xmm0                  \n"                             \
     "punpckldq  %%xmm0,%%xmm0                  \n"                             \
+
+// Convert 8 pixels: 8 UV and 8 Y
+#define YUVTORGB                                                               \
     "movdqa     %%xmm0,%%xmm1                  \n"                             \
     "movdqa     %%xmm0,%%xmm2                  \n"                             \
     "pmaddubsw  (%5),%%xmm0                    \n"                             \
@@ -1352,7 +1310,8 @@
     "pxor      %%xmm4,%%xmm4                   \n"
     ".p2align  4                               \n"
   "1:                                          \n"
-    YUV444TORGB
+    READYUV444
+    YUVTORGB
     "punpcklbw %%xmm1,%%xmm0                   \n"
     "punpcklbw %%xmm5,%%xmm2                   \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
@@ -1387,7 +1346,8 @@
     "pxor      %%xmm4,%%xmm4                   \n"
     ".p2align  4                               \n"
   "1:                                          \n"
-    YUV422TORGB
+    READYUV422
+    YUVTORGB
     "punpcklbw %%xmm1,%%xmm0                   \n"
     "punpcklbw %%xmm5,%%xmm2                   \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
@@ -1422,7 +1382,8 @@
     "pxor      %%xmm4,%%xmm4                   \n"
     ".p2align  4                               \n"
   "1:                                          \n"
-    YUV411TORGB
+    READYUV411
+    YUVTORGB
     "punpcklbw %%xmm1,%%xmm0                   \n"
     "punpcklbw %%xmm5,%%xmm2                   \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
@@ -1457,7 +1418,8 @@
     "pxor      %%xmm4,%%xmm4                   \n"
     ".p2align  4                               \n"
   "1:                                          \n"
-    YUV444TORGB
+    READYUV444
+    YUVTORGB
     "punpcklbw %%xmm1,%%xmm0                   \n"
     "punpcklbw %%xmm5,%%xmm2                   \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
@@ -1492,7 +1454,8 @@
     "pxor      %%xmm4,%%xmm4                   \n"
     ".p2align  4                               \n"
   "1:                                          \n"
-    YUV422TORGB
+    READYUV422
+    YUVTORGB
     "punpcklbw %%xmm1,%%xmm0                   \n"
     "punpcklbw %%xmm5,%%xmm2                   \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
@@ -1527,7 +1490,8 @@
     "pxor      %%xmm4,%%xmm4                   \n"
     ".p2align  4                               \n"
   "1:                                          \n"
-    YUV411TORGB
+    READYUV411
+    YUVTORGB
     "punpcklbw %%xmm1,%%xmm0                   \n"
     "punpcklbw %%xmm5,%%xmm2                   \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
@@ -1562,7 +1526,8 @@
     "pxor      %%xmm4,%%xmm4                   \n"
     ".p2align  4                               \n"
   "1:                                          \n"
-    YUV422TORGB
+    READYUV422
+    YUVTORGB
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "punpcklbw %%xmm0,%%xmm1                   \n"
     "punpcklbw %%xmm2,%%xmm5                   \n"
@@ -1598,7 +1563,8 @@
     "pxor      %%xmm4,%%xmm4                   \n"
     ".p2align  4                               \n"
   "1:                                          \n"
-    YUV422TORGB
+    READYUV422
+    YUVTORGB
     "punpcklbw %%xmm1,%%xmm2                   \n"
     "punpcklbw %%xmm5,%%xmm0                   \n"
     "movdqa    %%xmm2,%%xmm1                   \n"
@@ -1633,7 +1599,8 @@
     "pxor      %%xmm4,%%xmm4                   \n"
     ".p2align  4                               \n"
   "1:                                          \n"
-    YUV422TORGB
+    READYUV422
+    YUVTORGB
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "punpcklbw %%xmm0,%%xmm1                   \n"
     "punpcklbw %%xmm2,%%xmm5                   \n"
@@ -1669,7 +1636,8 @@
     "pxor      %%xmm4,%%xmm4                   \n"
     ".p2align  4                               \n"
   "1:                                          \n"
-    YUV422TORGB
+    READYUV422
+    YUVTORGB
     "punpcklbw %%xmm1,%%xmm2                   \n"
     "punpcklbw %%xmm5,%%xmm0                   \n"
     "movdqa    %%xmm2,%%xmm1                   \n"
@@ -1741,7 +1709,7 @@
 #endif
   );
 }
-#endif
+#endif  // HAS_YTOARGBROW_SSE2
 
 #ifdef HAS_MIRRORROW_SSSE3
 // Shuffle table for reversing the bytes.
@@ -1772,7 +1740,7 @@
 #endif
   );
 }
-#endif
+#endif  // HAS_MIRRORROW_SSSE3
 
 #ifdef HAS_MIRRORROW_SSE2
 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
@@ -1803,7 +1771,7 @@
 #endif
   );
 }
-#endif
+#endif  // HAS_MIRRORROW_SSE2
 
 #ifdef HAS_MIRRORROW_UV_SSSE3
 // Shuffle table for reversing the bytes of UV channels.
@@ -1838,7 +1806,7 @@
 #endif
   );
 }
-#endif
+#endif  // HAS_MIRRORROW_UV_SSSE3
 
 #ifdef HAS_ADDROW_SSE2
 // dst and width aligned to 16
@@ -1939,7 +1907,7 @@
 #endif
   );
 }
-#endif
+#endif  // HAS_SPLITUV_SSE2
 
 #ifdef HAS_COPYROW_SSE2
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
@@ -1979,7 +1947,7 @@
   : "memory", "cc"
   );
 }
-#endif
+#endif  // HAS_COPYROW_X86
 
 #ifdef HAS_YUY2TOYROW_SSE2
 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {