4 pixel version of ARGBAffineRow for gcc, and an aligned version for win.
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/714007

git-svn-id: http://libyuv.googlecode.com/svn/trunk@320 16f28f9a-4ce2-e073-06de-1de4eb20be90
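For context, the SSE2 routine below performs the same per-row work as libyuv's plain C fallback (ARGBAffineRow_C): walk the destination row, copy the source pixel at the truncated (u, v) coordinate, and step (u, v) by (du, dv) for each output pixel. A minimal scalar sketch of that operation, written with standard types instead of libyuv's uint8 typedef and ignoring any bounds handling the caller is expected to do:

#include <stdint.h>
#include <string.h>

// Scalar sketch only; not the upstream code. uv_dudv holds {u, v, du, dv}.
void ARGBAffineRow_Sketch(const uint8_t* src_argb, int src_argb_stride,
                          uint8_t* dst_argb, const float* uv_dudv, int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  const float du = uv_dudv[2];
  const float dv = uv_dudv[3];
  for (int i = 0; i < width; ++i) {
    // Truncate to integer texel coordinates and copy one 4-byte ARGB pixel.
    const int x = static_cast<int>(u);
    const int y = static_cast<int>(v);
    memcpy(dst_argb + i * 4, src_argb + y * src_argb_stride + x * 4, 4);
    u += du;
    v += dv;
  }
}

The 4 pixel loop in the patch computes the same source offsets with SSE2, four at a time.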
diff --git a/source/row_posix.cc b/source/row_posix.cc
index ee1dbc0..b70fcd0 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -3220,61 +3220,91 @@
 #endif  // HAS_ARGBSHADE_SSE2
 
 #ifdef HAS_ARGBAFFINEROW_SSE2
+// TODO(fbarchard): Find 64 bit way to avoid masking.
+// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
 // Copy ARGB pixels from source image with slope to a row of destination.
 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                         uint8* dst_argb, const float* uv_dudv, int width) {
   intptr_t src_argb_stride_temp = src_argb_stride;
+  intptr_t temp = 0;
   asm volatile (
     "movq      (%3),%%xmm2                     \n"
-    "movq      0x8(%3),%%xmm3                  \n"
+    "movq      0x8(%3),%%xmm7                  \n"
     "shl       $0x10,%1                        \n"
     "add       $0x4,%1                         \n"
-    "movd      %1,%%xmm4                       \n"
-    "xor       %1,%1                           \n"  // cleanse upper bits.
-    "sub       $0x2,%4                         \n"
-    "jl        29f                             \n"
+    "movd      %1,%%xmm5                       \n"
+    "sub       $0x4,%4                         \n"
+    "jl        49f                             \n"
+
+    "pshufd    $0x44,%%xmm7,%%xmm7             \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
     "movdqa    %%xmm2,%%xmm0                   \n"
-    "addps     %%xmm3,%%xmm0                   \n"
+    "addps     %%xmm7,%%xmm0                   \n"
     "movlhps   %%xmm0,%%xmm2                   \n"
-    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
-    "movlhps   %%xmm3,%%xmm3                   \n"
-    "addps     %%xmm3,%%xmm3                   \n"
-    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
+    "movdqa    %%xmm7,%%xmm4                   \n"
+    "addps     %%xmm4,%%xmm4                   \n"
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "addps     %%xmm4,%%xmm3                   \n"
+    "addps     %%xmm4,%%xmm4                   \n"
 
-  // 2 pixel loop                              \n"
-    ".p2align  2                               \n"
-  "20:                                         \n"
-    "cvttps2dq %%xmm2,%%xmm1                   \n"
-    "packssdw  %%xmm1,%%xmm1                   \n"
-    "pmaddwd   %%xmm4,%%xmm1                   \n"
-    "addps     %%xmm3,%%xmm2                   \n"
-    "movd      %%xmm1,%1                       \n"
+  // 4 pixel loop                              \n"
+    ".p2align  4                               \n"
+  "40:                                         \n"
+    "cvttps2dq %%xmm2,%%xmm0                   \n"
+    "cvttps2dq %%xmm3,%%xmm1                   \n"
+    "packssdw  %%xmm1,%%xmm0                   \n"
+    "pmaddwd   %%xmm5,%%xmm0                   \n"
+#if defined(__x86_64__)
+    "movq      %%xmm0,%1                       \n"
+    "mov       %1,%5                           \n"
     "and       $0x0fffffff,%1                  \n"
-    "movdqa    %%xmm1,%%xmm5                   \n"
-    "pshufd    $0x55,%%xmm5,%%xmm5             \n"
+    "shr       $32,%5                          \n"
+    "pshufd    $0xEE,%%xmm0,%%xmm0             \n"
+#else
+    "movd      %%xmm0,%1                       \n"
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+    "movd      %%xmm0,%5                       \n"
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+#endif
+    "movd      (%0,%1,1),%%xmm1                \n"
+    "movd      (%0,%5,1),%%xmm6                \n"
+    "punpckldq %%xmm6,%%xmm1                   \n"
+    "addps     %%xmm4,%%xmm2                   \n"
+    "movq      %%xmm1,(%2)                     \n"
+#if defined(__x86_64__)
+    "movq      %%xmm0,%1                       \n"
+    "mov       %1,%5                           \n"
+    "and       $0x0fffffff,%1                  \n"
+    "shr       $32,%5                          \n"
+#else
+    "movd      %%xmm0,%1                       \n"
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+    "movd      %%xmm0,%5                       \n"
+#endif
     "movd      (%0,%1,1),%%xmm0                \n"
-    "movd      %%xmm5,%1                       \n"
-    "and       $0x0fffffff,%1                  \n"
-    "movd      (%0,%1,1),%%xmm5                \n"
-    "punpckldq %%xmm5,%%xmm0                   \n"
-    "sub       $0x2,%4                         \n"
-    "movq      %%xmm0,(%2)                     \n"
-    "lea       0x8(%2),%2                      \n"
-    "jge       20b                             \n"
+    "movd      (%0,%5,1),%%xmm6                \n"
+    "punpckldq %%xmm6,%%xmm0                   \n"
+    "addps     %%xmm4,%%xmm3                   \n"
+    "sub       $0x4,%4                         \n"
+    "movq      %%xmm0,0x08(%2)                 \n"
+    "lea       0x10(%2),%2                     \n"
+    "jge       40b                             \n"
 
-  "29:                                         \n"
-    "add       $0x1,%4                         \n"
+  "49:                                         \n"
+    "add       $0x3,%4                         \n"
     "jl        19f                             \n"
 
   // 1 pixel loop                              \n"
-    ".p2align  2                               \n"
+    ".p2align  4                               \n"
   "10:                                         \n"
-    "cvttps2dq %%xmm2,%%xmm1                   \n"
-    "packssdw  %%xmm1,%%xmm1                   \n"
-    "pmaddwd   %%xmm4,%%xmm1                   \n"
-    "addps     %%xmm3,%%xmm2                   \n"
-    "movd      %%xmm1,%1                       \n"
+    "cvttps2dq %%xmm2,%%xmm0                   \n"
+    "packssdw  %%xmm0,%%xmm0                   \n"
+    "pmaddwd   %%xmm5,%%xmm0                   \n"
+    "addps     %%xmm7,%%xmm2                   \n"
+    "movd      %%xmm0,%1                       \n"
+#if defined(__x86_64__)
     "and       $0x0fffffff,%1                  \n"
+#endif
     "movd      (%0,%1,1),%%xmm0                \n"
     "sub       $0x1,%4                         \n"
     "movd      %%xmm0,(%2)                     \n"
@@ -3285,11 +3315,12 @@
     "+r"(src_argb_stride_temp),  // %1
     "+r"(dst_argb),  // %2
     "+r"(uv_dudv),   // %3
-    "+rm"(width)     // %4
+    "+rm"(width),    // %4
+    "+r"(temp)   // %5
   :
   : "memory", "cc"
 #if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
 #endif
   );
 }
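
A note on the addressing trick in the new 4 pixel loop: %1 is preloaded with (src_argb_stride << 16) | 4 and broadcast into xmm5, xmm2 and xmm3 each hold two (u, v) pairs (pixels 0-1 and 2-3), and xmm4 carries the per-iteration advance of 4*(du, dv). cvttps2dq plus packssdw turn the four (u, v) pairs into eight int16 lanes, and a single pmaddwd against xmm5 then yields four byte offsets of the form u*4 + v*stride. A scalar equivalent of that offset computation (the helper name is invented for illustration; it assumes the coordinates and the stride fit in signed 16 bits, which packssdw/pmaddwd require of the vector path as well):

#include <stdint.h>

// Hypothetical helper, not part of libyuv: mirrors the pmaddwd trick where
// each lane of the packed constant holds 4 (bytes per ARGB pixel) in its low
// 16 bits and src_argb_stride in its high 16 bits.
static int32_t AffineByteOffset(float u, float v, int src_argb_stride) {
  const int16_t x = static_cast<int16_t>(u);      // cvttps2dq + packssdw
  const int16_t y = static_cast<int16_t>(v);
  const int16_t step_x = 4;                       // low half of the constant
  const int16_t step_y = static_cast<int16_t>(src_argb_stride);  // high half
  return static_cast<int32_t>(x) * step_x +       // pmaddwd: x*4 + y*stride
         static_cast<int32_t>(y) * step_y;
}

The #if defined(__x86_64__) branches only differ in how the four packed offsets leave xmm0: one 64-bit movq plus a shift per pair on x86_64, versus movd/pshufd rotations on 32-bit x86.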