Add ARGBAffineRow_SSE2 (GCC inline assembly).
BUG=62
TEST=planar_unittest
Review URL: https://webrtc-codereview.appspot.com/733004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@317 16f28f9a-4ce2-e073-06de-1de4eb20be90
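
For reference, the new SSE2 row below samples the source along an affine slope: uv_dudv holds (u, v, du, dv), and each destination pixel copies the 4-byte ARGB value at src_argb + (int)v * src_argb_stride + (int)u * 4, then steps (u, v) by (du, dv). A minimal scalar sketch of that behavior follows; the name ARGBAffineRow_C_Sketch and the stdint types are illustrative only and not part of this patch.

#include <stdint.h>
#include <string.h>

// Illustrative scalar reference for ARGBAffineRow_SSE2; not committed code.
static void ARGBAffineRow_C_Sketch(const uint8_t* src_argb,
                                   int src_argb_stride,
                                   uint8_t* dst_argb,
                                   const float* uv_dudv,
                                   int width) {
  float u = uv_dudv[0];         // starting source x, in pixels.
  float v = uv_dudv[1];         // starting source y, in rows.
  const float du = uv_dudv[2];  // per-pixel step in x.
  const float dv = uv_dudv[3];  // per-pixel step in y.
  for (int i = 0; i < width; ++i) {
    const int x = static_cast<int>(u);  // cvttps2dq truncates toward zero.
    const int y = static_cast<int>(v);
    // Byte offset mirrors the pmaddwd in the asm: x * 4 + y * stride.
    memcpy(dst_argb + i * 4, src_argb + y * src_argb_stride + x * 4, 4);
    u += du;
    v += dv;
  }
}

The SSE2 version processes two pixels per iteration: it packs the truncated (x, y) pairs into 16-bit lanes and uses pmaddwd against a broadcast (stride << 16) | 4 to form both byte offsets in one instruction.
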
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 06aefb5..ee1dbc0 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -3219,6 +3219,82 @@
 }
 #endif  // HAS_ARGBSHADE_SSE2
 
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy ARGB pixels from the source image along an affine slope
+// (u, v stepped by du, dv per pixel) to a row of the destination.
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+                        uint8* dst_argb, const float* uv_dudv, int width) {
+  intptr_t src_argb_stride_temp = src_argb_stride;
+  asm volatile (
+    "movq      (%3),%%xmm2                     \n"
+    "movq      0x8(%3),%%xmm3                  \n"
+    "shl       $0x10,%1                        \n"
+    "add       $0x4,%1                         \n"
+    "movd      %1,%%xmm4                       \n"
+    "xor       %1,%1                           \n"  // cleanse upper bits.
+    "sub       $0x2,%4                         \n"
+    "jl        29f                             \n"
+    "movdqa    %%xmm2,%%xmm0                   \n"
+    "addps     %%xmm3,%%xmm0                   \n"
+    "movlhps   %%xmm0,%%xmm2                   \n"
+    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
+    "movlhps   %%xmm3,%%xmm3                   \n"
+    "addps     %%xmm3,%%xmm3                   \n"
+    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
+
+  // 2 pixel loop
+    ".p2align  2                               \n"
+  "20:                                         \n"
+    "cvttps2dq %%xmm2,%%xmm1                   \n"
+    "packssdw  %%xmm1,%%xmm1                   \n"
+    "pmaddwd   %%xmm4,%%xmm1                   \n"
+    "addps     %%xmm3,%%xmm2                   \n"
+    "movd      %%xmm1,%1                       \n"
+    "and       $0x0fffffff,%1                  \n"
+    "movdqa    %%xmm1,%%xmm5                   \n"
+    "pshufd    $0x55,%%xmm5,%%xmm5             \n"
+    "movd      (%0,%1,1),%%xmm0                \n"
+    "movd      %%xmm5,%1                       \n"
+    "and       $0x0fffffff,%1                  \n"
+    "movd      (%0,%1,1),%%xmm5                \n"
+    "punpckldq %%xmm5,%%xmm0                   \n"
+    "sub       $0x2,%4                         \n"
+    "movq      %%xmm0,(%2)                     \n"
+    "lea       0x8(%2),%2                      \n"
+    "jge       20b                             \n"
+
+  "29:                                         \n"
+    "add       $0x1,%4                         \n"
+    "jl        19f                             \n"
+
+  // 1 pixel loop
+    ".p2align  2                               \n"
+  "10:                                         \n"
+    "cvttps2dq %%xmm2,%%xmm1                   \n"
+    "packssdw  %%xmm1,%%xmm1                   \n"
+    "pmaddwd   %%xmm4,%%xmm1                   \n"
+    "addps     %%xmm3,%%xmm2                   \n"
+    "movd      %%xmm1,%1                       \n"
+    "and       $0x0fffffff,%1                  \n"
+    "movd      (%0,%1,1),%%xmm0                \n"
+    "sub       $0x1,%4                         \n"
+    "movd      %%xmm0,(%2)                     \n"
+    "lea       0x4(%2),%2                      \n"
+    "jge       10b                             \n"
+  "19:                                         \n"
+  : "+r"(src_argb),  // %0
+    "+r"(src_argb_stride_temp),  // %1
+    "+r"(dst_argb),  // %2
+    "+r"(uv_dudv),   // %3
+    "+rm"(width)     // %4
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_ARGBAFFINEROW_SSE2
+
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus