Adapt the row interpolator to handle YUV as well as ARGB, without extruding (duplicating) the last pixel, so that it can be used in I420Scale.
BUG=237
TEST=Scale*
R=ryanpetrie@google.com

Review URL: https://webrtc-codereview.appspot.com/1587004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@710 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 6b6c886..760b9a9 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -4781,7 +4781,7 @@
   : "+r"(src_argb),  // %0
     "+r"(src_argb_stride_temp),  // %1
     "+r"(dst_argb),  // %2
-    "+r"(src_dudv),   // %3
+    "+r"(src_dudv),  // %3
     "+rm"(width),    // %4
     "+r"(temp)   // %5
   :
@@ -4793,11 +4793,10 @@
 }
 #endif  // HAS_ARGBAFFINEROW_SSE2
 
-// Bilinear image filtering.
-// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated.
-void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb,
-                              ptrdiff_t src_stride, int dst_width,
-                              int source_y_fraction) {
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                          ptrdiff_t src_stride, int dst_width,
+                          int source_y_fraction) {
   asm volatile (
     "sub       %1,%0                           \n"
     "shr       %3                              \n"
@@ -4831,7 +4830,7 @@
     "psrlw     $0x7,%%xmm0                     \n"
     "psrlw     $0x7,%%xmm1                     \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
+    "sub       $0x10,%2                        \n"
     "movdqa    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        1b                              \n"
@@ -4844,7 +4843,7 @@
     "movdqa    (%1,%4,1),%%xmm1                \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
+    "sub       $0x10,%2                        \n"
     "movdqa    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        25b                             \n"
@@ -4856,7 +4855,7 @@
     "movdqa    (%1),%%xmm0                     \n"
     "movdqa    (%1,%4,1),%%xmm1                \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
+    "sub       $0x10,%2                        \n"
     "movdqa    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        50b                             \n"
@@ -4869,7 +4868,7 @@
     "movdqa    (%1,%4,1),%%xmm0                \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
+    "sub       $0x10,%2                        \n"
     "movdqa    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        75b                             \n"
@@ -4879,14 +4878,14 @@
     ".p2align  4                               \n"
   "100:                                        \n"
     "movdqa    (%1),%%xmm0                     \n"
-    "sub       $0x4,%2                         \n"
+    "sub       $0x10,%2                        \n"
     "movdqa    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        100b                            \n"
 
   "99:                                         \n"
-  : "+r"(dst_argb),   // %0
-    "+r"(src_argb),   // %1
+  : "+r"(dst_ptr),    // %0
+    "+r"(src_ptr),    // %1
     "+r"(dst_width),  // %2
     "+r"(source_y_fraction)  // %3
   : "r"(static_cast<intptr_t>(src_stride))  // %4
@@ -4897,11 +4896,10 @@
   );
 }
 
-// Bilinear image filtering.
-// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated.
-void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb,
-                             ptrdiff_t src_stride, int dst_width,
-                             int source_y_fraction) {
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) {
   asm volatile (
     "sub       %1,%0                           \n"
     "shr       %3                              \n"
@@ -4943,7 +4941,7 @@
     "paddw     %%xmm2,%%xmm0                   \n"
     "paddw     %%xmm3,%%xmm1                   \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
+    "sub       $0x10,%2                        \n"
     "movdqa    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        1b                              \n"
@@ -4956,7 +4954,7 @@
     "movdqa    (%1,%4,1),%%xmm1                \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
+    "sub       $0x10,%2                        \n"
     "movdqa    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        25b                             \n"
@@ -4968,7 +4966,7 @@
     "movdqa    (%1),%%xmm0                     \n"
     "movdqa    (%1,%4,1),%%xmm1                \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
+    "sub       $0x10,%2                        \n"
     "movdqa    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        50b                             \n"
@@ -4981,7 +4979,7 @@
     "movdqa    (%1,%4,1),%%xmm0                \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
+    "sub       $0x10,%2                        \n"
     "movdqa    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        75b                             \n"
@@ -4991,14 +4989,14 @@
     ".p2align  4                               \n"
   "100:                                        \n"
     "movdqa    (%1),%%xmm0                     \n"
-    "sub       $0x4,%2                         \n"
+    "sub       $0x10,%2                        \n"
     "movdqa    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        100b                            \n"
 
   "99:                                         \n"
-  : "+r"(dst_argb),   // %0
-    "+r"(src_argb),   // %1
+  : "+r"(dst_ptr),    // %0
+    "+r"(src_ptr),    // %1
     "+r"(dst_width),  // %2
     "+r"(source_y_fraction)  // %3
   : "r"(static_cast<intptr_t>(src_stride))  // %4
@@ -5009,11 +5007,10 @@
   );
 }
 
-// Bilinear image filtering.
-// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated.
-void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb,
-                                        ptrdiff_t src_stride, int dst_width,
-                                        int source_y_fraction) {
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                                    ptrdiff_t src_stride, int dst_width,
+                                    int source_y_fraction) {
   asm volatile (
     "sub       %1,%0                           \n"
     "shr       %3                              \n"
@@ -5047,7 +5044,7 @@
     "psrlw     $0x7,%%xmm0                     \n"
     "psrlw     $0x7,%%xmm1                     \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
+    "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        1b                              \n"
@@ -5060,7 +5057,7 @@
     "movdqu    (%1,%4,1),%%xmm1                \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
+    "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        25b                             \n"
@@ -5072,7 +5069,7 @@
     "movdqu    (%1),%%xmm0                     \n"
     "movdqu    (%1,%4,1),%%xmm1                \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
+    "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        50b                             \n"
@@ -5085,7 +5082,7 @@
     "movdqu    (%1,%4,1),%%xmm0                \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
+    "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        75b                             \n"
@@ -5095,14 +5092,14 @@
     ".p2align  4                               \n"
   "100:                                        \n"
     "movdqu    (%1),%%xmm0                     \n"
-    "sub       $0x4,%2                         \n"
+    "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        100b                            \n"
 
   "99:                                         \n"
-  : "+r"(dst_argb),   // %0
-    "+r"(src_argb),   // %1
+  : "+r"(dst_ptr),    // %0
+    "+r"(src_ptr),    // %1
     "+r"(dst_width),  // %2
     "+r"(source_y_fraction)  // %3
   : "r"(static_cast<intptr_t>(src_stride))  // %4
@@ -5113,11 +5110,10 @@
   );
 }
 
-// Bilinear image filtering.
-// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated.
-void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb,
-                                       ptrdiff_t src_stride, int dst_width,
-                                       int source_y_fraction) {
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                                   ptrdiff_t src_stride, int dst_width,
+                                   int source_y_fraction) {
   asm volatile (
     "sub       %1,%0                           \n"
     "shr       %3                              \n"
@@ -5159,7 +5155,7 @@
     "paddw     %%xmm2,%%xmm0                   \n"
     "paddw     %%xmm3,%%xmm1                   \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
+    "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        1b                              \n"
@@ -5172,7 +5168,7 @@
     "movdqu    (%1,%4,1),%%xmm1                \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
+    "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        25b                             \n"
@@ -5184,7 +5180,7 @@
     "movdqu    (%1),%%xmm0                     \n"
     "movdqu    (%1,%4,1),%%xmm1                \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
+    "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        50b                             \n"
@@ -5197,7 +5193,7 @@
     "movdqu    (%1,%4,1),%%xmm0                \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
     "pavgb     %%xmm1,%%xmm0                   \n"
-    "sub       $0x4,%2                         \n"
+    "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        75b                             \n"
@@ -5207,14 +5203,14 @@
     ".p2align  4                               \n"
   "100:                                        \n"
     "movdqu    (%1),%%xmm0                     \n"
-    "sub       $0x4,%2                         \n"
+    "sub       $0x10,%2                        \n"
     "movdqu    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        100b                            \n"
 
   "99:                                         \n"
-  : "+r"(dst_argb),   // %0
-    "+r"(src_argb),   // %1
+  : "+r"(dst_ptr),    // %0
+    "+r"(src_ptr),    // %1
     "+r"(dst_width),  // %2
     "+r"(source_y_fraction)  // %3
   : "r"(static_cast<intptr_t>(src_stride))  // %4