Add 2 ARGB Images together and store to destination
BUG=175
TEST=Add unittest
Review URL: https://webrtc-codereview.appspot.com/1049004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@543 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/source/row_posix.cc b/source/row_posix.cc
index c710241..04be0be 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -3948,10 +3948,10 @@
     "movdqa    %%xmm0,(%0,%1,1)                \n"
     "lea       0x10(%0),%0                     \n"
     "jg        1b                              \n"
-  : "+r"(src_argb),       // %0
-    "+r"(dst_argb),       // %1
-    "+r"(width)           // %2
-  : "r"(value)            // %3
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  : "r"(value)       // %3
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2"
@@ -3961,7 +3961,7 @@
 #endif  // HAS_ARGBSHADEROW_SSE2
 
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
-// Multiple 2 rows of ARGB pixels together, 4 pixels at a time.
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
 // Aligned to 16 bytes.
 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                           uint8* dst_argb, int width) {
@@ -3988,8 +3988,8 @@
     "movdqa    %%xmm0,(%0,%2,1)                \n"
     "lea       0x10(%0),%0                     \n"
     "jg        1b                              \n"
-  : "+r"(src_argb0),   // %0
-    "+r"(src_argb1),   // %1
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
     "+r"(dst_argb),   // %2
     "+r"(width)       // %3
   :
@@ -4001,6 +4001,39 @@
 }
 #endif  // HAS_ARGBMULTIPLYROW_SSE2
 
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels with per-byte unsigned saturation (paddusb),
+// 4 pixels (16 bytes) per pass; movdqa needs all 3 pointers 16-byte aligned.
+void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "pxor      %%xmm5,%%xmm5                   \n"  // xmm5 = 0; not read below
+    "sub       %0,%1                           \n"  // %1 = src_argb1 - src_argb0
+    "sub       %0,%2                           \n"  // %2 = dst_argb - src_argb0
+
+    // 4 pixel loop. Bottom-tested, so the body runs at least once.
+    ".p2align  4                               \n"  // align loop to 16 bytes
+  "1:                                          \n"
+    "movdqa    (%0),%%xmm0                     \n"  // xmm0 = 4 px of src_argb0
+    "movdqa    (%0,%1),%%xmm1                  \n"  // xmm1 = 4 px of src_argb1
+    "paddusb   %%xmm1,%%xmm0                   \n"  // saturating byte add
+    "sub       $0x4,%3                         \n"  // width -= 4, sets flags
+    "movdqa    %%xmm0,(%0,%2,1)                \n"  // store 4 px to dst_argb
+    "lea       0x10(%0),%0                     \n"  // src_argb0 += 16 bytes
+    "jg        1b                              \n"  // loop while width > 0
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_ARGBADDROW_SSE2
+
 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
 // Creates a table of cumulative sums where each value is a sum of all values
 // above and to the left of the value, inclusive of the value.