Neon YToARGB; fix SSE2 YToARGB to match the C version
BUG=none
TEST=YToARGB_Opt
Review URL: https://webrtc-codereview.appspot.com/966007

git-svn-id: http://libyuv.googlecode.com/svn/trunk@466 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/source/row_posix.cc b/source/row_posix.cc
index fa0c07e..5e26005 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -2334,12 +2334,13 @@
                      uint8* rgb_buf,
                      int width) {
   asm volatile (
+    "pxor      %%xmm5,%%xmm5                   \n"
     "pcmpeqb   %%xmm4,%%xmm4                   \n"
     "pslld     $0x18,%%xmm4                    \n"
-    "mov       $0x10001000,%%eax               \n"
+    "mov       $0x00100010,%%eax               \n"
     "movd      %%eax,%%xmm3                    \n"
     "pshufd    $0x0,%%xmm3,%%xmm3              \n"
-    "mov       $0x012a012a,%%eax               \n"
+    "mov       $0x004a004a,%%eax               \n"
     "movd      %%eax,%%xmm2                    \n"
     "pshufd    $0x0,%%xmm2,%%xmm2              \n"
     ".p2align  4                               \n"
@@ -2347,9 +2348,10 @@
     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
     "movq      (%0),%%xmm0                     \n"
     "lea       0x8(%0),%0                      \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
     "psubusw   %%xmm3,%%xmm0                   \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
+    "pmullw    %%xmm2,%%xmm0                   \n"
+    "psrlw     $6, %%xmm0                      \n"
     "packuswb  %%xmm0,%%xmm0                   \n"
 
     // Step 2: Weave into ARGB