Reorder stores in FastConvertYUVToABGRRow_SSSE3 and FastConvertYUVToBGRARow_SSSE3. Add ReverseRow_SSE2. Allow CPU detection to be overridden via environment variables: set LIBYUV_DISABLE_SSSE3=1 or set LIBYUV_DISABLE_SSE2=1. Also reorder stores in rotate for Core 2.
BUG=none
TEST=none
Review URL: http://webrtc-codereview.appspot.com/317010

git-svn-id: http://libyuv.googlecode.com/svn/trunk@107 16f28f9a-4ce2-e073-06de-1de4eb20be90
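The CPU-detection change itself is not part of the diff below, but the override it describes is a simple pattern: after CPUID-based probing, drop a feature bit when the matching environment variable is set. A minimal sketch, assuming hypothetical names DetectCpuFlags(), kCpuHasSSSE3 and kCpuHasSSE2 (the real detection code lives outside this file):

  #include <stdlib.h>  // getenv()

  // Illustrative only: clear detected feature bits when the user sets
  // LIBYUV_DISABLE_SSSE3=1 or LIBYUV_DISABLE_SSE2=1 in the environment.
  int InitCpuFlags() {
    int cpu_info = DetectCpuFlags();  // hypothetical CPUID-based probe
    if (getenv("LIBYUV_DISABLE_SSSE3")) {
      cpu_info &= ~kCpuHasSSSE3;  // assumed feature-bit constant
    }
    if (getenv("LIBYUV_DISABLE_SSE2")) {
      cpu_info &= ~kCpuHasSSE2;   // assumed feature-bit constant
    }
    return cpu_info;
  }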
diff --git a/source/row_posix.cc b/source/row_posix.cc
index eadde78..b6e9bf9 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -17,16 +17,22 @@
 extern "C" {
 #endif
 
+#ifdef __APPLE__
+#define CONST
+#else
+#define CONST static const
+#endif
+
 #ifdef HAS_ARGBTOUVROW_SSSE3
-vec8 kARGBToU = {
+CONST vec8 kARGBToU = {
   112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
 };
 
-uvec8 kARGBToV = {
+CONST uvec8 kARGBToV = {
   -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
 };
 
-uvec8 kAddUV128 = {
+CONST uvec8 kAddUV128 = {
   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
 };
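Note: the new CONST macro resolves the TODO removed further down in this diff. On most platforms the lookup tables become static const (internal linkage, read-only storage); on OSX, where gcc hits an internal compiler fault on that form, CONST expands to nothing and the tables stay plain globals. After preprocessing, for example:

  static const vec8 kARGBToU = { 112, -74, -38, 0, /* ... */ };  // Linux
  vec8 kARGBToU = { 112, -74, -38, 0, /* ... */ };               // OSX workaround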
@@ -35,31 +41,31 @@
 #ifdef HAS_ARGBTOYROW_SSSE3
 
 // Constant multiplication table for converting ARGB to I400.
-vec8 kARGBToY = {
+CONST vec8 kARGBToY = {
   13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
 };
 
-uvec8 kAddY16 = {
+CONST uvec8 kAddY16 = {
   16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
 };
 
 // Shuffle table for converting BG24 to ARGB.
-uvec8 kShuffleMaskBG24ToARGB = {
+CONST uvec8 kShuffleMaskBG24ToARGB = {
   0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
 };
 
 // Shuffle table for converting RAW to ARGB.
-uvec8 kShuffleMaskRAWToARGB = {
+CONST uvec8 kShuffleMaskRAWToARGB = {
   2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
 };
 
 // Shuffle table for converting ABGR to ARGB.
-uvec8 kShuffleMaskABGRToARGB = {
+CONST uvec8 kShuffleMaskABGRToARGB = {
   2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
 };
 
 // Shuffle table for converting BGRA to ARGB.
-uvec8 kShuffleMaskBGRAToARGB = {
+CONST uvec8 kShuffleMaskBGRAToARGB = {
   3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
 };
 
@@ -352,7 +358,7 @@
   vec16 kUVBiasR;
   vec16 kYSub16;
   vec16 kYToRgb;
-} SIMD_ALIGNED(kYuvConstants) = {
+} CONST SIMD_ALIGNED(kYuvConstants) = {
   { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
   { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
   { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
@@ -445,8 +451,8 @@
     "punpcklbw   %%xmm2,%%xmm5                 \n"
     "movdqa      %%xmm5,%%xmm0                 \n"
     "punpcklwd   %%xmm1,%%xmm5                 \n"
-    "movdqa      %%xmm5,(%3)                   \n"
     "punpckhwd   %%xmm1,%%xmm0                 \n"
+    "movdqa      %%xmm5,(%3)                   \n"
     "movdqa      %%xmm0,0x10(%3)               \n"
     "lea         0x20(%3),%3                   \n"
     "sub         $0x8,%4                       \n"
@@ -480,8 +486,8 @@
     "punpcklbw   %%xmm5,%%xmm0                 \n"
     "movdqa      %%xmm2,%%xmm1                 \n"
     "punpcklwd   %%xmm0,%%xmm2                 \n"
-    "movdqa      %%xmm2,(%3)                   \n"
     "punpckhwd   %%xmm0,%%xmm1                 \n"
+    "movdqa      %%xmm2,(%3)                   \n"
     "movdqa      %%xmm1,0x10(%3)               \n"
     "lea         0x20(%3),%3                   \n"
     "sub         $0x8,%4                       \n"
@@ -640,11 +646,8 @@
 
 #ifdef HAS_REVERSE_ROW_SSSE3
 
-// TODO(fbarchard): define CONST macro that is static const for linux, but
-// does nothing for gcc on OSX (which has an internal compiler fault)
-
 // Shuffle table for reversing the bytes.
-uvec8 kShuffleReverse = {
+CONST uvec8 kShuffleReverse = {
   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
 };
 
@@ -653,14 +656,14 @@
   asm volatile (
   "movdqa     %3,%%xmm5                        \n"
   "lea        -0x10(%0,%2,1),%0                \n"
-"1:                                            \n"
-  "movdqa     (%0),%%xmm0                      \n"
-  "lea        -0x10(%0),%0                     \n"
-  "pshufb     %%xmm5,%%xmm0                    \n"
-  "movdqa     %%xmm0,(%1)                      \n"
-  "lea        0x10(%1),%1                      \n"
-  "sub        $0x10,%2                         \n"
-  "ja         1b                               \n"
+  "1:                                          \n"
+    "movdqa     (%0),%%xmm0                    \n"
+    "lea        -0x10(%0),%0                   \n"
+    "pshufb     %%xmm5,%%xmm0                  \n"
+    "movdqa     %%xmm0,(%1)                    \n"
+    "lea        0x10(%1),%1                    \n"
+    "sub        $0x10,%2                       \n"
+    "ja         1b                             \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
     "+r"(temp_width)  // %2
@@ -673,6 +676,38 @@
 }
 #endif
 
+#ifdef HAS_REVERSE_ROW_SSE2
+
+void ReverseRow_SSE2(const uint8* src, uint8* dst, int width) {
+  intptr_t temp_width = static_cast<intptr_t>(width);
+  asm volatile (
+  "lea        -0x10(%0,%2,1),%0                \n"
+  "1:                                          \n"
+    "movdqa     (%0),%%xmm0                    \n"
+    "lea        -0x10(%0),%0                   \n"
+    "movdqa     %%xmm0,%%xmm1                  \n"
+    "psllw      $0x8,%%xmm0                    \n"
+    "psrlw      $0x8,%%xmm1                    \n"
+    "por        %%xmm1,%%xmm0                  \n"
+    "pshuflw    $0x1b,%%xmm0,%%xmm0            \n"
+    "pshufhw    $0x1b,%%xmm0,%%xmm0            \n"
+    "pshufd     $0x4e,%%xmm0,%%xmm0            \n"
+    "movdqa     %%xmm0,(%1)                    \n"
+    "lea        0x10(%1),%1                    \n"
+    "sub        $0x10,%2                       \n"
+    "ja         1b                             \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(temp_width)  // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
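For reference, ReverseRow_SSE2 reverses 16 bytes per iteration without SSSE3's pshufb: psllw/psrlw/por swap the two bytes within each 16-bit word, pshuflw/pshufhw with shuffle code 0x1b reverse the word order within each 64-bit half, and pshufd 0x4e swaps the two halves. A scalar sketch of what both Reverse kernels compute (the SIMD versions additionally assume 16-byte-aligned pointers, required by movdqa, and a width that is a multiple of 16):

  // Scalar reference: copy 'width' bytes of 'src' into 'dst' in reverse order.
  static void ReverseRow_C(const uint8* src, uint8* dst, int width) {
    src += width - 1;  // start at the last byte of the row
    for (int i = 0; i < width; ++i) {
      dst[i] = *src--;
    }
  }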