Apply .p2align 4 to all loops, copy stride to a local for scale, and copy the last byte in the bilinear filter more efficiently
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/547007

git-svn-id: http://libyuv.googlecode.com/svn/trunk@255 16f28f9a-4ce2-e073-06de-1de4eb20be90
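The row_posix.cc change below is mechanical: each inner loop gains a ".p2align 4" directive immediately before its "1:" label, so the assembler pads the preceding code with nops and the loop entry starts on a 16-byte (2^4 byte) boundary. A minimal sketch of the pattern in a stand-alone SSE2 copy loop follows (illustration only, not part of this patch; the function name is hypothetical and count is assumed to be a positive multiple of 16):

#include <stdint.h>

// Sketch of the ".p2align 4" pattern used throughout row_posix.cc (x86/x86-64, SSE2).
static void CopyRow_Sketch(const uint8_t* src, uint8_t* dst, int count) {
  asm volatile (
    ".p2align  4                               \n"  // pad with nops so "1:" lands on a 16-byte boundary
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"  // load 16 bytes (unaligned-safe)
    "lea       0x10(%0),%0                     \n"
    "movdqu    %%xmm0,(%1)                     \n"  // store 16 bytes
    "lea       0x10(%1),%1                     \n"
    "sub       $0x10,%2                        \n"
    "ja        1b                              \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(count)   // %2
  :
  : "memory", "cc", "xmm0"
  );
}
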
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 1a8f4fb..122b309 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -112,6 +112,7 @@
   asm volatile (
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "pslld     $0x18,%%xmm5                    \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movq      (%0),%%xmm0                     \n"
     "lea       0x8(%0),%0                      \n"
@@ -141,6 +142,7 @@
   asm volatile (
     "movdqa    %3,%%xmm5                       \n"
     "sub       %0,%1                           \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "pshufb    %%xmm5,%%xmm0                   \n"
@@ -164,6 +166,7 @@
   asm volatile (
     "movdqa    %3,%%xmm5                       \n"
     "sub       %0,%1                           \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "pshufb    %%xmm5,%%xmm0                   \n"
@@ -187,6 +190,7 @@
     "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
     "pslld     $0x18,%%xmm5                    \n"
     "movdqa    %3,%%xmm4                       \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqu    (%0),%%xmm0                     \n"
     "movdqu    0x10(%0),%%xmm1                 \n"
@@ -227,6 +231,7 @@
     "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
     "pslld     $0x18,%%xmm5                    \n"
     "movdqa    %3,%%xmm4                       \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqu    (%0),%%xmm0                     \n"
     "movdqu    0x10(%0),%%xmm1                 \n"
@@ -279,6 +284,7 @@
     "psllw     $0x8,%%xmm7                     \n"
     "sub       %0,%1                           \n"
     "sub       %0,%1                           \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqu    (%0),%%xmm0                     \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
@@ -327,6 +333,7 @@
     "psllw     $0x8,%%xmm7                     \n"
     "sub       %0,%1                           \n"
     "sub       %0,%1                           \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqu    (%0),%%xmm0                     \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
@@ -372,6 +379,7 @@
     "pslld     $0x4,%%xmm5                     \n"
     "sub       %0,%1                           \n"
     "sub       %0,%1                           \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqu    (%0),%%xmm0                     \n"
     "movdqa    %%xmm0,%%xmm2                   \n"
@@ -405,6 +413,7 @@
 void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
   asm volatile (
     "movdqa    %3,%%xmm6                       \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "movdqa    0x10(%0),%%xmm1                 \n"
@@ -445,6 +454,7 @@
 void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
   asm volatile (
     "movdqa    %3,%%xmm6                       \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "movdqa    0x10(%0),%%xmm1                 \n"
@@ -491,6 +501,7 @@
     "pslld     $0x5,%%xmm4                     \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "pslld     $0xb,%%xmm5                     \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
@@ -531,6 +542,7 @@
     "pslld     $0xa,%%xmm6                     \n"
     "pcmpeqb   %%xmm7,%%xmm7                   \n"
     "pslld     $0xf,%%xmm7                     \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
@@ -570,6 +582,7 @@
     "psllw     $0xc,%%xmm4                     \n"
     "movdqa    %%xmm4,%%xmm3                   \n"
     "psrlw     $0x8,%%xmm3                     \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
@@ -599,6 +612,7 @@
   asm volatile (
     "movdqa    %4,%%xmm5                       \n"
     "movdqa    %3,%%xmm4                       \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "movdqa    0x10(%0),%%xmm1                 \n"
@@ -635,6 +649,7 @@
   asm volatile (
     "movdqa    %4,%%xmm5                       \n"
     "movdqa    %3,%%xmm4                       \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqu    (%0),%%xmm0                     \n"
     "movdqu    0x10(%0),%%xmm1                 \n"
@@ -689,6 +704,7 @@
   );
   asm volatile (
     "sub       %1,%2                           \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "movdqa    0x10(%0),%%xmm1                 \n"
@@ -753,6 +769,7 @@
   );
   asm volatile (
     "sub       %1,%2                           \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqu    (%0),%%xmm0                     \n"
     "movdqu    0x10(%0),%%xmm1                 \n"
@@ -808,6 +825,7 @@
   asm volatile (
     "movdqa    %4,%%xmm5                       \n"
     "movdqa    %3,%%xmm4                       \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "movdqa    0x10(%0),%%xmm1                 \n"
@@ -844,6 +862,7 @@
   asm volatile (
     "movdqa    %4,%%xmm5                       \n"
     "movdqa    %3,%%xmm4                       \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqu    (%0),%%xmm0                     \n"
     "movdqu    0x10(%0),%%xmm1                 \n"
@@ -893,6 +912,7 @@
   );
   asm volatile (
     "sub       %1,%2                           \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "movdqa    0x10(%0),%%xmm1                 \n"
@@ -957,6 +977,7 @@
   );
   asm volatile (
     "sub       %1,%2                           \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqu    (%0),%%xmm0                     \n"
     "movdqu    0x10(%0),%%xmm1                 \n"
@@ -1012,6 +1033,7 @@
   asm volatile (
     "movdqa    %4,%%xmm5                       \n"
     "movdqa    %3,%%xmm4                       \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "movdqa    0x10(%0),%%xmm1                 \n"
@@ -1048,6 +1070,7 @@
   asm volatile (
     "movdqa    %4,%%xmm5                       \n"
     "movdqa    %3,%%xmm4                       \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqu    (%0),%%xmm0                     \n"
     "movdqu    0x10(%0),%%xmm1                 \n"
@@ -1097,6 +1120,7 @@
   );
   asm volatile (
     "sub       %1,%2                           \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "movdqa    0x10(%0),%%xmm1                 \n"
@@ -1161,6 +1185,7 @@
   );
   asm volatile (
     "sub       %1,%2                           \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqu    (%0),%%xmm0                     \n"
     "movdqu    0x10(%0),%%xmm1                 \n"
@@ -1295,6 +1320,7 @@
     "sub       %1,%2                           \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "pxor      %%xmm4,%%xmm4                   \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     YUVTORGB
     "punpcklbw %%xmm1,%%xmm0                   \n"
@@ -1329,6 +1355,7 @@
     "sub       %1,%2                           \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "pxor      %%xmm4,%%xmm4                   \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     YUVTORGB
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
@@ -1364,6 +1391,7 @@
     "sub       %1,%2                           \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "pxor      %%xmm4,%%xmm4                   \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     YUVTORGB
     "punpcklbw %%xmm1,%%xmm2                   \n"
@@ -1398,6 +1426,7 @@
     "sub       %1,%2                           \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "pxor      %%xmm4,%%xmm4                   \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     YUVTORGB
     "punpcklbw %%xmm1,%%xmm0                   \n"
@@ -1432,6 +1461,7 @@
     "sub       %1,%2                           \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "pxor      %%xmm4,%%xmm4                   \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     YUVTORGB
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
@@ -1467,6 +1497,7 @@
     "sub       %1,%2                           \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "pxor      %%xmm4,%%xmm4                   \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     YUVTORGB
     "punpcklbw %%xmm1,%%xmm2                   \n"
@@ -1501,6 +1532,7 @@
     "sub       %1,%2                           \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "pxor      %%xmm4,%%xmm4                   \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movd      (%1),%%xmm0                     \n"
     "movd      (%1,%2,1),%%xmm1                \n"
@@ -1562,6 +1594,7 @@
     "mov       $0x012a012a,%%eax               \n"
     "movd      %%eax,%%xmm2                    \n"
     "pshufd    $0x0,%%xmm2,%%xmm2              \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
     "movq      (%0),%%xmm0                     \n"
@@ -1607,6 +1640,7 @@
   asm volatile (
     "movdqa    %3,%%xmm5                       \n"
     "lea       -0x10(%0),%0                    \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0,%2),%%xmm0                  \n"
     "pshufb    %%xmm5,%%xmm0                   \n"
@@ -1631,6 +1665,7 @@
   intptr_t temp_width = static_cast<intptr_t>(width);
   asm volatile (
     "lea       -0x10(%0),%0                    \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqu    (%0,%2),%%xmm0                  \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
@@ -1668,6 +1703,7 @@
     "movdqa    %4,%%xmm1                       \n"
     "lea       -16(%0,%3,2),%0                 \n"
     "sub       %1,%2                           \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "lea       -16(%0),%0                      \n"
@@ -1695,6 +1731,7 @@
 void AddRow_SSE2(const uint8* src, uint16* dst, int width) {
   asm volatile (
     "pxor      %%xmm4,%%xmm4                   \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqu    (%0),%%xmm2                     \n"
     "lea       0x10(%0),%0                     \n"
@@ -1725,6 +1762,7 @@
 void SubRow_SSE2(const uint8* src, uint16* dst, int width) {
   asm volatile (
     "pxor      %%xmm4,%%xmm4                   \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqu    (%0),%%xmm2                     \n"
     "lea       0x10(%0),%0                     \n"
@@ -1758,6 +1796,7 @@
     "pcmpeqb    %%xmm5,%%xmm5                    \n"
     "psrlw      $0x8,%%xmm5                      \n"
     "sub        %1,%2                            \n"
+    ".p2align  4                               \n"
   "1:                                            \n"
     "movdqa     (%0),%%xmm0                      \n"
     "movdqa     0x10(%0),%%xmm1                  \n"
@@ -1833,6 +1872,7 @@
   asm volatile (
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "psrlw     $0x8,%%xmm5                     \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "movdqa    0x10(%0),%%xmm1                 \n"
@@ -1861,6 +1901,7 @@
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "psrlw     $0x8,%%xmm5                     \n"
     "sub       %1,%2                           \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "movdqa    0x10(%0),%%xmm1                 \n"
@@ -1900,6 +1941,7 @@
   asm volatile (
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "psrlw     $0x8,%%xmm5                     \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqu    (%0),%%xmm0                     \n"
     "movdqu    0x10(%0),%%xmm1                 \n"
@@ -1930,6 +1972,7 @@
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "psrlw     $0x8,%%xmm5                     \n"
     "sub       %1,%2                           \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqu    (%0),%%xmm0                     \n"
     "movdqu    0x10(%0),%%xmm1                 \n"
@@ -1965,6 +2008,7 @@
 
 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
   asm volatile (
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "movdqa    0x10(%0),%%xmm1                 \n"
@@ -1993,6 +2037,7 @@
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "psrlw     $0x8,%%xmm5                     \n"
     "sub       %1,%2                           \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "movdqa    0x10(%0),%%xmm1                 \n"
@@ -2029,6 +2074,7 @@
 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
                                uint8* dst_y, int pix) {
   asm volatile (
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqu    (%0),%%xmm0                     \n"
     "movdqu    0x10(%0),%%xmm1                 \n"
@@ -2057,6 +2103,7 @@
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "psrlw     $0x8,%%xmm5                     \n"
     "sub       %1,%2                           \n"
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqu    (%0),%%xmm0                     \n"
     "movdqu    0x10(%0),%%xmm1                 \n"
@@ -2109,6 +2156,7 @@
     "pslld     $0x18,%%xmm4                    \n"
 
   // 8 pixel loop
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqu    (%0),%%xmm3                     \n"
     "movdqa    %%xmm3,%%xmm0                   \n"
@@ -2184,6 +2232,7 @@
     "pslld     $0x18,%%xmm4                    \n"
 
   // 1 pixel loop
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movd      (%0),%%xmm3                     \n"
     "lea       0x4(%0),%0                      \n"
@@ -2241,6 +2290,7 @@
     "pslld     $0x18,%%xmm4                    \n"
 
   // 8 pixel loop
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqu    (%0),%%xmm3                     \n"
     "movdqa    %%xmm3,%%xmm0                   \n"
@@ -2313,6 +2363,7 @@
     "pslld     $0x18,%%xmm4                    \n"
 
   // 1 pixel loop
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movd      (%0),%%xmm3                     \n"
     "lea       0x4(%0),%0                      \n"
@@ -2361,6 +2412,7 @@
     "psrld     $0x8,%%xmm5                     \n"
 
   // 4 pixel loop
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "punpcklbw %%xmm0,%%xmm0                   \n"
@@ -2415,6 +2467,7 @@
     "movdqa    %4,%%xmm5                       \n"
 
   // 4 pixel loop
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "pshufb    %%xmm4,%%xmm0                   \n"
@@ -2503,6 +2556,7 @@
     "pslld     $0x18,%%xmm4                    \n"
 
   // 4 pixel loop
+    ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "movzb     0x3(%0),%3                      \n"