Add .p2align to all loops, copy stride to a local for scale, and copy the last byte in bilinear more efficiently
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/547007

git-svn-id: http://libyuv.googlecode.com/svn/trunk@255 16f28f9a-4ce2-e073-06de-1de4eb20be90
diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc
index 70dd420..af790ae 100644
--- a/source/rotate_neon.cc
+++ b/source/rotate_neon.cc
@@ -32,6 +32,7 @@
     "sub         %4, #8                        \n"
 
     // handle 8x8 blocks.  this should be the majority of the plane
+    ".p2align  4                               \n"
     "1:                                        \n"
       "mov         r9, %0                      \n"
 
@@ -198,6 +199,7 @@
     "sub         %6, #8                        \n"
 
     // handle 8x8 blocks.  this should be the majority of the plane
+    ".p2align  4                               \n"
     "1:                                        \n"
       "mov         r9, %0                      \n"