Add ARM optimizations to the build.

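Also fixed a bug in the ARM memset code that was causing some tests
and bench to fail: the count was converted from 16-bit shorts to bytes
only after the alignment fix-up had already subtracted 2 from it. The
conversion is now done first, so the subtraction operates on a byte count.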
Review URL: http://codereview.appspot.com/5522052

git-svn-id: http://skia.googlecode.com/svn/trunk@2989 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/src/opts/memset.arm.S b/src/opts/memset.arm.S
index 7732174..bc0c060 100644
--- a/src/opts/memset.arm.S
+++ b/src/opts/memset.arm.S
@@ -32,6 +32,10 @@
         .fnstart
         push        {lr}
 
+        /* Multiply count by 2 - go from the number of 16-bit shorts
+         * to the number of bytes desired. */
+        mov         r2, r2, lsl #1
+
         /* expand the data to 32 bits */
         orr         r1, r1, lsl #16
 
@@ -40,10 +44,6 @@
         strneh      r1, [r0], #2
         subne       r2, r2, #2
 
-        /* Multiply count by 2 - go from the number of 16-bit shorts
-         * to the number of bytes desired. */
-        mov         r2, r2, lsl #1
-
         /* Now jump into the main loop below. */
         b           .Lwork_32
         .fnend