Add Neon versions of memset32 and memset16

Patch by pgalizia (of codeaurora.org)

(Note: I don't read ARM and I didn't manage to find a reviewer for the
ARM assembly code so this is landing somewhat unreviewed.)

http://codereview.appspot.com/1157045/show

git-svn-id: http://skia.googlecode.com/svn/trunk@573 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/src/opts/memset16_neon.S b/src/opts/memset16_neon.S
new file mode 100644
index 0000000..b47cc22
--- /dev/null
+++ b/src/opts/memset16_neon.S
@@ -0,0 +1,152 @@
+/***************************************************************************
+ Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License"); you
+ may not use this file except in compliance with the License.  You may
+ obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied.  See the License for the specific language governing
+ permissions and limitations under the License.
+ ***************************************************************************/
+
+/***************************************************************************
+  Neon memset16: Fills a buffer with a 16-bit value, using Neon if possible,
+     Inputs:
+        dst   (r0): The buffer to write to
+        value (r1): The 16-bit value to write to the buffer
+        count (r2): The number of 16-bit items (not bytes) to write
+     Outputs:
+        r0: The original buffer pointer (saved on entry, restored on exit)
+***************************************************************************/
+
+        .code 32
+        .fpu neon
+        .align 4
+        .globl memset16_neon
+        .func memset16_neon
+
+memset16_neon:
+        cmp             r2, #0                  /* count <= 0: write nothing */
+        bxle            lr
+
+        /* r2 arrives as the number of 16-bit items to write; double it so
+         * that from here on it counts the bytes still to be written.
+         */
+        lsl             r2, r2, #1
+
+        push            {r0}                    /* restored on every exit */
+
+        /* 8 bytes or fewer: a plain halfword loop handles it */
+        cmp             r2, #8
+        bgt             memset_gt4
+memset_smallcopy_loop:
+        strh            r1, [r0], #2
+        subs            r2, r2, #2
+        bne             memset_smallcopy_loop
+memset_smallcopy_done:
+        pop             {r0}
+        bx              lr
+
+memset_gt4:
+        /*
+         * Replicate the low 16 bits of r1 into its high 16 bits so that
+         * word and Neon stores write two copies of the value at once.
+         * This relies on the caller passing r1 with bits 31:16 clear.
+         */
+        orr             r1, r1, r1, lsl #16
+        /*
+         * For 64 bytes or more, first advance to a 16-byte boundary so
+         * the large Neon stores below start on an aligned address.
+         */
+        cmp             r2, #64
+        blt             memset_route
+        ands            r12, r0, #0xf
+        beq             memset_route
+        /*
+         * r12 = bytes needed to reach the 16-byte boundary: an even value
+         * in 2..14, as a uint16_t buffer need only be halfword-aligned.
+         * Store the odd halfword first so the word stores stay aligned.
+         */
+        rsb             r12, r12, #16
+        sub             r2, r2, r12
+        tst             r12, #2                 /* 2 bytes to word boundary? */
+        strneh          r1, [r0], #2
+        lsls            r12, r12, #29           /* C = bit 3, N = bit 2 */
+        strmi           r1, [r0], #4            /* bit 2 set: 4 bytes */
+        strcs           r1, [r0], #4            /* bit 3 set: 8 bytes */
+        strcs           r1, [r0], #4
+memset_route:
+        /*
+         * Pick the largest store size the remaining byte count allows.
+         * d0, d1 (= q0) and q1 are filled with the pattern only as far as
+         * the chosen path needs them.
+         */
+        vdup.u32        d0, r1
+        cmp             r2, #16
+        blt             memset_8
+        vmov            d1, d0
+        cmp             r2, #64
+        blt             memset_16
+        vmov            q1, q0
+        cmp             r2, #128
+        blt             memset_32
+memset_128:
+        mov             r12, r2, lsr #7         /* r12 = 128-byte chunks */
+memset_128_loop:
+        vst1.64         {q0, q1}, [r0]!         /* 4 x 32 bytes per pass */
+        vst1.64         {q0, q1}, [r0]!
+        vst1.64         {q0, q1}, [r0]!
+        vst1.64         {q0, q1}, [r0]!
+        subs            r12, r12, #1
+        bne             memset_128_loop
+        ands            r2, r2, #0x7f           /* bytes left after chunks */
+        beq             memset_end
+memset_32:
+        movs            r12, r2, lsr #5         /* r12 = 32-byte chunks */
+        beq             memset_16
+memset_32_loop:
+        subs            r12, r12, #1
+        vst1.64         {q0, q1}, [r0]!
+        bne             memset_32_loop
+        ands            r2, r2, #0x1f
+        beq             memset_end
+memset_16:
+        movs            r12, r2, lsr #4         /* r12 = 16-byte chunks */
+        beq             memset_8
+memset_16_loop:
+        subs            r12, r12, #1
+        vst1.32         {q0}, [r0]!
+        bne             memset_16_loop
+        ands            r2, r2, #0xf
+        beq             memset_end
+        /*
+         * Fewer than 16 bytes remain at this point.  memset_8, memset_4
+         * and memset_2 are not loops: each writes its 8, 4 or 2 bytes at
+         * most once, since anything of 16 bytes or more is handled by
+         * the loops above.
+         */
+memset_8:
+        cmp             r2, #8
+        blt             memset_4
+        subs            r2, r2, #8
+        vst1.32         {d0}, [r0]!
+memset_4:
+        cmp             r2, #4
+        blt             memset_2
+        subs            r2, r2, #4
+        str             r1, [r0], #4
+memset_2:
+        cmp             r2, #0
+        ble             memset_end
+        strh            r1, [r0], #2
+memset_end:
+        pop             {r0}
+        bx              lr
+
+        .endfunc
+        .end
diff --git a/src/opts/memset32_neon.S b/src/opts/memset32_neon.S
new file mode 100644
index 0000000..9052c4f
--- /dev/null
+++ b/src/opts/memset32_neon.S
@@ -0,0 +1,122 @@
+/***************************************************************************
+ Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License"); you
+ may not use this file except in compliance with the License.  You may
+ obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied.  See the License for the specific language governing
+ permissions and limitations under the License.
+ ***************************************************************************/
+
+	.code 32
+	.fpu neon
+	.align 4
+	.globl	memset32_neon
+	.func	memset32_neon
+
+	/* r0 = buffer, r1 = 32-bit value, r2 = number of words to write */
+memset32_neon:
+	cmp		r2, #1			/* single word: store and return */
+	streq		r1, [r0], #4
+	bxeq		lr
+
+	cmp		r2, #4
+	bgt		memset32_neon_start
+	cmp		r2, #0
+	bxle		lr			/* zero or negative count: write nothing */
+memset32_neon_small:				/* 2..4 words: one str per pass */
+	str		r1, [r0], #4
+	subs		r2, r2, #1
+	bne		memset32_neon_small
+	bx		lr
+memset32_neon_start:
+	cmp		r2, #16
+	blt		memset32_dropthru	/* 5..15 words: no Neon needed */
+	vdup.32		q0, r1			/* q0 = q1 = value in every lane */
+	vmov		q1, q0
+	cmp		r2, #32
+	blt		memset32_16
+	cmp		r2, #64
+	blt		memset32_32
+	cmp		r2, #128
+	blt		memset32_64
+memset32_128:
+	movs		r12, r2, lsr #7		/* r12 = number of 128-word chunks */
+memset32_loop128:
+	subs		r12, r12, #1		/* vst1 leaves these flags intact */
+	vst1.32		{q0, q1}, [r0]!		/* 16 x 8 words = 128 words */
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q0, q1}, [r0]!
+	bne		memset32_loop128
+	ands		r2, r2, #0x7f		/* words left after the chunks */
+	bxeq		lr
+memset32_64:
+	movs		r12, r2, lsr #6		/* at least 64 words left? */
+	beq		memset32_32
+	vst1.32		{q0, q1}, [r0]!		/* 8 x 8 words = 64 words */
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q0, q1}, [r0]!
+	ands		r2, r2, #0x3f
+	bxeq		lr
+memset32_32:
+	movs		r12, r2, lsr #5		/* at least 32 words left? */
+	beq		memset32_16
+	vst1.32		{q0, q1}, [r0]!		/* 4 x 8 words = 32 words */
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q0, q1}, [r0]!
+	vst1.32		{q0, q1}, [r0]!
+	ands		r2, r2, #0x1f
+	bxeq		lr
+memset32_16:
+	movs		r12, r2, lsr #4		/* at least 16 words left? */
+	beq		memset32_dropthru
+	and		r2, r2, #0xf
+	vst1.32		{q0, q1}, [r0]!		/* 2 x 8 words = 16 words */
+	vst1.32		{q0, q1}, [r0]!
+memset32_dropthru:
+	rsb		r2, r2, #15		/* r2 (0..15 words) -> stores to skip */
+	add		pc, pc, r2, lsl #2	/* pc reads as the first str below */
+	nop
+	str		r1, [r0, #56]
+	str		r1, [r0, #52]
+	str		r1, [r0, #48]
+	str		r1, [r0, #44]
+	str		r1, [r0, #40]
+	str		r1, [r0, #36]
+	str		r1, [r0, #32]
+	str		r1, [r0, #28]
+	str		r1, [r0, #24]
+	str		r1, [r0, #20]
+	str		r1, [r0, #16]
+	str		r1, [r0, #12]
+	str		r1, [r0, #8]
+	str		r1, [r0, #4]
+	str		r1, [r0, #0]
+	bx		lr
+
+	.endfunc
+	.end
diff --git a/src/opts/opts_check_arm_neon.cpp b/src/opts/opts_check_arm_neon.cpp
new file mode 100644
index 0000000..8f18df2
--- /dev/null
+++ b/src/opts/opts_check_arm_neon.cpp
@@ -0,0 +1,45 @@
+/***************************************************************************
+ Copyright (c) 2010, Code Aurora Forum. All rights reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License"); you
+ may not use this file except in compliance with the License.  You may
+ obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied.  See the License for the specific language governing
+ permissions and limitations under the License.
+ ***************************************************************************/
+
+
+#include "SkUtils.h"
+
+extern "C" void memset16_neon(uint16_t dst[], uint16_t value, int count);
+extern "C" void memset32_neon(uint32_t dst[], uint32_t value, int count);
+
+static inline bool hasNeonRegisters() {  // decided purely at compile time
+#if !defined(__ARM_HAVE_NEON) || !defined(SK_CPU_LENDIAN)
+    return false;
+#else
+    return true;
+#endif
+}
+
+SkMemset16Proc SkMemset16GetPlatformProc() {
+    // Hands back memset16_neon when hasNeonRegisters() holds, NULL if not.
+    if (!hasNeonRegisters()) {
+        return NULL;
+    }
+    return memset16_neon;
+}
+
+SkMemset32Proc SkMemset32GetPlatformProc() {
+    // Hands back memset32_neon when hasNeonRegisters() holds, NULL if not.
+    if (!hasNeonRegisters()) {
+        return NULL;
+    }
+    return memset32_neon;
+}