Add Neon versions of memset32 and memset16
Patch by pgalizia (of codeaurora.org)
(Note: I don't read ARM and I didn't manage to find a reviewer for the
ARM assembly code so this is landing somewhat unreviewed.)
http://codereview.appspot.com/1157045/show
git-svn-id: http://skia.googlecode.com/svn/trunk@573 2bbb7eff-a529-9590-31e7-b0007b416f81
diff --git a/src/opts/memset16_neon.S b/src/opts/memset16_neon.S
new file mode 100644
index 0000000..b47cc22
--- /dev/null
+++ b/src/opts/memset16_neon.S
@@ -0,0 +1,152 @@
+/***************************************************************************
+ Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License"); you
+ may not use this file except in compliance with the License. You may
+ obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied. See the License for the specific language governing
+ permissions and limitations under the License.
+ ***************************************************************************/
+
+/***************************************************************************
+ Neon memset16: Fills a buffer with a 16-bit value, using Neon stores when the size allows.
+ Inputs:
+ s: The buffer to write to
+ c: The 16-bit data to write to the buffer
+ n: The count of 16-bit items to write
+ Outputs:
+ None (r0 is restored to s on return)
+***************************************************************************/
+
+ .code 32  /* assemble as 32-bit ARM instructions */
+ .fpu neon  /* enable the Neon mnemonics used below (vdup, vmov, vst1) */
+ .align 4  /* NOTE(review): on ARM gas .align takes a power of two, i.e. 16 bytes -- confirm */
+ .globl memset16_neon  /* export the entry point */
+ .func  /* paired with .endfunc after the routine */
+
+memset16_neon:  /* r0 = buffer, r1 = 16-bit value, r2 = count of 16-bit items */
+ cmp r2, #0
+ bxeq lr  /* zero items: return without touching the buffer */
+
+ /* Keep in mind that r2 -- the count argument -- is for the
+ * number of 16-bit items to copy.
+ */
+ lsl r2, r2, #1  /* from here on r2 is a byte count */
+
+ push {r0}  /* save the buffer pointer; both exits pop it back into r0 */
+
+ /* If we have <= 8 bytes, just do a quick loop to handle that */
+ cmp r2, #8
+ bgt memset_gt4  /* more than 8 bytes (4 items): use the wider paths below */
+memset_smallcopy_loop:  /* for a positive count r2 is an even byte count in 2..8 here */
+ strh r1, [r0], #2  /* store one 16-bit item and advance r0 */
+ subs r2, r2, #2
+ bne memset_smallcopy_loop
+memset_smallcopy_done:
+ pop {r0}  /* restore the original buffer pointer */
+ bx lr
+
+memset_gt4:
+ /*
+ * Duplicate the r1 lowest 16-bits across r1. The idea is to have
+ * a register with two 16-bit-values we can copy. We do this by
+ * duplicating lowest 16-bits of r1 to upper 16-bits.
+ */
+ orr r1, r1, r1, lsl #16  /* NOTE(review): assumes the upper 16 bits of r1 are zero on entry -- confirm */
+ /*
+ * If we're copying >= 64 bytes, then we may want to get
+ * onto a 16-byte boundary to improve speed even more.
+ */
+ cmp r2, #64
+ blt memset_route  /* fewer than 64 bytes: skip the alignment step */
+ ands r12, r0, #0xf  /* r12 = offset of r0 within its 16-byte line */
+ beq memset_route  /* already on a 16-byte boundary */
+ /*
+ * Determine the number of bytes to move forward to get to the 16-byte
+ * boundary. The conditional stores below write the 4-, 8- and 2-byte
+ * parts of that distance (bits 2, 3 and 1 of r12 respectively).
+ */
+ rsb r12, r12, #16  /* r12 = 16 - offset = bytes up to the boundary */
+ sub r2, r2, r12  /* those bytes are written by the stores below */
+ lsls r12, r12, #29  /* N = bit 2 of the distance, C = bit 3 */
+ strmi r1, [r0], #4  /* bit 2 set: write 4 bytes */
+ strcs r1, [r0], #4  /* bit 3 set: write 8 bytes as two words */
+ strcs r1, [r0], #4
+ lsls r12, r12, #2  /* C = bit 1 of the distance */
+ strcsh r1, [r0], #2  /* bit 1 set: write one more halfword */
+memset_route:
+ /*
+ * Decide where to route for the maximum copy sizes. Note that we
+ * build q0 and q1 depending on if we'll need it, so that's
+ * interwoven here as well.
+ */
+ vdup.u32 d0, r1  /* d0 = the 32-bit pattern twice (four 16-bit items) */
+ cmp r2, #16
+ blt memset_8  /* under 16 bytes: only d0 is needed */
+ vmov d1, d0  /* q0 (d0:d1) now holds 16 bytes of pattern */
+ cmp r2, #64
+ blt memset_16  /* under 64 bytes: q1 is not built, so use the q0-only loop */
+ vmov q1, q0  /* q0:q1 now hold 32 bytes of pattern */
+ cmp r2, #128
+ blt memset_32
+memset_128:
+ mov r12, r2, lsr #7  /* r12 = number of 128-byte chunks; non-zero since r2 >= 128 here */
+memset_128_loop:
+ vst1.64 {q0, q1}, [r0]!  /* each of these four stores writes 32 bytes and advances r0 */
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ subs r12, r12, #1
+ bne memset_128_loop
+ ands r2, r2, #0x7f  /* bytes left over after the 128-byte chunks */
+ beq memset_end
+memset_32:
+ movs r12, r2, lsr #5  /* r12 = number of 32-byte chunks */
+ beq memset_16  /* none: fall to the 16-byte stage */
+memset_32_loop:
+ subs r12, r12, #1  /* flags set here are tested by the bne after the store */
+ vst1.64 {q0, q1}, [r0]!
+ bne memset_32_loop
+ ands r2, r2, #0x1f  /* bytes left over after the 32-byte chunks */
+ beq memset_end
+memset_16:
+ movs r12, r2, lsr #4  /* r12 = number of 16-byte chunks */
+ beq memset_8  /* none: go finish the tail */
+memset_16_loop:
+ subs r12, r12, #1  /* flags set here are tested by the bne after the store */
+ vst1.32 {q0}, [r0]!
+ bne memset_16_loop
+ ands r2, r2, #0xf  /* bytes left over after the 16-byte chunks */
+ beq memset_end
+ /*
+ * memset_8 isn't a loop, since we try to do our loops at 16
+ * bytes and above. We should loop there, then drop down here
+ * to finish the <16-byte versions. Same for memset_4 and
+ * memset_2.
+ */
+memset_8:
+ cmp r2, #8
+ blt memset_4
+ subs r2, r2, #8
+ vst1.32 {d0}, [r0]!  /* write 8 bytes from d0 */
+memset_4:
+ cmp r2, #4
+ blt memset_2
+ subs r2, r2, #4
+ str r1, [r0], #4  /* write 4 bytes (two items) from r1 */
+memset_2:
+ cmp r2, #0
+ ble memset_end
+ strh r1, [r0], #2  /* write the final 16-bit item */
+memset_end:
+ pop {r0}  /* restore the original buffer pointer */
+ bx lr
+
+ .endfunc
+ .end
diff --git a/src/opts/memset32_neon.S b/src/opts/memset32_neon.S
new file mode 100644
index 0000000..9052c4f
--- /dev/null
+++ b/src/opts/memset32_neon.S
@@ -0,0 +1,122 @@
+/***************************************************************************
+ Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License"); you
+ may not use this file except in compliance with the License. You may
+ obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied. See the License for the specific language governing
+ permissions and limitations under the License.
+ ***************************************************************************/
+
+ .code 32  /* assemble as 32-bit ARM instructions; the computed jump below relies on this */
+ .fpu neon  /* enable the Neon mnemonics used below (vdup, vmov, vst1) */
+ .align 4  /* NOTE(review): on ARM gas .align takes a power of two, i.e. 16 bytes -- confirm */
+ .globl memset32_neon  /* export the entry point */
+ .func  /* paired with .endfunc after the routine */
+
+ /* r0 = buffer, r1 = value, r2 = times to write (count of 32-bit words) */
+memset32_neon:
+ cmp r2, #1
+ streq r1, [r0], #4  /* exactly one word: store it and return */
+ bxeq lr
+
+ cmp r2, #4
+ bgt memset32_neon_start  /* more than 4 words: use the larger paths */
+ cmp r2, #0
+ bxeq lr  /* zero words: nothing to do */
+memset32_neon_small:  /* for a non-negative count, 2..4 words remain here */
+ str r1, [r0], #4
+ subs r2, r2, #1
+ bne memset32_neon_small
+ bx lr
+memset32_neon_start:
+ cmp r2, #16
+ blt memset32_dropthru  /* 5..15 words: go straight to the store ladder */
+ vdup.32 q0, r1  /* q0 = four copies of the value */
+ vmov q1, q0  /* q0:q1 = eight copies (32 bytes) */
+ cmp r2, #32
+ blt memset32_16
+ cmp r2, #64
+ blt memset32_32
+ cmp r2, #128
+ blt memset32_64
+memset32_128:
+ movs r12, r2, lsr #7  /* r12 = number of 128-word chunks; non-zero since r2 >= 128 here */
+memset32_loop128:
+ subs r12, r12, #1  /* flags set here are tested by the bne after the stores */
+ vst1.64 {q0, q1}, [r0]!  /* 16 stores of 32 bytes each = 128 words per iteration */
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ bne memset32_loop128
+ ands r2, r2, #0x7f  /* words left over after the 128-word chunks */
+ bxeq lr
+memset32_64:
+ movs r12, r2, lsr #6  /* r12 is only tested for zero: is there a 64-word chunk? */
+ beq memset32_32
+ vst1.64 {q0, q1}, [r0]!  /* 8 stores of 32 bytes each = 64 words */
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ ands r2, r2, #0x3f  /* words left over after the 64-word chunk */
+ bxeq lr
+memset32_32:
+ movs r12, r2, lsr #5  /* r12 is only tested for zero: is there a 32-word chunk? */
+ beq memset32_16
+ vst1.64 {q0, q1}, [r0]!  /* 4 stores of 32 bytes each = 32 words */
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ ands r2, r2, #0x1f  /* words left over after the 32-word chunk */
+ bxeq lr
+memset32_16:
+ movs r12, r2, lsr #4  /* r12 is only tested for zero: is there a 16-word chunk? */
+ beq memset32_dropthru
+ and r2, r2, #0xf  /* words left over once the 16-word chunk below is written */
+ vst1.64 {q0, q1}, [r0]!  /* 2 stores of 32 bytes each = 16 words */
+ vst1.64 {q0, q1}, [r0]!
+memset32_dropthru:  /* r2 = 0..15 words still to write on every path that reaches here */
+ rsb r2, r2, #15  /* r2 = 15 - remaining = number of str instructions to skip */
+ add pc, pc, r2, lsl #2  /* computed jump: pc reads as this instruction + 8, 4 bytes per skipped str */
+ nop  /* filler so that pc + 8 is the first str of the ladder */
+ str r1, [r0, #56]  /* ladder entry used when 15 words remain */
+ str r1, [r0, #52]
+ str r1, [r0, #48]
+ str r1, [r0, #44]
+ str r1, [r0, #40]
+ str r1, [r0, #36]
+ str r1, [r0, #32]
+ str r1, [r0, #28]
+ str r1, [r0, #24]
+ str r1, [r0, #20]
+ str r1, [r0, #16]
+ str r1, [r0, #12]
+ str r1, [r0, #8]
+ str r1, [r0, #4]
+ str r1, [r0, #0]  /* ladder entry used when 1 word remains */
+ bx lr  /* jump target when 0 words remain */
+
+ .endfunc
+ .end
diff --git a/src/opts/opts_check_arm_neon.cpp b/src/opts/opts_check_arm_neon.cpp
new file mode 100644
index 0000000..8f18df2
--- /dev/null
+++ b/src/opts/opts_check_arm_neon.cpp
@@ -0,0 +1,45 @@
+/***************************************************************************
+ Copyright (c) 2010, Code Aurora Forum. All rights reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License"); you
+ may not use this file except in compliance with the License. You may
+ obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied. See the License for the specific language governing
+ permissions and limitations under the License.
+ ***************************************************************************/
+
+
+#include "SkUtils.h"
+
+extern "C" void memset16_neon(uint16_t dst[], uint16_t value, int count);
+extern "C" void memset32_neon(uint32_t dst[], uint32_t value, int count);
+
+static inline bool hasNeonRegisters() {
+#if !defined(__ARM_HAVE_NEON) || !defined(SK_CPU_LENDIAN)
+    return false;  // at least one of the two build-time macros is missing
+#else
+    return true;   // both __ARM_HAVE_NEON and SK_CPU_LENDIAN are defined
+#endif
+}
+
+SkMemset16Proc SkMemset16GetPlatformProc() {
+    // Hand back the Neon routine only when hasNeonRegisters() allows it.
+    if (!hasNeonRegisters()) {
+        return NULL;
+    }
+    return memset16_neon;
+}
+
+SkMemset32Proc SkMemset32GetPlatformProc() {
+    // Hand back the Neon routine only when hasNeonRegisters() allows it.
+    if (!hasNeonRegisters()) {
+        return NULL;
+    }
+    return memset32_neon;
+}