| /*************************************************************************** |
| * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved. |
| * |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE file. |
| ***************************************************************************/ |
| |
| /*************************************************************************** |
| Neon memset16: Fills a buffer of 16-bit items with a value, using NEON |
| registers for the bulk of the work when the buffer is large enough. |
| Inputs: |
| s: The buffer to write to |
| c: The 16-bit value to write to each item of the buffer |
| n: The number of 16-bit items (not bytes) to write. |
| Outputs: |
| Returns s in r0. |
| ***************************************************************************/ |
| |
| .code 32 /* generate 32-bit ARM instructions rather than Thumb */ |
| .fpu neon /* allow NEON instructions in this file */ |
| .align 4 /* on ARM targets gas treats the operand as a power of two: 16-byte alignment */ |
| .globl memset16_neon /* make the entry point visible to the linker */ |
| .func /* debug-info function marker; NOTE(review): no function name is given here - confirm the assembler in use accepts that */ |
| |
| memset16_neon: |
| /* |
| * memset16_neon(s, c, n): write the low 16 bits of c to n consecutive |
| * 16-bit items starting at s. |
| * |
| * In: r0 = s, destination (expected to be halfword-aligned) |
| * r1 = c, only bits [15:0] are used |
| * r2 = n, number of 16-bit items; the byte count n * 2 must fit in |
| * 32 bits |
| * Out: r0 = s (never modified) |
| * Uses: r1, r2, r3 (write cursor), r12, q0, q1, flags; no stack |
| * |
| * No store needs more than halfword alignment until the cursor has been |
| * explicitly moved onto a 16-byte boundary. |
| */ |
| cmp r2, #0 |
| bxeq lr |
| |
| /* |
| * r2 arrives as a count of 16-bit items. From here on it holds the |
| * number of BYTES still to write, which is always even. |
| */ |
| lsl r2, r2, #1 |
| |
| /* Walk the buffer with r3 so that r0 still holds s when we return. */ |
| mov r3, r0 |
| |
| /* 8 bytes or fewer: a plain halfword loop is the cheapest option. */ |
| cmp r2, #8 |
| bhi memset_gt4 |
| memset_smallcopy_loop: |
| strh r1, [r3], #2 |
| subs r2, r2, #2 |
| bne memset_smallcopy_loop |
| memset_smallcopy_done: |
| bx lr |
| |
| memset_gt4: |
| /* |
| * Build a word that holds the 16-bit value twice. Shifting left first |
| * discards whatever the caller left in bits [31:16], so a sign-extended |
| * or otherwise dirty upper half cannot leak into the pattern. |
| */ |
| mov r1, r1, lsl #16 |
| orr r1, r1, r1, lsr #16 |
| /* |
| * With 64 bytes or more to write it is worth moving the cursor onto a |
| * 16-byte boundary before the NEON loops. |
| */ |
| cmp r2, #64 |
| blo memset_route |
| ands r12, r3, #0xf |
| beq memset_route |
| /* |
| * r12 = number of bytes up to the next 16-byte boundary (2, 4, ..., 14 |
| * for a halfword-aligned cursor). Write them smallest piece first: the |
| * halfword brings the cursor to a 4-byte boundary, the word to an 8-byte |
| * boundary and the pair of words to the 16-byte boundary, so each of |
| * these stores is naturally aligned. |
| */ |
| rsb r12, r12, #16 |
| sub r2, r2, r12 |
| tst r12, #2 |
| strneh r1, [r3], #2 |
| tst r12, #4 |
| strne r1, [r3], #4 |
| tst r12, #8 |
| strne r1, [r3], #4 |
| strne r1, [r3], #4 |
| memset_route: |
| /* |
| * Pick the largest store size that fits. d0, d1 and q1 are filled with |
| * the pattern only on the paths that go on to use them. |
| */ |
| vdup.u32 d0, r1 |
| cmp r2, #16 |
| blo memset_8 |
| vmov d1, d0 |
| cmp r2, #64 |
| blo memset_16 |
| vmov q1, q0 |
| cmp r2, #128 |
| blo memset_32 |
| memset_128: |
| /* r12 = number of whole 128-byte chunks (at least 1 here). */ |
| mov r12, r2, lsr #7 |
| memset_128_loop: |
| vst1.64 {q0, q1}, [r3]! |
| vst1.64 {q0, q1}, [r3]! |
| vst1.64 {q0, q1}, [r3]! |
| vst1.64 {q0, q1}, [r3]! |
| subs r12, r12, #1 |
| bne memset_128_loop |
| ands r2, r2, #0x7f |
| beq memset_end |
| memset_32: |
| /* r12 = number of whole 32-byte chunks; may be 0 when falling through. */ |
| movs r12, r2, lsr #5 |
| beq memset_16 |
| memset_32_loop: |
| subs r12, r12, #1 |
| vst1.64 {q0, q1}, [r3]! |
| bne memset_32_loop |
| ands r2, r2, #0x1f |
| beq memset_end |
| memset_16: |
| /* |
| * This point is also reached directly for 16..63 bytes, where the cursor |
| * was never aligned, so use 16-bit elements: the bytes written are the |
| * same but only halfword alignment is needed. |
| */ |
| movs r12, r2, lsr #4 |
| beq memset_8 |
| memset_16_loop: |
| subs r12, r12, #1 |
| vst1.16 {q0}, [r3]! |
| bne memset_16_loop |
| ands r2, r2, #0xf |
| beq memset_end |
| /* |
| * memset_8, memset_4 and memset_2 are not loops: all looping is done at |
| * 16 bytes and above, and fewer than 16 bytes remain once we get here. |
| * Each step writes its piece only if at least that many bytes are left. |
| */ |
| memset_8: |
| cmp r2, #8 |
| blo memset_4 |
| sub r2, r2, #8 |
| vst1.16 {d0}, [r3]! |
| memset_4: |
| cmp r2, #4 |
| blo memset_2 |
| sub r2, r2, #4 |
| /* Two halfword stores rather than one word store: the cursor may only be halfword-aligned. */ |
| strh r1, [r3], #2 |
| strh r1, [r3], #2 |
| memset_2: |
| cmp r2, #0 |
| beq memset_end |
| strh r1, [r3], #2 |
| memset_end: |
| bx lr |
| |
| .endfunc /* closes the region opened by the .func directive above */ |
| .end /* end of the assembly source; the assembler ignores anything after this */ |