blob: b39832fff163627a532ef34209fb9309717c2e06 [file] [log] [blame]
/***************************************************************************
* Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
*
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
***************************************************************************/
/***************************************************************************
Neon memset16: Fills a buffer with a 16-bit value, using Neon registers
for the bulk of the stores when the buffer is large enough.
Inputs:
s: The buffer to write to (passed in r0)
c: The 16-bit data to write to the buffer (passed in r1)
n: The size_t count of 16-bit items to write (passed in r2)
Outputs:
r0: The original buffer pointer s
***************************************************************************/
/*
 * memset16_neon(s, c, n)
 *
 * Syntax: GNU as, unified syntax, ARM (A32) state, NEON enabled.
 *
 * In:    r0 = s, buffer to fill
 *        r1 = c, fill value; only bits 15:0 are used
 *        r2 = n, number of 16-bit items to write
 * Out:   r0 = s (the original pointer)
 * Uses:  r1, r2, r3, r12, q0, q1, flags.  No stack is used.
 *
 * NOTE(review): s is assumed to be at least 2-byte aligned, as a buffer of
 * 16-bit items would be.  Apart from the 16-byte alignment prologue, the
 * word and NEON stores below are issued at whatever alignment s has, so
 * the routine relies on unaligned stores being permitted -- confirm for
 * the target configuration.
 */
        .syntax unified
        .code   32
        .fpu    neon
        .text
        .align  4
        .globl  memset16_neon
        .hidden memset16_neon
        /* Mark the symbol as a function so the linker can interwork calls. */
        .type   memset16_neon, %function
memset16_neon:
        cmp     r2, #0
        bxeq    lr                      /* n == 0: nothing to write, r0 still s */
        /*
         * Keep in mind that r2 -- the count argument -- is the number of
         * 16-bit items to write.  From here on r2 is the byte count.
         */
        lsl     r2, r2, #1
        mov     r3, r0                  /* r3 = s, restored into r0 on exit */
        /* If we have <= 8 bytes, just do a quick halfword loop. */
        cmp     r2, #8
        bhi     memset_gt4              /* byte counts are unsigned */
memset_smallcopy_loop:
        strh    r1, [r0], #2            /* strh stores bits 15:0 of r1 only */
        subs    r2, r2, #2
        bne     memset_smallcopy_loop
memset_smallcopy_done:
        mov     r0, r3
        bx      lr
memset_gt4:
        /*
         * Build a register holding two copies of the low 16 bits of r1.
         * Shifting left first discards whatever was in bits 31:16, so the
         * result does not depend on the caller having zero-extended c.
         */
        mov     r1, r1, lsl #16
        orr     r1, r1, r1, lsr #16
        /*
         * If we're writing >= 64 bytes, get onto a 16-byte boundary
         * first to improve speed even more.
         */
        cmp     r2, #64
        blo     memset_route
        ands    r12, r0, #0xf
        beq     memset_route            /* already on a 16-byte boundary */
        /*
         * r12 = number of bytes to the next 16-byte boundary.  For a
         * halfword-aligned pointer this is an even value in 2..14.
         * Store the odd halfword (bit 1) first so that the word stores
         * that follow land on word-aligned addresses.
         */
        rsb     r12, r12, #16
        sub     r2, r2, r12
        tst     r12, #2
        strhne  r1, [r0], #2
        lsls    r12, r12, #29           /* N = bit 2 (4 bytes), C = bit 3 (8 bytes) */
        strmi   r1, [r0], #4
        strcs   r1, [r0], #4
        strcs   r1, [r0], #4
memset_route:
        /*
         * Decide where to route for the maximum store sizes.  Note that we
         * build d1 and q1 only if the chosen path needs them, so that is
         * interwoven here as well.
         */
        vdup.32 d0, r1                  /* d0 = four copies of the halfword */
        cmp     r2, #16
        blo     memset_8
        vmov    d1, d0                  /* q0 complete */
        cmp     r2, #64
        blo     memset_16
        vmov    q1, q0                  /* q0:q1 = 32 bytes of fill data */
        cmp     r2, #128
        blo     memset_32
memset_128:
        mov     r12, r2, lsr #7         /* r12 = number of 128-byte chunks, >= 1 here */
memset_128_loop:
        vst1.64 {q0, q1}, [r0]!
        vst1.64 {q0, q1}, [r0]!
        vst1.64 {q0, q1}, [r0]!
        vst1.64 {q0, q1}, [r0]!
        subs    r12, r12, #1
        bne     memset_128_loop
        ands    r2, r2, #0x7f           /* bytes left after the 128-byte chunks */
        beq     memset_end
memset_32:
        movs    r12, r2, lsr #5         /* r12 = number of 32-byte chunks */
        beq     memset_16
memset_32_loop:
        subs    r12, r12, #1
        vst1.64 {q0, q1}, [r0]!         /* the vector store leaves the flags alone */
        bne     memset_32_loop
        ands    r2, r2, #0x1f
        beq     memset_end
memset_16:
        movs    r12, r2, lsr #4         /* r12 = number of 16-byte chunks */
        beq     memset_8
memset_16_loop:
        subs    r12, r12, #1
        vst1.32 {q0}, [r0]!
        bne     memset_16_loop
        ands    r2, r2, #0xf
        beq     memset_end
        /*
         * memset_8 isn't a loop, since we do our loops at 16 bytes and
         * above.  We loop there, then drop down here to finish the
         * < 16-byte remainder.  Same for memset_4 and memset_2.
         */
memset_8:
        cmp     r2, #8
        blo     memset_4
        subs    r2, r2, #8
        vst1.32 {d0}, [r0]!
memset_4:
        cmp     r2, #4
        blo     memset_2
        subs    r2, r2, #4
        str     r1, [r0], #4
memset_2:
        cmp     r2, #0                  /* the byte count is even, so r2 is 0 or 2 here */
        beq     memset_end
        strh    r1, [r0], #2
memset_end:
        mov     r0, r3
        bx      lr
        .size   memset16_neon, . - memset16_neon
        .end