blob: b1719fa1fa4de15ca31ae98138e0243fcb18f5f2 [file] [log] [blame]
/***************************************************************************
 * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 ***************************************************************************/

/***************************************************************************
 Neon memset16: Fills a buffer of 16-bit items with a 16-bit value, using
 Neon registers for everything except very short fills.
 Inputs:
        s (r0): The buffer to write to
        c (r1): The 16-bit data to write to the buffer
        n (r2): The size_t count, in 16-bit items (not bytes)
 Outputs:
        None. r0 still holds s when the function returns.
***************************************************************************/

18 .code 32
19 .fpu neon
20 .align 4
21 .globl memset16_neon
22 .func
23
24memset16_neon:
25 cmp r2, #0
26 bxeq lr
27
28 /* Keep in mind that r2 -- the count argument -- is for the
29 * number of 16-bit items to copy.
30 */
31 lsl r2, r2, #1
32
33 push {r0}
34
35 /* If we have < 8 bytes, just do a quick loop to handle that */
36 cmp r2, #8
37 bgt memset_gt4
38memset_smallcopy_loop:
39 strh r1, [r0], #2
40 subs r2, r2, #2
41 bne memset_smallcopy_loop
42memset_smallcopy_done:
43 pop {r0}
44 bx lr
45
46memset_gt4:
47 /*
48 * Duplicate the r1 lowest 16-bits across r1. The idea is to have
49 * a register with two 16-bit-values we can copy. We do this by
50 * duplicating lowest 16-bits of r1 to upper 16-bits.
51 */
52 orr r1, r1, r1, lsl #16
53 /*
54 * If we're copying > 64 bytes, then we may want to get
55 * onto a 16-byte boundary to improve speed even more.
56 */
57 cmp r2, #64
58 blt memset_route
59 ands r12, r0, #0xf
60 beq memset_route
61 /*
62 * Determine the number of bytes to move forward to get to the 16-byte
63 * boundary. Note that this will be a multiple of 4, since we
64 * already are word-aligned.
65 */
66 rsb r12, r12, #16
67 sub r2, r2, r12
68 lsls r12, r12, #29
69 strmi r1, [r0], #4
70 strcs r1, [r0], #4
71 strcs r1, [r0], #4
72 lsls r12, r12, #2
73 strcsh r1, [r0], #2
74memset_route:
75 /*
76 * Decide where to route for the maximum copy sizes. Note that we
77 * build q0 and q1 depending on if we'll need it, so that's
78 * interwoven here as well.
79 */
80 vdup.u32 d0, r1
81 cmp r2, #16
82 blt memset_8
83 vmov d1, d0
84 cmp r2, #64
85 blt memset_16
86 vmov q1, q0
87 cmp r2, #128
88 blt memset_32
89memset_128:
90 mov r12, r2, lsr #7
91memset_128_loop:
92 vst1.64 {q0, q1}, [r0]!
93 vst1.64 {q0, q1}, [r0]!
94 vst1.64 {q0, q1}, [r0]!
95 vst1.64 {q0, q1}, [r0]!
96 subs r12, r12, #1
97 bne memset_128_loop
98 ands r2, r2, #0x7f
99 beq memset_end
100memset_32:
101 movs r12, r2, lsr #5
102 beq memset_16
103memset_32_loop:
104 subs r12, r12, #1
105 vst1.64 {q0, q1}, [r0]!
106 bne memset_32_loop
107 ands r2, r2, #0x1f
108 beq memset_end
109memset_16:
110 movs r12, r2, lsr #4
111 beq memset_8
112memset_16_loop:
113 subs r12, r12, #1
114 vst1.32 {q0}, [r0]!
115 bne memset_16_loop
116 ands r2, r2, #0xf
117 beq memset_end
118 /*
119 * memset_8 isn't a loop, since we try to do our loops at 16
120 * bytes and above. We should loop there, then drop down here
121 * to finish the <16-byte versions. Same for memset_4 and
122 * memset_1.
123 */
124memset_8:
125 cmp r2, #8
126 blt memset_4
127 subs r2, r2, #8
128 vst1.32 {d0}, [r0]!
129memset_4:
130 cmp r2, #4
131 blt memset_2
132 subs r2, r2, #4
133 str r1, [r0], #4
134memset_2:
135 cmp r2, #0
136 ble memset_end
137 strh r1, [r0], #2
138memset_end:
139 pop {r0}
140 bx lr
141
142 .endfunc
143 .end