/***************************************************************************
 Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License"); you
 may not use this file except in compliance with the License. You may
 obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 implied. See the License for the specific language governing
 permissions and limitations under the License.
 ***************************************************************************/
16
/***************************************************************************
 Neon memset16: fills a buffer with a repeated 16-bit value, using Neon
 registers for the bulk of the work when the buffer is large enough.

 Target: ARM (A32) state, GNU assembler, pre-UAL conditional mnemonics
 (e.g. "strneh"), matching the dialect this file was written in.

 Inputs:
    r0 (s): The buffer to write to. Assumed to be at least 2-byte
            aligned, as a pointer to 16-bit items would be.
    r1 (c): The 16-bit value to write. Only the low 16 bits are used.
    r2 (n): The number of 16-bit items (not bytes) to write.
 Outputs:
    r0:     Unchanged; the buffer pointer is saved on entry and
            restored before returning.
 Clobbers:
    r1, r2, r12, q0, q1 and the condition flags.
 Stack:
    One word (the saved r0) while the fill is in progress.
***************************************************************************/

    .code 32
    .fpu neon
    .align 4
    .globl memset16_neon
    .func

memset16_neon:
    cmp         r2, #0
    bxeq        lr                      /* n == 0: nothing to write */

    /* Keep in mind that r2 -- the count argument -- is for the
     * number of 16-bit items to copy. From here on r2 holds the
     * number of BYTES still to be written (always even).
     */
    lsl         r2, r2, #1

    push        {r0}                    /* r0 is used as the write cursor */

    /* If we have <= 8 bytes, just do a quick halfword loop.
     * Sizes are compared as unsigned quantities throughout.
     */
    cmp         r2, #8
    bhi         memset_gt4
memset_smallcopy_loop:
    strh        r1, [r0], #2
    subs        r2, r2, #2
    bne         memset_smallcopy_loop
memset_smallcopy_done:
    pop         {r0}
    bx          lr

memset_gt4:
    /*
     * Replicate the lowest 16 bits of r1 into both halves of r1, so
     * that a single word store writes two items. Shifting left first
     * discards whatever was in the upper 16 bits, so the result only
     * depends on the low halfword -- the same bits the strh-based
     * small path above writes.
     */
    lsl         r1, r1, #16
    orr         r1, r1, r1, lsr #16
    /*
     * If we're copying >= 64 bytes, then we may want to get
     * onto a 16-byte boundary to improve speed even more.
     */
    cmp         r2, #64
    blo         memset_route
    ands        r12, r0, #0xf
    beq         memset_route
    /*
     * Determine the number of bytes to move forward to get to the
     * 16-byte boundary. For a 2-byte-aligned buffer this is an even
     * number in 2..14, i.e. some combination of 2 + 4 + 8 bytes.
     *
     * The halfword (if any) is written FIRST: afterwards the cursor is
     * 4-byte aligned, so the word stores that follow are all naturally
     * aligned.
     */
    rsb         r12, r12, #16
    sub         r2, r2, r12
    tst         r12, #2
    strneh      r1, [r0], #2            /* 2 bytes -> now 4-byte aligned  */
    lsls        r12, r12, #29           /* N = bit 2 (4), C = bit 3 (8)   */
    strmi       r1, [r0], #4            /* 4 bytes -> now 8-byte aligned  */
    strcs       r1, [r0], #4            /* 8 bytes -> now 16-byte aligned */
    strcs       r1, [r0], #4
memset_route:
    /*
     * Decide where to route for the maximum copy sizes. Note that we
     * build q0 and q1 depending on if we'll need it, so that's
     * interwoven here as well.
     */
    vdup.u32    d0, r1
    cmp         r2, #16
    blo         memset_8
    vmov        d1, d0                  /* q0 = 8 copies of the value */
    cmp         r2, #64
    blo         memset_16
    vmov        q1, q0                  /* q0:q1 = 32 bytes of pattern */
    cmp         r2, #128
    blo         memset_32
memset_128:
    mov         r12, r2, lsr #7         /* r12 = number of 128-byte chunks */
memset_128_loop:
    vst1.64     {q0, q1}, [r0]!
    vst1.64     {q0, q1}, [r0]!
    vst1.64     {q0, q1}, [r0]!
    vst1.64     {q0, q1}, [r0]!
    subs        r12, r12, #1
    bne         memset_128_loop
    ands        r2, r2, #0x7f           /* bytes left after 128-byte chunks */
    beq         memset_end
memset_32:
    movs        r12, r2, lsr #5         /* r12 = number of 32-byte chunks */
    beq         memset_16
memset_32_loop:
    subs        r12, r12, #1
    vst1.64     {q0, q1}, [r0]!         /* vst1 leaves the flags untouched */
    bne         memset_32_loop
    ands        r2, r2, #0x1f
    beq         memset_end
memset_16:
    movs        r12, r2, lsr #4         /* r12 = number of 16-byte chunks */
    beq         memset_8
memset_16_loop:
    subs        r12, r12, #1
    vst1.32     {q0}, [r0]!
    bne         memset_16_loop
    ands        r2, r2, #0xf
    beq         memset_end
    /*
     * memset_8 isn't a loop, since we try to do our loops at 16
     * bytes and above. We should loop there, then drop down here
     * to finish the <16-byte versions. Same for memset_4 and
     * memset_2. On entry to memset_8, r2 is below 16.
     */
memset_8:
    cmp         r2, #8
    blo         memset_4
    subs        r2, r2, #8
    vst1.32     {d0}, [r0]!
memset_4:
    cmp         r2, #4
    blo         memset_2
    subs        r2, r2, #4
    str         r1, [r0], #4
memset_2:
    cmp         r2, #0                  /* r2 is 0 or 2 at this point */
    beq         memset_end
    strh        r1, [r0], #2
memset_end:
    pop         {r0}
    bx          lr

    .endfunc
    .end