blob: 975500a475e5162de466309866ccf3456adfcd04 [file] [log] [blame]
Travis Geiselbrecht1d0df692008-09-01 02:26:09 -07001/*
2 * Copyright (c) 2008 Travis Geiselbrecht
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files
6 * (the "Software"), to deal in the Software without restriction,
7 * including without limitation the rights to use, copy, modify, merge,
8 * publish, distribute, sublicense, and/or sell copies of the Software,
9 * and to permit persons to whom the Software is furnished to do so,
10 * subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23#include <asm.h>
24#include <arch/arm/cores.h>
25
26.text
27.align 2
28
/*
 * mymemcpy: copy r2 bytes from r1 to r0, tolerating overlapping buffers.
 *
 * In:    r0 = dst, r1 = src, r2 = len (bytes)
 * Out:   r0 = the dst pointer that was passed in
 * Clobb: r1, r2, r3, r12, flags (r4-r7 are saved and restored on the stack)
 *
 * Strategy:
 *   - len == 0 or src == dst            -> return immediately
 *   - dst above src and ranges overlap  -> bytewise copy from the end backwards
 *   - len < 20 or src/dst disagree in their low two address bits -> bytewise
 *   - otherwise align dst to 16 bytes, copy 32 bytes per iteration, then
 *     finish with a wordwise and finally a bytewise tail
 *
 * The length tests use signed condition codes, so len is assumed to be
 * below 2^31.
 */
    .global mymemcpy
mymemcpy:
    // check for zero length copy or the same pointer
    cmp     r2, #0
    cmpne   r1, r0
    bxeq    lr

    // save a few registers for use and the return code (input dst)
    stmfd   sp!, {r0, r4, r5, lr}

    // check for forwards overlap (dst > src, distance < len).
    // Addresses are unsigned quantities, so use the unsigned conditions: a signed
    // test misorders pointers that sit on opposite sides of 0x80000000.
    // If dst <= src the cmphi is skipped and the flags from subs keep bhi from
    // being taken.
    subs    r3, r0, r1              // r3 = dst - src
    cmphi   r2, r3                  // only when dst > src: compare len with distance
    bhi     .L_forwardoverlap

    // check for a short copy len.
    // 20 bytes is enough so that if a 16 byte alignment needs to happen there is at least a
    // wordwise copy worth of work to be done.
    cmp     r2, #(16+4)
    blt     .L_bytewise

    // see if they are similarly aligned on 4 byte boundaries
    eor     r3, r0, r1
    tst     r3, #3
    bne     .L_bytewise             // dissimilarly aligned, nothing we can do (for now)

    // check for 16 byte alignment on dst.
    // this will also catch src being not 4 byte aligned, since it is similarly 4 byte
    // aligned with dst at this point.
    tst     r0, #15
    bne     .L_not16bytealigned

    // check to see if we have at least 32 bytes of data to copy.
    // if not, just revert to wordwise copy
    cmp     r2, #32
    blt     .L_wordwise

.L_bigcopy:
    // copy 32 bytes at a time. src & dst need to be at least 4 byte aligned,
    // and we need at least 32 bytes remaining to copy

    // save r6-r7 for use in the big copy
    stmfd   sp!, {r6-r7}

    sub     r2, r2, #32             // subtract an extra 32 from the len so we can avoid an extra compare

.L_bigcopy_loop:
    ldmia   r1!, {r4, r5, r6, r7}
    stmia   r0!, {r4, r5, r6, r7}
    ldmia   r1!, {r4, r5, r6, r7}
    subs    r2, r2, #32             // flags are consumed by the bge below; stmia leaves them alone
    stmia   r0!, {r4, r5, r6, r7}
    bge     .L_bigcopy_loop

    // restore r6-r7
    ldmfd   sp!, {r6-r7}

    // undo the extra 32 and see if we are done
    adds    r2, r2, #32
    beq     .L_done

    // less than 4 bytes left?
    cmp     r2, #4
    blt     .L_bytewise

.L_wordwise:
    // copy 4 bytes at a time.
    // src & dst are guaranteed to be word aligned, and at least 4 bytes are left to copy.
    // len is biased by -4 so the loop can test the result of its own subs.
    subs    r2, r2, #4

.L_wordwise_loop:
    ldr     r3, [r1], #4
    subs    r2, r2, #4
    str     r3, [r0], #4
    bge     .L_wordwise_loop

    // correct the remaining len and test for completion
    adds    r2, r2, #4
    beq     .L_done

.L_bytewise:
    // simple bytewise copy; at least one byte remains on every entry to this label
    ldrb    r3, [r1], #1
    subs    r2, r2, #1
    strb    r3, [r0], #1
    bgt     .L_bytewise

.L_done:
    // load dst for return and restore r4,r5
#if ARM_ARCH_LEVEL >= 5
    ldmfd   sp!, {r0, r4, r5, pc}
#else
    ldmfd   sp!, {r0, r4, r5, lr}
    bx      lr
#endif

.L_not16bytealigned:
    // dst is not 16 byte aligned, so we will copy up to 15 bytes to get it aligned.
    // src is guaranteed to be similarly word aligned with dst.

    // set the condition flags based on the alignment.
    // r12[31:28] = (16 - (dst & 15)) & 15 = number of bytes needed to reach alignment;
    // moved into the flags this gives N = 8 bytes, Z = 4, C = 2, V = 1.
    lsl     r12, r0, #28
    rsb     r12, r12, #0
    msr     CPSR_f, r12             // move into NZCV fields in CPSR

    // move as many bytes as necessary to get the dst aligned
    ldrvsb  r3, [r1], #1            // V set
    ldrcsh  r4, [r1], #2            // C set
    ldreq   r5, [r1], #4            // Z set

    strvsb  r3, [r0], #1
    strcsh  r4, [r0], #2
    streq   r5, [r0], #4

    ldmmiia r1!, {r3-r4}            // N set
    stmmiia r0!, {r3-r4}

    // fix the remaining len
    sub     r2, r2, r12, lsr #28

    // test to see what we should do now.
    // len was at least 20 and at most 15 bytes were consumed, so a word is still left.
    cmp     r2, #32
    bge     .L_bigcopy
    b       .L_wordwise

    // src and dst overlap with dst above src: copy from the end backwards
.L_forwardoverlap:

    // do a bytewise reverse copy for now.
    // both pointers start one past the end of their buffers; the pre-decrement
    // addressing below steps back before each access, so exactly the bytes
    // [ptr, ptr + len) are read and written.
    add     r1, r1, r2
    add     r0, r0, r2

.L_bytewisereverse:
    // simple bytewise reverse copy
    ldrb    r3, [r1, #-1]!
    subs    r2, r2, #1
    strb    r3, [r0, #-1]!
    bgt     .L_bytewisereverse

    b       .L_done
169