blob: 55fd3bd561d62082f177ec3c3e32a4aedcca272c [file] [log] [blame]
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This code assumes it is running on a processor that supports all arm v7
 * instructions, that supports neon instructions, and that has a 32 byte
 * cache line.
 */

/*
 * MEMCPY_BASE: NEON copy path.
 *
 * In:    r0 = destination, r1 = source, r2 = byte count.
 * Stack: the unwind annotations below (.save / CFA offset 8) and the
 *        epilogue's pop describe {r0, lr} as already sitting on the
 *        stack at entry; this routine does not push them itself.
 * Out:   r0 is reloaded from the stack before returning through lr.
 * Uses:  r2, r3, ip, lr, d0-d7 and the flags as scratch.
 *
 * Copies with at least 16 bytes whose source and destination agree in
 * their low two address bits are handed to MEMCPY_BASE_ALIGNED.
 */
ENTRY_PRIVATE(MEMCPY_BASE)
        .cfi_startproc
        .save       {r0, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        // Make sure the count is at least 16 bytes, needed for alignment code.
        cmp         r2, #16
        blo         5f

        /* check if buffers are aligned. If so, run arm-only version */
        eor         r3, r0, r1
        ands        r3, r3, #0x3
        /*
         * Branch through the macro so the target is always the routine
         * declared by ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED) in this file,
         * whatever name the including file gives it.
         */
        beq         MEMCPY_BASE_ALIGNED

        /* Check the upper size limit for Neon unaligned memory access in memcpy */
        cmp         r2, #224
        blo         3f

        /* align destination to 16 bytes for the write-buffer */
        rsb         r3, r0, #0
        ands        r3, r3, #0xF
        beq         3f

        /* copy up to 15-bytes (count in r3) */
        sub         r2, r2, r3
        /* N = bit 0 of r3 (1 byte), C = bit 1 of r3 (2 bytes) */
        movs        ip, r3, lsl #31
        itt         mi
        ldrbmi      lr, [r1], #1
        strbmi      lr, [r0], #1
        itttt       cs
        ldrbcs      ip, [r1], #1
        ldrbcs      lr, [r1], #1
        strbcs      ip, [r0], #1
        strbcs      lr, [r0], #1
        /* N = bit 2 of r3 (4 bytes), C = bit 3 of r3 (8 bytes) */
        movs        ip, r3, lsl #29
        /*
         * NOTE(review): bge tests N == V and movs with a shift leaves V
         * untouched, so this relies on V being clear from the earlier
         * cmp -- TODO confirm for very large counts.
         */
        bge         1f
        // copies 4 bytes, destination 32-bits aligned
        vld1.32     {d0[0]}, [r1]!
        vst1.32     {d0[0]}, [r0, :32]!
1:      bcc         2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
2:
        /* preload immediately the next cache line, which we may need */
        pld         [r1, #0]
        pld         [r1, #(32 * 2)]
3:
        /* make sure we have at least 64 bytes to copy */
        subs        r2, r2, #64
        blo         2f

        /* preload all the cache lines we need */
        pld         [r1, #(32 * 4)]
        pld         [r1, #(32 * 6)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8      {d0 - d3}, [r1]!
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(32 * 6)]
        subs        r2, r2, #64
        vst1.8      {d0 - d3}, [r0]!
        vst1.8      {d4 - d7}, [r0]!
        bhs         1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add         r2, r2, #64
        subs        r2, r2, #32
        blo         4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8      {d0 - d3}, [r1]!
        subs        r2, r2, #32
        vst1.8      {d0 - d3}, [r0]!
        bhs         3b

4:      /* less than 32 left */
        add         r2, r2, #32
        tst         r2, #0x10
        beq         5f
        // copies 16 bytes, 128-bits aligned
        vld1.8      {d0, d1}, [r1]!
        vst1.8      {d0, d1}, [r0]!
5:      /* copy up to 15-bytes (count in r2) */
        /* C = bit 3 of r2 (8 bytes), N = bit 2 of r2 (4 bytes) */
        movs        ip, r2, lsl #29
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
        /* NOTE(review): bge again relies on V being clear -- see above. */
1:      bge         2f
        vld1.32     {d0[0]}, [r1]!
        vst1.32     {d0[0]}, [r0]!
        /* N = bit 0 of r2 (1 byte), C = bit 1 of r2 (2 bytes) */
2:      movs        ip, r2, lsl #31
        itt         mi
        ldrbmi      r3, [r1], #1
        strbmi      r3, [r0], #1
        itttt       cs
        ldrbcs      ip, [r1], #1
        ldrbcs      lr, [r1], #1
        strbcs      ip, [r0], #1
        strbcs      lr, [r0], #1

        /* Reload the saved r0 and lr from the stack, then return. */
        ldmfd       sp!, {r0, lr}
        bx          lr

        .cfi_endproc
END(MEMCPY_BASE)

/*
 * MEMCPY_BASE_ALIGNED: core-register (non-NEON) copy path.
 *
 * In:    r0 = destination, r1 = source, r2 = byte count.
 * Stack: as in MEMCPY_BASE, the unwind annotations describe {r0, lr}
 *        as already on the stack at entry; r4-r8 are pushed here and
 *        popped again before returning.
 * Out:   returns by popping the saved r0 and the saved lr (into pc).
 * Uses:  r3-r8, r12, lr and the flags as scratch.
 */
ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
        .cfi_startproc

        .save       {r0, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        /* Simple arm-only copy loop to handle aligned copy operations */
        stmfd       sp!, {r4-r8}
        .save       {r4-r8}
        .cfi_adjust_cfa_offset 20
        .cfi_rel_offset r4, 0
        .cfi_rel_offset r5, 4
        .cfi_rel_offset r6, 8
        .cfi_rel_offset r7, 12
        .cfi_rel_offset r8, 16
        pld         [r1, #(32 * 4)]

        /* Check alignment: r3 = bytes needed to word-align the source (0-3) */
        rsb         r3, r1, #0
        ands        r3, #3
        beq         2f

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        /* N = bit 0 of r3 (1 byte), C = bit 1 of r3 (2 bytes) */
        movs        r12, r3, lsl #31
        sub         r2, r2, r3      /* we know that r3 <= r2 because r2 >= 4 */
        itt         mi
        ldrbmi      r3, [r1], #1
        strbmi      r3, [r0], #1
        itttt       cs
        ldrbcs      r4, [r1], #1
        ldrbcs      r5, [r1], #1
        strbcs      r4, [r0], #1
        strbcs      r5, [r0], #1

2:
        subs        r2, r2, #64
        blt         4f

3:      /* Main copy loop, copying 64 bytes at a time */
        pld         [r1, #(32 * 8)]
        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        subs        r2, r2, #64
        bge         3b

4:      /* Check if there are >= 32 bytes left */
        adds        r2, r2, #64
        subs        r2, r2, #32
        blt         5f

        /* Copy 32 bytes */
        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        /* keep r2 biased by -32 so both paths into 5: undo the same bias */
        subs        r2, #32

5:      /* Handle any remaining bytes */
        adds        r2, #32
        beq         6f

        /* C = bit 4 of r2 (16 bytes), N = bit 3 of r2 (8 bytes) */
        movs        r12, r2, lsl #28
        itt         cs
        ldmiacs     r1!, {r3, r4, r5, r6}   /* 16 bytes */
        stmiacs     r0!, {r3, r4, r5, r6}
        itt         mi
        ldmiami     r1!, {r7, r8}           /*  8 bytes */
        stmiami     r0!, {r7, r8}
        /* C = bit 2 of r2 (4 bytes), N = bit 1 of r2 (2 bytes) */
        movs        r12, r2, lsl #30
        itt         cs
        ldrcs       r3, [r1], #4            /*  4 bytes */
        strcs       r3, [r0], #4
        itt         mi
        ldrhmi      r4, [r1], #2            /*  2 bytes */
        strhmi      r4, [r0], #2
        tst         r2, #0x1
        itt         ne
        ldrbne      r3, [r1]                /*  last byte  */
        strbne      r3, [r0]
6:
        /* Restore r4-r8, then pop the saved r0 and return via the saved lr. */
        ldmfd       sp!, {r4-r8}
        ldmfd       sp!, {r0, pc}

        .cfi_endproc
END(MEMCPY_BASE_ALIGNED)