blob: 6ba4931f9d5088d728638eb0e52859581dea33ad [file] [log] [blame]
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Copyright (c) 2013 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
55
// MEMCPY_BASE: core of memcpy for unknown src/dst alignment.
// In:   r0 = dst, r1 = src, r2 = n (bytes, >= 0); dst, src valid pointers.
// Pre:  the caller (memcpy entry) has already pushed {r0, lr}, so this code
//       returns the original dst and returns to the caller via "pop {r0, pc}".
// Uses: r3, ip, lr as scratch; NEON d0-d7 for the bulk copy.
ENTRY_PRIVATE(MEMCPY_BASE)
        .save {r0, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        // Assumes that n >= 0, and dst, src are valid pointers.
        // For any sizes less than 832 use the neon code that doesn't
        // care about the src alignment. This avoids any checks
        // for src alignment, and offers the best improvement since
        // smaller sized copies are dominated by the overhead of
        // the pre and post main loop.
        // For larger copies, if src and dst cannot both be aligned to
        // word boundaries, use the neon code.
        // For all other copies, align dst to a double word boundary
        // and copy using LDRD/STRD instructions.

        cmp     r2, #16
        blo     .L_copy_less_than_16_unknown_align

        // TODO: The aligned copy code is extremely slow copying some large
        //       buffers so always go through the unaligned path for now.
        //cmp     r2, #832
        //bge     .L_check_alignment

.L_copy_unknown_alignment:
        // Unknown alignment of src and dst.
        // Assumes that the first few bytes have already been prefetched.

        // Align destination to 128 bits. The mainloop store instructions
        // require this alignment or they will throw an exception.
        rsb     r3, r0, #0              // r3 = -dst ...
        ands    r3, r3, #0xF            // ... & 0xF = bytes needed to 16-align dst
        beq     2f                      // already 128-bit aligned

        // Copy up to 15 bytes (count in r3).
        sub     r2, r2, r3
        movs    ip, r3, lsl #31         // N flag = bit 0 of r3, C flag = bit 1

        itt     mi                      // bit 0 set: copy 1 byte
        ldrbmi  lr, [r1], #1
        strbmi  lr, [r0], #1
        itttt   cs                      // bit 1 set: copy 2 bytes
        ldrbcs  ip, [r1], #1
        ldrbcs  lr, [r1], #1
        strbcs  ip, [r0], #1
        strbcs  lr, [r0], #1

        movs    ip, r3, lsl #29         // N flag = bit 2 of r3, C flag = bit 3
        bge     1f
        // Copies 4 bytes, dst 32 bits aligned before, at least 64 bits after.
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc     2f
        // Copies 8 bytes, dst 64 bits aligned before, at least 128 bits after.
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0, :64]!

2:      // Make sure we have at least 64 bytes to copy.
        subs    r2, r2, #64
        blo     2f

1:      // The main loop copies 64 bytes at a time.
        vld1.8  {d0 - d3}, [r1]!
        vld1.8  {d4 - d7}, [r1]!
        pld     [r1, #(64*4)]           // prefetch 4 iterations ahead
        subs    r2, r2, #64
        vst1.8  {d0 - d3}, [r0, :128]!
        vst1.8  {d4 - d7}, [r0, :128]!
        bhs     1b

2:      // Fix-up the remaining count and make sure we have >= 32 bytes left.
        adds    r2, r2, #32
        blo     3f

        // 32 bytes. These cache lines were already preloaded.
        vld1.8  {d0 - d3}, [r1]!
        sub     r2, r2, #32
        vst1.8  {d0 - d3}, [r0, :128]!
3:      // Less than 32 left.
        add     r2, r2, #32             // undo the bias; r2 = bytes remaining (0..31)
        tst     r2, #0x10
        beq     .L_copy_less_than_16_unknown_align
        // Copies 16 bytes, destination 128 bits aligned.
        vld1.8  {d0, d1}, [r1]!
        vst1.8  {d0, d1}, [r0, :128]!

.L_copy_less_than_16_unknown_align:
        // Copy up to 15 bytes (count in r2).
        movs    ip, r2, lsl #29         // C flag = bit 3 of r2, N flag = bit 2
        bcc     1f
        vld1.8  {d0}, [r1]!             // copy 8 bytes
        vst1.8  {d0}, [r0]!
1:      bge     2f
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!     // copy 4 bytes
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0]!

2:      // Copy 0 to 4 bytes.
        lsls    r2, r2, #31             // N flag = bit 0 of r2, C flag = bit 1
        itt     ne                      // bit 0 set: copy 1 byte
        ldrbne  lr, [r1], #1
        strbne  lr, [r0], #1
        itttt   cs                      // bit 1 set: copy 2 bytes
        ldrbcs  ip, [r1], #1
        ldrbcs  lr, [r1]
        strbcs  ip, [r0], #1
        strbcs  lr, [r0]

        pop     {r0, pc}                // return original dst (pushed by caller)

.L_check_alignment:
        // NOTE(review): currently unreachable — the only branch here is the
        // commented-out "bge .L_check_alignment" above. If re-enabled, this
        // falls through to MEMCPY_BASE_ALIGNED when both pointers can be
        // word aligned.
        // If src and dst cannot both be aligned to a word boundary,
        // use the unaligned copy version.
        eor     r3, r0, r1
        ands    r3, r3, #0x3
        bne     .L_copy_unknown_alignment
END(MEMCPY_BASE)
173
// MEMCPY_BASE_ALIGNED: copy for word-aligned (or word-alignable) src/dst
// using LDRD/STRD pairs instead of NEON.
// In:   r0 = dst, r1 = src, r2 = n (bytes); reached (when enabled) only for
//       copies where src and dst can both be word aligned.
// Pre:  the caller has already pushed {r0, lr}; returns via "pop {r0, pc}".
// Uses: r3, ip, lr as scratch; saves/restores r4-r9 for the 64-byte loop.
ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
        .save {r0, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        // To try and improve performance, stack layout changed,
        // i.e., not keeping the stack looking like users expect
        // (highest numbered register at highest address).
        strd    r4, r5, [sp, #-8]!
        .save {r4, r5}
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset r4, 0
        .cfi_rel_offset r5, 4
        strd    r6, r7, [sp, #-8]!
        .save {r6, r7}
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset r6, 0
        .cfi_rel_offset r7, 4           // FIX: was "r7, 0" — r7 is stored at sp+4
        strd    r8, r9, [sp, #-8]!
        .save {r8, r9}
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset r8, 0
        .cfi_rel_offset r9, 4

        // Optimized for already aligned dst code.
        ands    ip, r0, #3              // ip = dst & 3 (bytes past word boundary)
        bne     .L_dst_not_word_aligned

.L_word_aligned:
        // Align the destination buffer to 8 bytes, to make sure double
        // loads and stores don't cross a cache line boundary,
        // as they are then more expensive even if the data is in the cache
        // (require two load/store issue cycles instead of one).
        // If only one of the buffers is not 8 bytes aligned,
        // then it's more important to align dst than src,
        // because there is more penalty for stores
        // than loads that cross a cacheline boundary.
        // This check and realignment are only done if there is >= 832
        // bytes to copy.

        // Dst is word aligned, but check if it is already double word aligned.
        ands    r3, r0, #4
        beq     1f
        ldr     r3, [r1], #4            // copy one word to reach 8-byte alignment
        str     r3, [r0], #4
        sub     r2, #4

1:      // Can only get here if > 64 bytes to copy, so don't do check r2.
        sub     r2, #64

2:      // Every loop iteration copies 64 bytes.
        .irp    offset, #0, #8, #16, #24, #32
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr

        ldrd    r4, r5, [r1, #40]
        ldrd    r6, r7, [r1, #48]
        ldrd    r8, r9, [r1, #56]

        // Keep the pld as far from the next load as possible.
        // The amount to prefetch was determined experimentally using
        // large sizes, and verifying the prefetch size does not affect
        // the smaller copies too much.
        // WARNING: If the ldrd and strd instructions get too far away
        //          from each other, performance suffers. Three loads
        //          in a row is the best tradeoff.
        pld     [r1, #(64*16)]
        strd    r4, r5, [r0, #40]
        strd    r6, r7, [r0, #48]
        strd    r8, r9, [r0, #56]

        add     r0, r0, #64
        add     r1, r1, #64
        subs    r2, r2, #64
        bge     2b

        // Fix-up the remaining count and make sure we have >= 32 bytes left.
        adds    r2, r2, #32
        blo     4f

        // Copy 32 bytes. These cache lines were already preloaded.
        .irp    offset, #0, #8, #16, #24
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr
        add     r1, r1, #32
        add     r0, r0, #32
        sub     r2, r2, #32
4:      // Less than 32 left.
        add     r2, r2, #32             // undo the bias; r2 = bytes remaining (0..31)
        tst     r2, #0x10
        beq     5f
        // Copy 16 bytes.
        .irp    offset, #0, #8
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr
        add     r1, r1, #16
        add     r0, r0, #16

5:      // Copy up to 15 bytes (count in r2).
        movs    ip, r2, lsl #29         // C flag = bit 3 of r2, N flag = bit 2
        bcc     1f
        // Copy 8 bytes.
        ldrd    r4, r5, [r1], #8
        strd    r4, r5, [r0], #8
1:      bge     2f
        // Copy 4 bytes.
        ldr     r4, [r1], #4
        str     r4, [r0], #4
2:      // Copy 0 to 4 bytes.
        lsls    r2, r2, #31             // N flag = bit 0 of r2, C flag = bit 1
        itt     ne                      // bit 0 set: copy 1 byte
        ldrbne  lr, [r1], #1
        strbne  lr, [r0], #1
        itttt   cs                      // bit 1 set: copy 2 bytes
        ldrbcs  ip, [r1], #1
        ldrbcs  lr, [r1]
        strbcs  ip, [r0], #1
        strbcs  lr, [r0]

        // Restore registers: optimized pop {r0, pc}
        ldrd    r8, r9, [sp], #8
        ldrd    r6, r7, [sp], #8
        ldrd    r4, r5, [sp], #8
        pop     {r0, pc}                // return original dst (pushed by caller)

.L_dst_not_word_aligned:
        // Align dst to word.
        rsb     ip, ip, #4              // ip = 1..3 bytes needed to word-align dst
        cmp     ip, #2

        itt     gt                      // ip == 3: copy a 3rd byte
        ldrbgt  lr, [r1], #1
        strbgt  lr, [r0], #1

        itt     ge                      // ip >= 2: copy a 2nd byte
        ldrbge  lr, [r1], #1
        strbge  lr, [r0], #1

        ldrb    lr, [r1], #1            // always copy at least 1 byte
        strb    lr, [r0], #1

        sub     r2, r2, ip

        // Src is guaranteed to be at least word aligned by this point.
        // NOTE(review): src word alignment here relies on the (currently
        // disabled) .L_check_alignment gate, which only enters this path
        // when (dst ^ src) & 3 == 0.
        b       .L_word_aligned
END(MEMCPY_BASE_ALIGNED)