/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Copyright (c) 2013 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
//-----------------------------------------------------------------------
// MEMCPY_BASE: NEON memcpy body, included by the per-CPU memcpy wrapper.
// In:    r0 = dst, r1 = src, r2 = n (bytes, assumed >= 0)
// Stack: caller has pushed {r0, lr} (CFA offset 8; original dst at sp+0,
//        return address at sp+4) — returned via the final pop {r0, pc},
//        which also restores the original dst as memcpy's return value.
// Clobbers: r3, ip, lr, d0-d7, flags.
//-----------------------------------------------------------------------
ENTRY_PRIVATE(MEMCPY_BASE)
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        // Assumes that n >= 0, and dst, src are valid pointers.
        // For any sizes less than 832 use the neon code that doesn't
        // care about the src alignment. This avoids any checks
        // for src alignment, and offers the best improvement since
        // smaller sized copies are dominated by the overhead of
        // the pre and post main loop.
        // For larger copies, if src and dst cannot both be aligned to
        // word boundaries, use the neon code.
        // For all other copies, align dst to a double word boundary
        // and copy using LDRD/STRD instructions.

        cmp     r2, #16
        blo     .L_copy_less_than_16_unknown_align

        // TODO: The aligned copy code is extremely slow copying some large
        //       buffers so always go through the unaligned path for now.
        //cmp     r2, #832
        //bge     .L_check_alignment

.L_copy_unknown_alignment:
        // Unknown alignment of src and dst.
        // Assumes that the first few bytes have already been prefetched.

        // Align destination to 128 bits. The mainloop store instructions
        // require this alignment or they will throw an exception.
        rsb     r3, r0, #0              // r3 = -dst ...
        ands    r3, r3, #0xF            // ... & 0xF = bytes to reach 16-byte alignment
        beq     2f

        // Copy up to 15 bytes (count in r3).
        sub     r2, r2, r3
        movs    ip, r3, lsl #31         // N = bit 0 of r3, C = bit 1 of r3

        itt     mi
        ldrbmi  lr, [r1], #1            // 1 byte if bit 0 set
        strbmi  lr, [r0], #1
        itttt   cs
        ldrbcs  ip, [r1], #1            // 2 bytes if bit 1 set
        ldrbcs  lr, [r1], #1
        strbcs  ip, [r0], #1
        strbcs  lr, [r0], #1

        movs    ip, r3, lsl #29         // N = bit 2 of r3, C = bit 3 of r3
        bge     1f
        // Copies 4 bytes, dst 32 bits aligned before, at least 64 bits after.
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc     2f
        // Copies 8 bytes, dst 64 bits aligned before, at least 128 bits after.
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0, :64]!

2:      // Make sure we have at least 64 bytes to copy.
        subs    r2, r2, #64
        blo     2f

1:      // The main loop copies 64 bytes at a time.
        vld1.8  {d0 - d3}, [r1]!
        vld1.8  {d4 - d7}, [r1]!
        pld     [r1, #(64*4)]           // prefetch 4 iterations ahead
        subs    r2, r2, #64
        vst1.8  {d0 - d3}, [r0, :128]!
        vst1.8  {d4 - d7}, [r0, :128]!
        bhs     1b

2:      // Fix-up the remaining count and make sure we have >= 32 bytes left.
        adds    r2, r2, #32
        blo     3f

        // 32 bytes. These cache lines were already preloaded.
        vld1.8  {d0 - d3}, [r1]!
        sub     r2, r2, #32
        vst1.8  {d0 - d3}, [r0, :128]!
3:      // Less than 32 left.
        add     r2, r2, #32
        tst     r2, #0x10
        beq     .L_copy_less_than_16_unknown_align
        // Copies 16 bytes, destination 128 bits aligned.
        vld1.8  {d0, d1}, [r1]!
        vst1.8  {d0, d1}, [r0, :128]!

.L_copy_less_than_16_unknown_align:
        // Copy up to 15 bytes (count in r2).
        movs    ip, r2, lsl #29         // N = bit 2 of r2, C = bit 3 of r2
        bcc     1f
        vld1.8  {d0}, [r1]!             // 8 bytes if bit 3 set
        vst1.8  {d0}, [r0]!
1:      bge     2f
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!     // 4 bytes if bit 2 set
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0]!

2:      // Copy 0 to 4 bytes.
        lsls    r2, r2, #31             // Z/N = bit 0 of r2, C = bit 1 of r2
        itt     ne
        ldrbne  lr, [r1], #1
        strbne  lr, [r0], #1
        itttt   cs
        ldrbcs  ip, [r1], #1
        ldrbcs  lr, [r1]
        strbcs  ip, [r0], #1
        strbcs  lr, [r0]

        pop     {r0, pc}                // restore original dst, return to caller

.L_check_alignment:
        // If src and dst cannot both be aligned to a word boundary,
        // use the unaligned copy version.
        eor     r3, r0, r1
        ands    r3, r3, #0x3
        bne     .L_copy_unknown_alignment
END(MEMCPY_BASE)
172
//-----------------------------------------------------------------------
// MEMCPY_BASE_ALIGNED: LDRD/STRD copy path for large, mutually word-
// alignable src/dst (currently unreachable: MEMCPY_BASE always takes the
// NEON path — see the TODO there).
// In:    r0 = dst (word-alignable with src), r1 = src, r2 = n
// Stack: caller has pushed {r0, lr} (CFA offset 8); this function pushes
//        r4-r9 in three strd pairs and restores them before returning.
// Clobbers: r3, ip, lr, flags.
//-----------------------------------------------------------------------
ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        // To try and improve performance, stack layout changed,
        // i.e., not keeping the stack looking like users expect
        // (highest numbered register at highest address).
        strd    r4, r5, [sp, #-8]!
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset r4, 0
        .cfi_rel_offset r5, 4
        strd    r6, r7, [sp, #-8]!
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset r6, 0
        .cfi_rel_offset r7, 4           // FIX: was 0; strd stores r7 at sp+4 (r6 is at sp+0)
        strd    r8, r9, [sp, #-8]!
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset r8, 0
        .cfi_rel_offset r9, 4

        // Optimized for already aligned dst code.
        ands    ip, r0, #3
        bne     .L_dst_not_word_aligned

.L_word_aligned:
        // Align the destination buffer to 8 bytes, to make sure double
        // loads and stores don't cross a cache line boundary,
        // as they are then more expensive even if the data is in the cache
        // (require two load/store issue cycles instead of one).
        // If only one of the buffers is not 8 bytes aligned,
        // then it's more important to align dst than src,
        // because there is more penalty for stores
        // than loads that cross a cacheline boundary.
        // This check and realignment are only done if there is >= 832
        // bytes to copy.

        // Dst is word aligned, but check if it is already double word aligned.
        ands    r3, r0, #4
        beq     1f
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        sub     r2, #4

1:      // Can only get here if > 64 bytes to copy, so don't do check r2.
        sub     r2, #64

2:      // Every loop iteration copies 64 bytes.
        .irp    offset, #0, #8, #16, #24, #32
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr

        ldrd    r4, r5, [r1, #40]
        ldrd    r6, r7, [r1, #48]
        ldrd    r8, r9, [r1, #56]

        // Keep the pld as far from the next load as possible.
        // The amount to prefetch was determined experimentally using
        // large sizes, and verifying the prefetch size does not affect
        // the smaller copies too much.
        // WARNING: If the ldrd and strd instructions get too far away
        //          from each other, performance suffers. Three loads
        //          in a row is the best tradeoff.
        pld     [r1, #(64*16)]
        strd    r4, r5, [r0, #40]
        strd    r6, r7, [r0, #48]
        strd    r8, r9, [r0, #56]

        add     r0, r0, #64
        add     r1, r1, #64
        subs    r2, r2, #64
        bge     2b

        // Fix-up the remaining count and make sure we have >= 32 bytes left.
        adds    r2, r2, #32
        blo     4f

        // Copy 32 bytes. These cache lines were already preloaded.
        .irp    offset, #0, #8, #16, #24
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr
        add     r1, r1, #32
        add     r0, r0, #32
        sub     r2, r2, #32
4:      // Less than 32 left.
        add     r2, r2, #32
        tst     r2, #0x10
        beq     5f
        // Copy 16 bytes.
        .irp    offset, #0, #8
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr
        add     r1, r1, #16
        add     r0, r0, #16

5:      // Copy up to 15 bytes (count in r2).
        movs    ip, r2, lsl #29         // N = bit 2 of r2, C = bit 3 of r2
        bcc     1f
        // Copy 8 bytes.
        ldrd    r4, r5, [r1], #8
        strd    r4, r5, [r0], #8
1:      bge     2f
        // Copy 4 bytes.
        ldr     r4, [r1], #4
        str     r4, [r0], #4
2:      // Copy 0 to 4 bytes.
        lsls    r2, r2, #31             // Z/N = bit 0 of r2, C = bit 1 of r2
        itt     ne
        ldrbne  lr, [r1], #1
        strbne  lr, [r0], #1
        itttt   cs
        ldrbcs  ip, [r1], #1
        ldrbcs  lr, [r1]
        strbcs  ip, [r0], #1
        strbcs  lr, [r0]

        // Restore registers: optimized pop {r0, pc}
        ldrd    r8, r9, [sp], #8
        ldrd    r6, r7, [sp], #8
        ldrd    r4, r5, [sp], #8
        pop     {r0, pc}                // restore original dst, return to caller

.L_dst_not_word_aligned:
        // Align dst to word (ip = dst & 3 from the entry check, so
        // 4 - ip = 1..3 bytes to copy; the tail ldrb/strb always runs).
        rsb     ip, ip, #4
        cmp     ip, #2

        itt     gt
        ldrbgt  lr, [r1], #1
        strbgt  lr, [r0], #1

        itt     ge
        ldrbge  lr, [r1], #1
        strbge  lr, [r0], #1

        ldrb    lr, [r1], #1
        strb    lr, [r0], #1

        sub     r2, r2, ip

        // Src is guaranteed to be at least word aligned by this point.
        b       .L_word_aligned
END(MEMCPY_BASE_ALIGNED)