/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Copyright (c) 2013 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// Assumes that n >= 0 and that dst and src are valid pointers.
// For any size less than 832 bytes, use the neon code that doesn't
// care about src alignment. This avoids any checks
// for src alignment, and offers the best improvement since
// smaller copies are dominated by the overhead of
// the code before and after the main loop.
// For larger copies, if src and dst cannot both be aligned to
// word boundaries, use the neon code.
// For all other copies, align dst to a double word boundary
// and copy using LDRD/STRD instructions.

        cmp     r2, #16
        blo     .L_copy_less_than_16_unknown_align

        cmp     r2, #832
        bge     .L_check_alignment

.L_copy_unknown_alignment:
        // Unknown alignment of src and dst.
        // Assumes that the first few bytes have already been prefetched.

        // Align destination to 128 bits. The mainloop store instructions
        // require this alignment or they will throw an exception.
        rsb     r3, r0, #0
        ands    r3, r3, #0xF
        beq     2f

        // Copy up to 15 bytes (count in r3).
        sub     r2, r2, r3
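        // The shift below moves bit 0 of the count into N and bit 1 into C,
        // so the conditional (mi/cs) byte copies handle the low two bits.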
        movs    ip, r3, lsl #31

        itt     mi
        ldrbmi  lr, [r1], #1
        strbmi  lr, [r0], #1
        itttt   cs
        ldrbcs  ip, [r1], #1
        ldrbcs  lr, [r1], #1
        strbcs  ip, [r0], #1
        strbcs  lr, [r0], #1

        movs    ip, r3, lsl #29
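        // N now holds bit 2 of the count (4-byte copy) and C holds bit 3
        // (8-byte copy).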
        bge     1f
        // Copies 4 bytes, dst 32 bits aligned before, at least 64 bits after.
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc     2f
        // Copies 8 bytes, dst 64 bits aligned before, at least 128 bits after.
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0, :64]!

2:      // Make sure we have at least 64 bytes to copy.
        subs    r2, r2, #64
        blo     2f

1:      // The main loop copies 64 bytes at a time.
        vld1.8  {d0 - d3}, [r1]!
        vld1.8  {d4 - d7}, [r1]!
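        // Prefetch 256 bytes (64*4) past the current source position.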
        pld     [r1, #(64*4)]
        subs    r2, r2, #64
        vst1.8  {d0 - d3}, [r0, :128]!
        vst1.8  {d4 - d7}, [r0, :128]!
        bhs     1b

2:      // Fix-up the remaining count and make sure we have >= 32 bytes left.
        adds    r2, r2, #32
        blo     3f

        // 32 bytes. These cache lines were already preloaded.
        vld1.8  {d0 - d3}, [r1]!
        sub     r2, r2, #32
        vst1.8  {d0 - d3}, [r0, :128]!
3:      // Less than 32 left.
        add     r2, r2, #32
        tst     r2, #0x10
        beq     .L_copy_less_than_16_unknown_align
        // Copies 16 bytes, destination 128 bits aligned.
        vld1.8  {d0, d1}, [r1]!
        vst1.8  {d0, d1}, [r0, :128]!

.L_copy_less_than_16_unknown_align:
        // Copy up to 15 bytes (count in r2).
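        // After the shift, C holds bit 3 of the count (8-byte copy) and
        // N holds bit 2 (4-byte copy).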
        movs    ip, r2, lsl #29
        bcc     1f
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0]!
1:      bge     2f
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0]!

2:      // Copy 0 to 3 bytes.
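        // After the shift, the result is nonzero (ne) iff bit 0 of the count
        // is set (copy 1 byte); C holds bit 1 (copy 2 bytes).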
        lsls    r2, r2, #31
        itt     ne
        ldrbne  lr, [r1], #1
        strbne  lr, [r0], #1
        itttt   cs
        ldrbcs  ip, [r1], #1
        ldrbcs  lr, [r1]
        strbcs  ip, [r0], #1
        strbcs  lr, [r0]

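        // The code that includes this file is expected to have pushed {r0, lr}
        // on entry, so this restores the original dst as the return value and
        // returns to the caller.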
        pop     {r0, pc}

.L_check_alignment:
        // If src and dst cannot both be aligned to a word boundary,
        // use the unaligned copy version.
        eor     r3, r0, r1
        ands    r3, r3, #0x3
        bne     .L_copy_unknown_alignment

        // To improve performance, the stack layout is changed, i.e., the stack
        // is not kept looking the way users expect (highest numbered register
        // at highest address).
        // TODO: Add debug frame directives.
        // We don't need exception unwind directives, because the code below
        // does not throw any exceptions and does not call any other functions.
        // Generally, newlib functions like this lack debug information for
        // assembler source.
        .save   {r4, r5}
        strd    r4, r5, [sp, #-8]!
        .save   {r6, r7}
        strd    r6, r7, [sp, #-8]!
        .save   {r8, r9}
        strd    r8, r9, [sp, #-8]!

        // Optimized for already aligned dst code.
        ands    ip, r0, #3
        bne     .L_dst_not_word_aligned

.L_word_aligned:
        // Align the destination buffer to 8 bytes, to make sure double
        // loads and stores don't cross a cache line boundary,
        // as they are then more expensive even if the data is in the cache
        // (require two load/store issue cycles instead of one).
        // If only one of the buffers is not 8 bytes aligned,
        // then it's more important to align dst than src,
        // because there is more penalty for stores
        // than loads that cross a cacheline boundary.
        // This check and realignment are only done if there is >= 832
        // bytes to copy.

        // Dst is word aligned, but check if it is already double word aligned.
        ands    r3, r0, #4
        beq     1f
        ldr     r3, [r1], #4
        str     r3, [r0], #4
        sub     r2, #4

1:      // Can only get here if > 64 bytes to copy, so don't check r2.
        sub     r2, #64

2:      // Every loop iteration copies 64 bytes.
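        // The .irp block expands to five ldrd/strd pairs covering
        // offsets 0 through 32.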
        .irp    offset, #0, #8, #16, #24, #32
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr

        ldrd    r4, r5, [r1, #40]
        ldrd    r6, r7, [r1, #48]
        ldrd    r8, r9, [r1, #56]

        // Keep the pld as far from the next load as possible.
        // The amount to prefetch was determined experimentally using
        // large sizes, and verifying the prefetch size does not affect
        // the smaller copies too much.
        // WARNING: If the ldrd and strd instructions get too far away
        //          from each other, performance suffers. Three loads
        //          in a row is the best tradeoff.
        pld     [r1, #(64*16)]
        strd    r4, r5, [r0, #40]
        strd    r6, r7, [r0, #48]
        strd    r8, r9, [r0, #56]

        add     r0, r0, #64
        add     r1, r1, #64
        subs    r2, r2, #64
        bge     2b

        // Fix-up the remaining count and make sure we have >= 32 bytes left.
        adds    r2, r2, #32
        blo     4f

        // Copy 32 bytes. These cache lines were already preloaded.
        .irp    offset, #0, #8, #16, #24
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr
        add     r1, r1, #32
        add     r0, r0, #32
        sub     r2, r2, #32
4:      // Less than 32 left.
        add     r2, r2, #32
        tst     r2, #0x10
        beq     5f
        // Copy 16 bytes.
        .irp    offset, #0, #8
        ldrd    r4, r5, [r1, \offset]
        strd    r4, r5, [r0, \offset]
        .endr
        add     r1, r1, #16
        add     r0, r0, #16

5:      // Copy up to 15 bytes (count in r2).
        movs    ip, r2, lsl #29
        bcc     1f
        // Copy 8 bytes.
        ldrd    r4, r5, [r1], #8
        strd    r4, r5, [r0], #8
1:      bge     2f
        // Copy 4 bytes.
        ldr     r4, [r1], #4
        str     r4, [r0], #4
2:      // Copy 0 to 3 bytes.
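        // As above: ne means bit 0 of the count is set (1 byte), C means
        // bit 1 is set (2 bytes).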
        lsls    r2, r2, #31
        itt     ne
        ldrbne  lr, [r1], #1
        strbne  lr, [r0], #1
        itttt   cs
        ldrbcs  ip, [r1], #1
        ldrbcs  lr, [r1]
        strbcs  ip, [r0], #1
        strbcs  lr, [r0]

        // Restore registers: optimized pop {r0, pc}
        ldrd    r8, r9, [sp], #8
        ldrd    r6, r7, [sp], #8
        ldrd    r4, r5, [sp], #8
        pop     {r0, pc}

.L_dst_not_word_aligned:
        // Align dst to word.
        rsb     ip, ip, #4
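        // ip is now the number of bytes (1 to 3) needed to word-align dst:
        // the gt/ge copies below handle the optional bytes and the final
        // unconditional copy always moves one more.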
        cmp     ip, #2

        itt     gt
        ldrbgt  lr, [r1], #1
        strbgt  lr, [r0], #1

        itt     ge
        ldrbge  lr, [r1], #1
        strbge  lr, [r0], #1

        ldrb    lr, [r1], #1
        strb    lr, [r0], #1

        sub     r2, r2, ip

        // Src is guaranteed to be at least word aligned by this point.
        b       .L_word_aligned