/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This code assumes it is running on a processor that supports all ARMv7
 * instructions, that supports NEON instructions, and that has a 32-byte
 * cache line.
 */

ENTRY_PRIVATE(MEMCPY_BASE)
        /*
         * NEON copy path.
         *
         * In:      r0 = destination, r1 = source, r2 = byte count.
         * Out:     returns via lr with r0 reloaded from the stack.
         * Scratch: r3, ip, lr, d0-d7, flags.
         *
         * No push is performed here; the unwind directives below and the
         * epilogue (which pops {r0, lr}) imply that the caller has already
         * pushed {r0, lr} -- confirm against the including file.
         */
        .save       {r0, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        // Check that the count is at least 16 bytes, needed for alignment code.
        cmp         r2, #16
        blo         5f

        /*
         * Check if the buffers share the same word alignment. If so, run the
         * ARM-only version. Branch through the MEMCPY_BASE_ALIGNED macro (the
         * name that entry point is declared with) rather than a hard-coded
         * symbol, so the target follows whatever name the including file
         * assigns to it.
         */
        eor         r3, r0, r1
        ands        r3, r3, #0x3
        beq         MEMCPY_BASE_ALIGNED

        /* Below this size, skip the destination alignment step entirely. */
        cmp         r2, #224
        blo         3f

        /* align destination to 16 bytes for the write-buffer */
        rsb         r3, r0, #0
        ands        r3, r3, #0xF
        beq         3f

        /* copy up to 15 bytes (count in r3) */
        sub         r2, r2, r3
        movs        ip, r3, lsl #31         // N = bit 0 (1 byte), C = bit 1 (2 bytes)
        itt         mi
        ldrbmi      lr, [r1], #1
        strbmi      lr, [r0], #1
        itttt       cs
        ldrbcs      ip, [r1], #1
        ldrbcs      lr, [r1], #1
        strbcs      ip, [r0], #1
        strbcs      lr, [r0], #1
        movs        ip, r3, lsl #29         // N = bit 2 (4 bytes), C = bit 3 (8 bytes)
        bge         1f                      // skip if bit 2 is clear (relies on V being clear here)
        // copies 4 bytes, destination 32-bits aligned
        vld1.32     {d0[0]}, [r1]!
        vst1.32     {d0[0]}, [r0, :32]!
1:      bcc         2f                      // C still holds bit 3: NEON load/store leave flags alone
        // copies 8 bytes, destination 64-bits aligned
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
2:
        /* preload immediately the next cache line, which we may need */
        pld         [r1, #0]
        pld         [r1, #(32 * 2)]
3:
        /* make sure we have at least 64 bytes to copy */
        subs        r2, r2, #64
        blo         2f

        /* preload all the cache lines we need */
        pld         [r1, #(32 * 4)]
        pld         [r1, #(32 * 6)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8      {d0 - d3}, [r1]!
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(32 * 6)]
        subs        r2, r2, #64             // r2 is kept biased by -64 while looping
        vst1.8      {d0 - d3}, [r0]!
        vst1.8      {d4 - d7}, [r0]!
        bhs         1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add         r2, r2, #64
        subs        r2, r2, #32
        blo         4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8      {d0 - d3}, [r1]!
        subs        r2, r2, #32
        vst1.8      {d0 - d3}, [r0]!
        bhs         3b

4:      /* less than 32 left */
        add         r2, r2, #32             // undo the -32 bias; r2 = bytes remaining
        tst         r2, #0x10
        beq         5f
        // copies 16 bytes (no alignment is requested on these accesses)
        vld1.8      {d0, d1}, [r1]!
        vst1.8      {d0, d1}, [r0]!
5:      /* copy up to 15 bytes (count in the low 4 bits of r2) */
        movs        ip, r2, lsl #29         // C = bit 3 (8 bytes), N = bit 2 (4 bytes)
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:      bge         2f                      // skip if bit 2 is clear (relies on V being clear here)
        vld1.32     {d0[0]}, [r1]!
        vst1.32     {d0[0]}, [r0]!
2:      movs        ip, r2, lsl #31         // N = bit 0 (1 byte), C = bit 1 (2 bytes)
        itt         mi
        ldrbmi      r3, [r1], #1
        strbmi      r3, [r0], #1
        itttt       cs
        ldrbcs      ip, [r1], #1
        ldrbcs      lr, [r1], #1
        strbcs      ip, [r0], #1
        strbcs      lr, [r0], #1

        /* Reload the saved r0 and return address, then return. */
        ldmfd       sp!, {r0, lr}
        bx          lr
END(MEMCPY_BASE)

ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
        /*
         * ARM-only (no NEON) copy loop built on ldm/stm word transfers.
         *
         * In:      r0 = destination, r1 = source, r2 = byte count.
         * Out:     returns by popping {r0, pc} from the stack.
         * Scratch: r3, r12, lr, flags; r4-r8 are saved and restored here.
         *
         * Only the source is explicitly aligned below; this presumably relies
         * on the caller having checked that r0 and r1 share the same word
         * alignment -- confirm against the call sites. As with the unwind
         * directives that follow, {r0, lr} are expected to be on the stack
         * already on entry.
         */
        .save       {r0, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        /* Simple arm-only copy loop to handle aligned copy operations */
        stmfd       sp!, {r4-r8}
        .save       {r4-r8}
        .cfi_adjust_cfa_offset 20
        .cfi_rel_offset r4, 0
        .cfi_rel_offset r5, 4
        .cfi_rel_offset r6, 8
        .cfi_rel_offset r7, 12
        .cfi_rel_offset r8, 16
        pld         [r1, #(32 * 4)]

        /* Check alignment: r3 = bytes needed to word-align the source (0-3) */
        rsb         r3, r1, #0
        ands        r3, #3
        beq         2f

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs        r12, r3, lsl #31        /* N = bit 0 (1 byte), C = bit 1 (2 bytes) */
        sub         r2, r2, r3              /* r3 <= 3, so this cannot underflow provided r2 >= 4 on entry */
        itt         mi
        ldrbmi      r3, [r1], #1
        strbmi      r3, [r0], #1
        itttt       cs
        ldrbcs      r4, [r1], #1
        ldrbcs      r5, [r1], #1
        strbcs      r4, [r0], #1
        strbcs      r5, [r0], #1

2:
        subs        r2, r2, #64
        blt         4f

3:      /* Main copy loop, copying 64 bytes at a time */
        pld         [r1, #(32 * 8)]
        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        subs        r2, r2, #64
        bge         3b

4:      /* Check if there are >= 32 bytes left */
        adds        r2, r2, #64
        subs        r2, r2, #32
        blt         5f

        /* Copy 32 bytes */
        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        subs        r2, #32

5:      /* Handle any remaining bytes (r2 is biased by -32 on arrival) */
        adds        r2, #32
        beq         6f

        movs        r12, r2, lsl #28        /* C = bit 4 (16 bytes), N = bit 3 (8 bytes) */
        itt         cs
        ldmiacs     r1!, {r3, r4, r5, r6}   /* 16 bytes */
        stmiacs     r0!, {r3, r4, r5, r6}
        itt         mi
        ldmiami     r1!, {r7, r8}           /*  8 bytes */
        stmiami     r0!, {r7, r8}
        movs        r12, r2, lsl #30        /* C = bit 2 (4 bytes), N = bit 1 (2 bytes) */
        itt         cs
        ldrcs       r3, [r1], #4            /*  4 bytes */
        strcs       r3, [r0], #4
        itt         mi
        ldrhmi      r4, [r1], #2            /*  2 bytes */
        strhmi      r4, [r0], #2
        tst         r2, #0x1
        itt         ne
        ldrbne      r3, [r1]                /*  last byte */
        strbne      r3, [r0]
6:
        /* Restore r4-r8, then pop the saved r0 and return address into pc. */
        ldmfd       sp!, {r4-r8}
        ldmfd       sp!, {r0, pc}
END(MEMCPY_BASE_ALIGNED)