blob: 5e813050a29bbdb37a5c446ed599e136bf5e7332 [file] [log] [blame]
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This code assumes it is running on a processor that supports all ARMv7
 * instructions, that supports NEON instructions, and that has a 32 byte
 * cache line.
 */

/*
 * MEMCPY_BASE: NEON copy path.
 *
 * Register roles, as used by the code below:
 *   r0 = destination pointer (advanced as bytes are stored)
 *   r1 = source pointer      (advanced as bytes are loaded)
 *   r2 = number of bytes still to copy
 *   r3, ip, lr, d0-d7 = scratch
 *
 * The CFI directives and the epilogue (ldmfd sp!, {r0, lr}) imply that the
 * code jumping here has already pushed {r0, lr}; lr is therefore free to be
 * used as a scratch register until the epilogue reloads it.
 *
 * If source and destination share the same alignment within a word, control
 * is handed to MEMCPY_BASE_ALIGNED, which pops the same frame and returns.
 */
ENTRY_PRIVATE(MEMCPY_BASE)
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        /*
         * Copies shorter than 16 bytes go straight to the tail code at 5:;
         * the alignment code below needs at least 16 bytes to work with.
         */
        cmp         r2, #16
        blo         5f

        /*
         * Check if the buffers have the same alignment within a word
         * (low two address bits equal). If so, run the arm-only version.
         * The branch uses the same name that ENTRY_PRIVATE() declares below
         * so that it always reaches the routine defined in this file.
         */
        eor         r3, r0, r1
        ands        r3, r3, #0x3
        beq         MEMCPY_BASE_ALIGNED

        /*
         * Below 224 bytes, skip the destination alignment step and start
         * copying with NEON right away.
         */
        cmp         r2, #224
        blo         3f

        /* align destination to 16 bytes for the write-buffer */
        rsb         r3, r0, #0
        ands        r3, r3, #0xF            /* r3 = bytes needed to reach 16-byte alignment */
        beq         3f

        /* copy up to 15-bytes (count in r3) */
        sub         r2, r2, r3
        movs        ip, r3, lsl #31         /* N <- bit 0 of r3, C <- bit 1 of r3 */
        itt         mi
        ldrbmi      lr, [r1], #1            /* bit 0 set: copy 1 byte */
        strbmi      lr, [r0], #1
        itttt       cs
        ldrbcs      ip, [r1], #1            /* bit 1 set: copy 2 bytes */
        ldrbcs      lr, [r1], #1
        strbcs      ip, [r0], #1
        strbcs      lr, [r0], #1
        movs        ip, r3, lsl #29         /* N <- bit 2 of r3, C <- bit 3 of r3 */
        bge         1f
        // copies 4 bytes, destination 32-bits aligned
        vld1.32     {d0[0]}, [r1]!
        vst1.32     {d0[0]}, [r0, :32]!
1:      bcc         2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
2:
        /* preload immediately the next cache line, which we may need */
        pld         [r1, #0]
        pld         [r1, #(32 * 2)]
3:
        /* make sure we have at least 64 bytes to copy */
        subs        r2, r2, #64
        blo         2f

        /* preload all the cache lines we need */
        pld         [r1, #(32 * 4)]
        pld         [r1, #(32 * 6)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8      {d0 - d3}, [r1]!
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(32 * 6)]
        subs        r2, r2, #64
        vst1.8      {d0 - d3}, [r0]!
        vst1.8      {d4 - d7}, [r0]!
        bhs         1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add         r2, r2, #64
        subs        r2, r2, #32
        blo         4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8      {d0 - d3}, [r1]!
        subs        r2, r2, #32
        vst1.8      {d0 - d3}, [r0]!
        bhs         3b

4:      /* less than 32 left */
        add         r2, r2, #32
        tst         r2, #0x10
        beq         5f
        // copies 16 bytes; no alignment qualifier is used on these accesses
        vld1.8      {d0, d1}, [r1]!
        vst1.8      {d0, d1}, [r0]!
5:      /* copy up to 15-bytes (count in the low four bits of r2) */
        movs        ip, r2, lsl #29         /* N <- bit 2 of r2, C <- bit 3 of r2 */
        bcc         1f
        vld1.8      {d0}, [r1]!             /* bit 3 set: copy 8 bytes */
        vst1.8      {d0}, [r0]!
1:      bge         2f
        vld1.32     {d0[0]}, [r1]!          /* bit 2 set: copy 4 bytes */
        vst1.32     {d0[0]}, [r0]!
2:      movs        ip, r2, lsl #31         /* N <- bit 0 of r2, C <- bit 1 of r2 */
        itt         mi
        ldrbmi      r3, [r1], #1            /* bit 0 set: copy 1 byte */
        strbmi      r3, [r0], #1
        itttt       cs
        ldrbcs      ip, [r1], #1            /* bit 1 set: copy 2 bytes */
        ldrbcs      lr, [r1], #1
        strbcs      ip, [r0], #1
        strbcs      lr, [r0], #1

        /* reload the r0 and lr values saved on the stack, then return */
        ldmfd       sp!, {r0, lr}
        bx          lr
END(MEMCPY_BASE)

/*
 * MEMCPY_BASE_ALIGNED: integer-register (ldm/stm) copy path.
 *
 * Register roles, as used by the code below:
 *   r0 = destination pointer (advanced as bytes are stored)
 *   r1 = source pointer      (advanced as bytes are loaded)
 *   r2 = number of bytes still to copy
 *   r3-r8, r12, lr = scratch (r4-r8 are saved and restored here)
 *
 * The CFI directives and the epilogue (ldmfd sp!, {r0, pc}) imply that
 * {r0, lr} were pushed before control reached this routine. The only entry
 * visible in this file is the branch from MEMCPY_BASE, taken when r2 >= 16
 * and the low two bits of r0 and r1 are equal, so word-aligning the source
 * below word-aligns the destination as well.
 */
ENTRY_PRIVATE(MEMCPY_BASE_ALIGNED)
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        /* Simple arm-only copy loop to handle aligned copy operations */
        stmfd       sp!, {r4-r8}
        .cfi_adjust_cfa_offset 20
        .cfi_rel_offset r4, 0
        .cfi_rel_offset r5, 4
        .cfi_rel_offset r6, 8
        .cfi_rel_offset r7, 12
        .cfi_rel_offset r8, 16
        pld         [r1, #(32 * 4)]

        /* Check alignment: r3 = bytes needed to word-align the source */
        rsb         r3, r1, #0
        ands        r3, #3
        beq         2f

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs        r12, r3, lsl #31        /* N <- bit 0 of r3, C <- bit 1 of r3 */
        sub         r2, r2, r3              /* r3 <= 3; the entry from MEMCPY_BASE has r2 >= 16 */
        itt         mi
        ldrbmi      r3, [r1], #1            /* bit 0 set: copy 1 byte */
        strbmi      r3, [r0], #1
        itttt       cs
        ldrbcs      r4, [r1], #1            /* bit 1 set: copy 2 bytes */
        ldrbcs      r5, [r1], #1
        strbcs      r4, [r0], #1
        strbcs      r5, [r0], #1

2:
        /* bias the count by -64 so the loop can test the sign after each pass */
        subs        r2, r2, #64
        blt         4f

3:      /* Main copy loop, copying 64 bytes at a time */
        pld         [r1, #(32 * 8)]
        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        subs        r2, r2, #64
        bge         3b

4:      /* Undo the bias, then check if there are >= 32 bytes left */
        adds        r2, r2, #64
        subs        r2, r2, #32
        blt         5f

        /* Copy 32 bytes */
        ldmia       r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        stmia       r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        subs        r2, #32

5:      /* Handle any remaining bytes (fewer than 32 once the bias is undone) */
        adds        r2, #32
        beq         6f

        movs        r12, r2, lsl #28        /* C <- bit 4 of r2, N <- bit 3 of r2 */
        itt         cs
        ldmiacs     r1!, {r3, r4, r5, r6}   /* 16 bytes */
        stmiacs     r0!, {r3, r4, r5, r6}
        itt         mi
        ldmiami     r1!, {r7, r8}           /*  8 bytes */
        stmiami     r0!, {r7, r8}
        movs        r12, r2, lsl #30        /* C <- bit 2 of r2, N <- bit 1 of r2 */
        itt         cs
        ldrcs       r3, [r1], #4            /*  4 bytes */
        strcs       r3, [r0], #4
        itt         mi
        ldrhmi      r4, [r1], #2            /*  2 bytes */
        strhmi      r4, [r0], #2
        tst         r2, #0x1
        itt         ne
        ldrbne      r3, [r1]                /*  last byte */
        strbne      r3, [r0]
6:
        /* restore r4-r8, then pop the saved r0 and return via the saved lr */
        ldmfd       sp!, {r4-r8}
        ldmfd       sp!, {r0, pc}
END(MEMCPY_BASE_ALIGNED)