/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#define dstin   x0
#define src     x1
#define count   x2
#define tmp1    x3
#define tmp1w   w3
#define tmp2    x4
#define tmp2w   w4
#define tmp3    x5
#define tmp3w   w5
#define dst     x6

#define A_l     x7
#define A_h     x8
#define B_l     x9
#define B_h     x10
#define C_l     x11
#define C_h     x12
#define D_l     x13
#define D_h     x14

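/* memcpy must return its original destination pointer, so dstin (x0)
 * is preserved and dst (x6) is used as the working destination.  No
 * function label appears here: this body is presumably included from
 * a wrapper that supplies the actual entry point and directives. */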
        mov     dst, dstin
        cmp     count, #64
        b.ge    .Lcpy_not_short
        cmp     count, #15
        b.le    .Ltail15tiny

        /* Deal with small copies quickly by dropping straight into the
         * exit block. */
.Ltail63:
        /* Copy up to 48 bytes of data. At this point we only need the
         * bottom 6 bits of count to be accurate. */
        ands    tmp1, count, #0x30
        b.eq    .Ltail15
        add     dst, dst, tmp1
        add     src, src, tmp1
        cmp     tmp1w, #0x20
        b.eq    1f
        b.lt    2f
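        /* tmp1 is 16, 32 or 48 here: a 48-byte residue falls through
         * all three load/store pairs, 32 bytes enter at 1:, and 16
         * bytes at 2:.  The negative offsets work back from the
         * already-advanced src and dst. */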
        ldp     A_l, A_h, [src, #-48]
        stp     A_l, A_h, [dst, #-48]
1:
        ldp     A_l, A_h, [src, #-32]
        stp     A_l, A_h, [dst, #-32]
2:
        ldp     A_l, A_h, [src, #-16]
        stp     A_l, A_h, [dst, #-16]

.Ltail15:
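        /* Copy the final 1-15 bytes as a single overlapping 16-byte
         * load/store pair ending exactly at the end of the buffer.
         * This is safe because at least 16 bytes have already been
         * copied on every path that reaches here. */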
        ands    count, count, #15
        b.eq    1f
        add     src, src, count
        ldp     A_l, A_h, [src, #-16]
        add     dst, dst, count
        stp     A_l, A_h, [dst, #-16]
1:
        ret

.Ltail15tiny:
        /* Copy up to 15 bytes of data. Does not assume that any
         * additional data has already been copied, so no overlapping
         * accesses are used. */
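        /* Each tbz tests one bit of count and skips the matching
         * power-of-two copy: bit 3 = 8 bytes, bit 2 = 4, bit 1 = 2,
         * bit 0 = 1.  Post-indexed addressing advances src and dst
         * as it goes. */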
        tbz     count, #3, 1f
        ldr     tmp1, [src], #8
        str     tmp1, [dst], #8
1:
        tbz     count, #2, 1f
        ldr     tmp1w, [src], #4
        str     tmp1w, [dst], #4
1:
        tbz     count, #1, 1f
        ldrh    tmp1w, [src], #2
        strh    tmp1w, [dst], #2
1:
        tbz     count, #0, 1f
        ldrb    tmp1w, [src]
        strb    tmp1w, [dst]
1:
        ret

.Lcpy_not_short:
        /* We don't much care about the alignment of DST, but we want
         * SRC to be 128-bit (16 byte) aligned so that loads never cross
         * a cache line boundary; at worst only the stores can. */
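        /* (-src) & 15 is the distance from src up to the next 16-byte
         * boundary (0 if src is already aligned, in which case we skip
         * straight to the copy). */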
        neg     tmp2, src
        ands    tmp2, tmp2, #15         /* Bytes to reach alignment. */
        b.eq    2f
        sub     count, count, tmp2
        /* Copy more data than needed; it's faster than jumping
         * around copying sub-Quadword quantities. We know that
         * it can't overrun. */
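        /* Up to 15 of these 16 bytes are copied again by the aligned
         * code below; count >= 64 on entry to .Lcpy_not_short, so the
         * unconditional 16-byte access stays inside the buffer. */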
        ldp     A_l, A_h, [src]
        add     src, src, tmp2
        stp     A_l, A_h, [dst]
        add     dst, dst, tmp2
        /* There may be fewer than 64 bytes left now. */
        cmp     count, #63
        b.le    .Ltail63
2:
        subs    count, count, #128
        b.ge    .Lcpy_body_large
        /* Less than 128 bytes to copy, so handle 64 here and then jump
         * to the tail. */
        ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, #16]
        ldp     C_l, C_h, [src, #32]
        ldp     D_l, D_h, [src, #48]
        stp     A_l, A_h, [dst]
        stp     B_l, B_h, [dst, #16]
        stp     C_l, C_h, [dst, #32]
        stp     D_l, D_h, [dst, #48]
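        /* count went negative when 128 was subtracted, but its low six
         * bits still hold the residual byte count, and .Ltail63 only
         * looks at those. */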
        tst     count, #0x3f
        add     src, src, #64
        add     dst, dst, #64
        b.ne    .Ltail63
        ret

        /* Critical loop. Start at a new cache line boundary. Assuming
         * 64 bytes per line this ensures the entire loop is in one line. */
        .p2align 6
.Lcpy_body_large:
        /* There are at least 128 bytes to copy. */
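        /* Load the first 64 bytes and leave both pointers biased 16
         * bytes below the next block to process: dst is stepped back
         * by 16, and the final writeback load leaves src at +48.  The
         * pre-indexed [reg, #imm]! forms in the loop then advance each
         * pointer by 64 per iteration while addressing the right
         * bytes. */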
        ldp     A_l, A_h, [src, #0]
        sub     dst, dst, #16           /* Pre-bias. */
        ldp     B_l, B_h, [src, #16]
        ldp     C_l, C_h, [src, #32]
        ldp     D_l, D_h, [src, #48]!   /* Writeback: src += 48, i.e. 64 minus the 16-byte bias. */
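        /* Software-pipelined: each iteration stores the 64 bytes loaded
         * on the previous pass while loading the next 64, keeping loads
         * and stores interleaved. */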
1:
        stp     A_l, A_h, [dst, #16]
        ldp     A_l, A_h, [src, #16]
        stp     B_l, B_h, [dst, #32]
        ldp     B_l, B_h, [src, #32]
        stp     C_l, C_h, [dst, #48]
        ldp     C_l, C_h, [src, #48]
        stp     D_l, D_h, [dst, #64]!
        ldp     D_l, D_h, [src, #64]!
        subs    count, count, #64
        b.ge    1b
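        /* Drain the pipeline: the last 64 bytes loaded above still have
         * to be stored; then remove the 16-byte bias from src and move
         * dst past the block just written. */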
        stp     A_l, A_h, [dst, #16]
        stp     B_l, B_h, [dst, #32]
        stp     C_l, C_h, [dst, #48]
        stp     D_l, D_h, [dst, #64]
        add     src, src, #16
        add     dst, dst, #64 + 16
        tst     count, #0x3f
        b.ne    .Ltail63
        ret