blob: dd300c372ce1966148910c86bd472d87b3066d16 [file] [log] [blame]
Paul Mundt4466b202008-12-12 16:34:44 +09001/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
2/* Modified by SuperH, Inc. September 2003 */
3!
4! Fast SH memcpy
5!
6! by Toshiyasu Morita (tm@netcom.com)
7! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
8! SH5 code Copyright 2002 SuperH Ltd.
9!
10! Entry: ARG0: destination pointer
11! ARG1: source pointer
12! ARG2: byte count
13!
14! Exit: RESULT: destination pointer
15! any other registers in the range r0-r7: trashed
16!
17! Notes: Usually one wants to do small reads and write a longword, but
18! unfortunately it is difficult in some cases to concatenate bytes
19! into a longword on the SH, so this does a longword read and small
20! writes.
21!
22! This implementation makes two assumptions about how it is called:
23!
24! 1.: If the byte count is nonzero, the address of the last byte to be
25! copied is unsigned greater than the address of the first byte to
26! be copied. This could be easily swapped for a signed comparison,
27! but the algorithm used needs some comparison.
28!
29! 2.: When there are two or three bytes in the last word of an 11-or-more
30! bytes memory chunk to be copied, the rest of the word can be read
31! without side effects.
32! This could be easily changed by increasing the minimum size of
33! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
34! however, this would cost a few extra cycles on average.
35! For SHmedia, the assumption is that any quadword can be read in its
36! entirety if at least one byte is included in the copy.
37!
38
/*
 * memcpy -- entry point and dispatch on the byte count.
 *
 * Register use as visible in this routine:
 *   r2  = destination pointer (never written, so still valid on return)
 *   r3  = source pointer
 *   r4  = byte count
 *   r18 = return address (copied into tr1 with ptabs)
 *
 * Counts of 25 or more branch to Large.  Smaller counts jump into a
 * table of code slots that starts at L1 and has a 32-byte stride:
 *
 *   target = L1 + 32 * (63 - nsb(count)) + 1
 *
 * Going by the slot comments below, that selects the handlers for
 * 0, 1, 2..3, 4..7, 8..15 and 16..24 bytes in that order.  The stride is
 * hard-wired by "shlli r0,5" and "63*32", so no instruction may be added
 * to or removed from the code between L1 and Large.
 * NOTE(review): the "+ 1" presumably sets the SHmedia mode bit of the
 * branch target -- confirm against the SH-5 manual.
 */
39 .section .text..SHmedia32,"ax"
40 .globl memcpy
41 .type memcpy, @function
42 .align 5
43
44memcpy:
45
/*
 * Unaligned access helpers: each expands to a lo/hi pair of partial
 * accesses covering O..O+7 (quadword) or O..O+3 (longword).  Users of the
 * load forms OR the two results together.  STUAQ and STUAL are defined
 * here but not used in the code visible in this file.
 */
46#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
47#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
48#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
49#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
50
51 ld.b r3,0,r63 /* read the first source byte; result goes to r63 (discarded) */
52 pta/l Large,tr0
53 movi 25,r0
54 bgeu/u r4,r0,tr0 /* count >= 25 (unsigned): go to Large */
55 nsb r4,r0 /* r0 = nsb(count) */
56 shlli r0,5,r0 /* r0 = 32 * nsb(count) */
57 movi (L1-L0+63*32 + 1) & 0xffff,r1
58 sub r1, r0, r0 /* r0 = offset of the selected slot relative to L0 */
59L0: ptrel r0,tr0 /* tr0 = L0 + r0, i.e. the selected slot */
60 add r2,r4,r5 /* r5 = destination + count (one past the last byte) */
61 ptabs r18,tr1 /* tr1 = return address, used by every slot to return */
62 add r3,r4,r6 /* r6 = source + count (one past the last byte) */
63 blink tr0,r63 /* jump into the slot table without linking */
64
/*
 * L4_7: second half of the 4..7 byte copy, reached from that slot via
 * "pta L4_7, tr0" / "blink tr0".  It sits ahead of the slot table (L1).
 *
 * State on entry, as set up by the entry code and the 4..7 byte slot:
 *   r0 = first four source bytes (the two partial loads already OR-ed)
 *   r7 = ldlo.l part, r6 = ldhi.l part of the last four source bytes
 *   r5 = destination + count, tr1 = return address
 * The sthi.l half of the head store has already been issued.
 * The head and tail longwords may overlap; both carry the same source
 * bytes in the overlapping positions.
 */
65/* Rearranged to make cut2 safe */
66 .balign 8
67L4_7: /* 4..7 byte memcpy cntd. */
68 stlo.l r2, 0, r0 /* finish the head store at destination + 0 */
69 or r6, r7, r6 /* r6 = last four source bytes */
70 sthi.l r5, -1, r6 /* tail store covering destination + count - 4 .. count - 1 */
71 stlo.l r5, -4, r6
72 blink tr1,r63 /* return */
73
/*
 * L1: base of the slot table used by the entry dispatch, and slot 0
 * (count == 0), which simply returns.
 *
 * The four nops after the blink plus the two-instruction L2_3 fragment
 * fill this slot to eight instructions (32 bytes with 4-byte SHmedia
 * instructions), which is the stride the dispatch code assumes.  Do not
 * add or remove instructions here.
 */
74 .balign 8
75L1: /* 0 byte memcpy */
76 nop
77 blink tr1,r63 /* nothing to copy: return */
78 nop
79 nop
80 nop
81 nop
82
/*
 * L2_3: tail of the 2 or 3 byte copy.  On entry r6 holds the last source
 * byte (loaded by that slot) and r5 = destination + count.
 */
83L2_3: /* 2 or 3 byte memcpy cntd. */
84 st.b r5,-1,r6 /* store the last byte at destination + count - 1 */
85 blink tr1,r63 /* return */
86
/*
 * Slot 1 of the dispatch table (count == 1): copy one byte and return.
 * These three instructions plus the five-instruction L8_15 fragment below
 * make up one eight-instruction (32-byte) slot; keep the count unchanged.
 */
87 /* 1 byte memcpy */
88 ld.b r3,0,r0
89 st.b r2,0,r0
90 blink tr1,r63 /* return */
91
/*
 * L8_15: second half of the 8..15 byte copy, reached from that slot via
 * "pta L8_15, tr0" / "blink tr0".
 *
 * State on entry, as set up by the entry code and the 8..15 byte slot:
 *   r0 = first eight source bytes (the two partial loads already OR-ed)
 *   r7 = ldlo.q part, r6 = ldhi.q part of the last eight source bytes
 *   r5 = destination + count, tr1 = return address
 * The sthi.q half of the head store has already been issued.
 */
92L8_15: /* 8..15 byte memcpy cntd. */
93 stlo.q r2, 0, r0 /* finish the head store at destination + 0 */
94 or r6, r7, r6 /* r6 = last eight source bytes */
95 sthi.q r5, -1, r6 /* tail store covering destination + count - 8 .. count - 1 */
96 stlo.q r5, -8, r6
97 blink tr1,r63 /* return */
98
/*
 * Slot 2 of the dispatch table (count 2 or 3), exactly eight instructions.
 *
 * Bytes 0 and 1 are copied here; the last source byte (at r6 - 1, where
 * r6 = source + count from the entry code) is loaded here and stored by
 * L2_3.  For count == 2 that last byte is byte 1 again, so it is simply
 * stored twice with the same value.
 */
99 /* 2 or 3 byte memcpy */
100 ld.b r3,0,r0 /* r0 = source byte 0 */
101 ld.b r2,0,r63 /* read the first destination byte; result discarded (r63) */
102 ld.b r3,1,r1 /* r1 = source byte 1 */
103 st.b r2,0,r0
104 pta/l L2_3,tr0
105 ld.b r6,-1,r6 /* r6 = last source byte (replaces the end pointer) */
106 st.b r2,1,r1
107 blink tr0, r63 /* continue at L2_3 */
108
/*
 * Slot 3 of the dispatch table (count 4..7).  LDUAL expands to two
 * instructions, so this slot is exactly eight instructions long.
 *
 * Strategy: copy the first four bytes and the last four bytes of the
 * range as two unaligned longwords, which may overlap.  The loads and the
 * first half of the head store happen here; L4_7 performs the remaining
 * stores.  r6 = source + count on entry (set by the entry code).
 */
109 /* 4 .. 7 byte memcpy */
110 LDUAL (r3, 0, r0, r1) /* r0/r1 = lo/hi parts of source bytes 0..3 */
111 pta L4_7, tr0
112 ldlo.l r6, -4, r7 /* r7 = lo part of the last four source bytes */
113 or r0, r1, r0 /* r0 = source bytes 0..3 */
114 sthi.l r2, 3, r0 /* first half of the head store */
115 ldhi.l r6, -1, r6 /* r6 = hi part of the last four source bytes */
116 blink tr0, r63 /* continue at L4_7 */
117
/*
 * Slot 4 of the dispatch table (count 8..15).  LDUAQ expands to two
 * instructions, so this slot is exactly eight instructions long.
 *
 * Strategy: copy the first eight bytes and the last eight bytes of the
 * range as two unaligned quadwords, which may overlap.  The loads and the
 * first half of the head store happen here; L8_15 performs the remaining
 * stores.  r6 = source + count on entry (set by the entry code).
 */
118 /* 8 .. 15 byte memcpy */
119 LDUAQ (r3, 0, r0, r1) /* r0/r1 = lo/hi parts of source bytes 0..7 */
120 pta L8_15, tr0
121 ldlo.q r6, -8, r7 /* r7 = lo part of the last eight source bytes */
122 or r0, r1, r0 /* r0 = source bytes 0..7 */
123 sthi.q r2, 7, r0 /* first half of the head store */
124 ldhi.q r6, -1, r6 /* r6 = hi part of the last eight source bytes */
125 blink tr0, r63 /* continue at L8_15 */
126
/*
 * Slot 5 of the dispatch table (count 16..24).  This is the last slot and
 * is followed directly by Large, so it is the only handler that completes
 * without a separate continuation fragment.
 *
 * Strategy: copy source bytes 0..15 as two unaligned quadwords, then copy
 * the last eight bytes of the range as a third unaligned quadword that
 * may overlap the second.  The upper bound of 24 matches the entry code,
 * which sends counts of 25 and above to Large.
 * On entry r5 = destination + count and r6 = source + count.
 */
127 /* 16 .. 24 byte memcpy */
128 LDUAQ (r3, 0, r0, r1) /* r0/r1 = parts of source bytes 0..7 */
129 LDUAQ (r3, 8, r8, r9) /* r8/r9 = parts of source bytes 8..15 */
130 or r0, r1, r0
131 sthi.q r2, 7, r0
132 or r8, r9, r8
133 sthi.q r2, 15, r8
134 ldlo.q r6, -8, r7 /* r7/r6 = parts of the last eight source bytes */
135 ldhi.q r6, -1, r6
136 stlo.q r2, 8, r8
137 stlo.q r2, 0, r0
138 or r6, r7, r6 /* r6 = last eight source bytes */
139 sthi.q r5, -1, r6 /* tail store covering destination + count - 8 .. count - 1 */
140 stlo.q r5, -8, r6
141 blink tr1,r63 /* return */
142
/*
 * Large: copies of 25 bytes or more (branched to from the entry code).
 *
 * Registers established here and used by Loop_line / Loop_ua / the tail:
 *   r6  = source - destination, so "ldx.q r22, r6" reads the source
 *         bytes that belong at destination address r22
 *   r22 = destination cursor = destination + 8 - (source & 7); the
 *         matching source address r22 + r6 is a multiple of 8
 *   r0  = source quadword that belongs at r22 (loaded one step ahead)
 *   r5  = destination + count - 16 (end limit for Loop_ua)
 *   r27 = destination + count - 64 (end limit for Loop_line)
 *   r36, r19, r20, r21 = r6 + 64, r6 - 24, r6 - 16, r6 - 8
 *
 * The head is written as one unaligned quadword at the destination start.
 * NOTE(review): only the bytes up to the first source 8-byte boundary
 * appear to be meaningful in that store; the rest looks to be rewritten
 * by the stores that start at r22 -- confirm against ldlo.q semantics.
 */
143Large:
144 ld.b r2, 0, r63 /* read the first destination byte; result discarded (r63) */
145 pta/l Loop_ua, tr1 /* tr1 is reused as a loop target; reloaded from r18 before returning */
146 ori r3, -8, r7 /* r7 = (source & 7) - 8, a value in -8..-1 */
147 sub r2, r7, r22 /* r22 = destination + 8 - (source & 7) */
148 sub r3, r2, r6 /* r6 = source - destination */
149 add r2, r4, r5
150 ldlo.q r3, 0, r0 /* head: partial load starting at the source pointer */
151 addi r5, -16, r5 /* r5 = destination + count - 16 */
152 movi 64+8, r27 // could subtract r7 from that.
153 stlo.q r2, 0, r0 /* head: unaligned store at the destination start */
154 sthi.q r2, 7, r0
155 ldx.q r22, r6, r0 /* r0 = source quadword for destination r22 */
156 bgtu/l r27, r4, tr1 /* 72 > count: skip the 32-byte loop, go to Loop_ua */
157
158 addi r5, -48, r27 /* r27 = destination + count - 64 */
159 pta/l Loop_line, tr0
160 addi r6, 64, r36 /* offset used for the look-ahead load in Loop_line */
161 addi r6, -24, r19 /* offsets for the three quadwords behind the advanced r22 */
162 addi r6, -16, r20
163 addi r6, -8, r21
164
/*
 * Loop_line: main loop, 32 bytes per iteration.
 *
 * On entry to each iteration r0 holds the source quadword that belongs
 * at destination address r22.  r22 is advanced by 32 first, so within the
 * body the four quadwords being written sit at r22 - 32, - 24, - 16 and
 * - 8.  Each is stored as an sthi.q/stlo.q pair because the destination
 * need not be 8-byte aligned.  r0 is reloaded for the next iteration.
 * The loop repeats while r27 >= r22 (unsigned), with
 * r27 = destination + count - 64 as set up before the loop.
 *
 * NOTE(review): the load into r63 presumably acts as a source prefetch
 * 64 bytes ahead, and alloco presumably allocates the destination cache
 * block without fetching it -- confirm against the SH-5 manual.
 */
165Loop_line:
166 ldx.q r22, r36, r63 /* touch source at r22 + r6 + 64; result discarded (r63) */
167 alloco r22, 32
168 addi r22, 32, r22 /* advance the destination cursor by 32 */
169 ldx.q r22, r19, r23 /* r23 = source quadword for r22 - 24 */
170 sthi.q r22, -25, r0 /* store r0 at r22 - 32 (hi half) */
171 ldx.q r22, r20, r24 /* r24 = source quadword for r22 - 16 */
172 ldx.q r22, r21, r25 /* r25 = source quadword for r22 - 8 */
173 stlo.q r22, -32, r0 /* store r0 at r22 - 32 (lo half) */
174 ldx.q r22, r6, r0 /* r0 = source quadword for the new r22 */
175 sthi.q r22, -17, r23
176 sthi.q r22, -9, r24
177 sthi.q r22, -1, r25
178 stlo.q r22, -24, r23
179 stlo.q r22, -16, r24
180 stlo.q r22, -8, r25
181 bgeu r27, r22, tr0 /* repeat while r27 >= r22 */
182
/*
 * Loop_ua: 8 bytes per iteration; reached by fall-through from Loop_line
 * or directly from Large when count < 72.
 *
 * On entry to each iteration r0 holds the source quadword that belongs
 * at destination address r22.  It is stored there as an sthi.q/stlo.q
 * pair (at r22 - 8 after the increment) and r0 is reloaded for the new
 * r22.  The loop repeats while r5 > r22 (unsigned), with
 * r5 = destination + count - 16.
 */
183Loop_ua:
184 addi r22, 8, r22 /* advance the destination cursor by 8 */
185 sthi.q r22, -1, r0
186 stlo.q r22, -8, r0
187 ldx.q r22, r6, r0 /* r0 = source quadword for the new r22 */
188 bgtu/l r5, r22, tr1 /* repeat while r5 > r22 */
189
/*
 * Tail: store the quadword still pending in r0 at r22, then copy the last
 * eight source bytes as one unaligned quadword to
 * destination + count - 8 .. destination + count - 1 (r5 + 8 .. r5 + 15).
 * tr1 was used as the Loop_ua target, so it is reloaded from r18 to return.
 */
190 add r3, r4, r7 /* r7 = source + count */
191 ldlo.q r7, -8, r1 /* r1 = lo part of the last eight source bytes */
192 sthi.q r22, 7, r0 /* pending quadword, hi half */
193 ldhi.q r7, -1, r7 /* r7 = hi part of the last eight source bytes */
194 ptabs r18,tr1 /* tr1 = return address */
195 stlo.q r22, 0, r0 /* pending quadword, lo half */
196 or r1, r7, r1 /* r1 = last eight source bytes */
197 sthi.q r5, 15, r1
198 stlo.q r5, 8, r1
199 blink tr1, r63 /* return; r2 still holds the destination pointer */
200
201 .size memcpy,.-memcpy