blob: 2a62816d2ddd54c61d7ef2054653651b3e87fb52 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001!
2! Fast SH memcpy
3!
4! by Toshiyasu Morita (tm@netcom.com)
5! hacked by J"orn Rernnecke (joern.rennecke@superh.com) ("o for o-umlaut)
6! SH5 code Copyright 2002 SuperH Ltd.
7!
8! Entry: ARG0: destination pointer
9! ARG1: source pointer
10! ARG2: byte count
11!
12! Exit: RESULT: destination pointer
13! any other registers in the range r0-r7: trashed
14!
15! Notes: Usually one wants to do small reads and write a longword, but
16! unfortunately it is difficult in some cases to concatanate bytes
17! into a longword on the SH, so this does a longword read and small
18! writes.
19!
20! This implementation makes two assumptions about how it is called:
21!
22! 1.: If the byte count is nonzero, the address of the last byte to be
23! copied is unsigned greater than the address of the first byte to
24! be copied. This could be easily swapped for a signed comparison,
25! but the algorithm used needs some comparison.
26!
27! 2.: When there are two or three bytes in the last word of an 11-or-more
28! bytes memory chunk to b copied, the rest of the word can be read
29! without side effects.
30! This could be easily changed by increasing the minumum size of
31! a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
32! however, this would cost a few extra cyles on average.
33! For SHmedia, the assumption is that any quadword can be read in its
34! enirety if at least one byte is included in the copy.
35
36/* Imported into Linux kernel by Richard Curnow. This is used to implement the
37 __copy_user function in the general case, so it has to be a distinct
38 function from intra-kernel memcpy to allow for exception fix-ups in the
39 event that the user pointer is bad somewhere in the copy (e.g. due to
40 running off the end of the vma).
41
42 Note, this algorithm will be slightly wasteful in the case where the source
43 and destination pointers are equally aligned, because the stlo/sthi pairs
44 could then be merged back into single stores. If there are a lot of cache
45 misses, this is probably offset by the stall lengths on the preloads.
46
47*/
48
49/* NOTE : Prefetches removed and allocos guarded by synco to avoid TAKum03020
50 * erratum. The first two prefetches are nop-ed out to avoid upsetting the
51 * instruction counts used in the jump address calculation.
52 * */
53
54 .section .text..SHmedia32,"ax"
55 .little
56 .balign 32
57 .global copy_user_memcpy
58 .global copy_user_memcpy_end
59copy_user_memcpy:
60
61#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
62#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
63#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
64#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
65
66 nop ! ld.b r3,0,r63 ! TAKum03020
67 pta/l Large,tr0
68 movi 25,r0
69 bgeu/u r4,r0,tr0
70 nsb r4,r0
71 shlli r0,5,r0
72 movi (L1-L0+63*32 + 1) & 0xffff,r1
73 sub r1, r0, r0
74L0: ptrel r0,tr0
75 add r2,r4,r5
76 ptabs r18,tr1
77 add r3,r4,r6
78 blink tr0,r63
79
80/* Rearranged to make cut2 safe */
81 .balign 8
82L4_7: /* 4..7 byte memcpy cntd. */
83 stlo.l r2, 0, r0
84 or r6, r7, r6
85 sthi.l r5, -1, r6
86 stlo.l r5, -4, r6
87 blink tr1,r63
88
89 .balign 8
90L1: /* 0 byte memcpy */
91 nop
92 blink tr1,r63
93 nop
94 nop
95 nop
96 nop
97
98L2_3: /* 2 or 3 byte memcpy cntd. */
99 st.b r5,-1,r6
100 blink tr1,r63
101
102 /* 1 byte memcpy */
103 ld.b r3,0,r0
104 st.b r2,0,r0
105 blink tr1,r63
106
107L8_15: /* 8..15 byte memcpy cntd. */
108 stlo.q r2, 0, r0
109 or r6, r7, r6
110 sthi.q r5, -1, r6
111 stlo.q r5, -8, r6
112 blink tr1,r63
113
114 /* 2 or 3 byte memcpy */
115 ld.b r3,0,r0
116 nop ! ld.b r2,0,r63 ! TAKum03020
117 ld.b r3,1,r1
118 st.b r2,0,r0
119 pta/l L2_3,tr0
120 ld.b r6,-1,r6
121 st.b r2,1,r1
122 blink tr0, r63
123
124 /* 4 .. 7 byte memcpy */
125 LDUAL (r3, 0, r0, r1)
126 pta L4_7, tr0
127 ldlo.l r6, -4, r7
128 or r0, r1, r0
129 sthi.l r2, 3, r0
130 ldhi.l r6, -1, r6
131 blink tr0, r63
132
133 /* 8 .. 15 byte memcpy */
134 LDUAQ (r3, 0, r0, r1)
135 pta L8_15, tr0
136 ldlo.q r6, -8, r7
137 or r0, r1, r0
138 sthi.q r2, 7, r0
139 ldhi.q r6, -1, r6
140 blink tr0, r63
141
142 /* 16 .. 24 byte memcpy */
143 LDUAQ (r3, 0, r0, r1)
144 LDUAQ (r3, 8, r8, r9)
145 or r0, r1, r0
146 sthi.q r2, 7, r0
147 or r8, r9, r8
148 sthi.q r2, 15, r8
149 ldlo.q r6, -8, r7
150 ldhi.q r6, -1, r6
151 stlo.q r2, 8, r8
152 stlo.q r2, 0, r0
153 or r6, r7, r6
154 sthi.q r5, -1, r6
155 stlo.q r5, -8, r6
156 blink tr1,r63
157
158Large:
159 ! ld.b r2, 0, r63 ! TAKum03020
160 pta/l Loop_ua, tr1
161 ori r3, -8, r7
162 sub r2, r7, r22
163 sub r3, r2, r6
164 add r2, r4, r5
165 ldlo.q r3, 0, r0
166 addi r5, -16, r5
167 movi 64+8, r27 ! could subtract r7 from that.
168 stlo.q r2, 0, r0
169 sthi.q r2, 7, r0
170 ldx.q r22, r6, r0
171 bgtu/l r27, r4, tr1
172
173 addi r5, -48, r27
174 pta/l Loop_line, tr0
175 addi r6, 64, r36
176 addi r6, -24, r19
177 addi r6, -16, r20
178 addi r6, -8, r21
179
180Loop_line:
181 ! ldx.q r22, r36, r63 ! TAKum03020
182 alloco r22, 32
183 synco
184 addi r22, 32, r22
185 ldx.q r22, r19, r23
186 sthi.q r22, -25, r0
187 ldx.q r22, r20, r24
188 ldx.q r22, r21, r25
189 stlo.q r22, -32, r0
190 ldx.q r22, r6, r0
191 sthi.q r22, -17, r23
192 sthi.q r22, -9, r24
193 sthi.q r22, -1, r25
194 stlo.q r22, -24, r23
195 stlo.q r22, -16, r24
196 stlo.q r22, -8, r25
197 bgeu r27, r22, tr0
198
199Loop_ua:
200 addi r22, 8, r22
201 sthi.q r22, -1, r0
202 stlo.q r22, -8, r0
203 ldx.q r22, r6, r0
204 bgtu/l r5, r22, tr1
205
206 add r3, r4, r7
207 ldlo.q r7, -8, r1
208 sthi.q r22, 7, r0
209 ldhi.q r7, -1, r7
210 ptabs r18,tr1
211 stlo.q r22, 0, r0
212 or r1, r7, r1
213 sthi.q r5, 15, r1
214 stlo.q r5, 8, r1
215 blink tr1, r63
216copy_user_memcpy_end:
217 nop