/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License. See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 * memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

/*
 * Hack to resolve longstanding prefetch issue
 *
 * Prefetching may be fatal on some systems if we're prefetching beyond the
 * end of memory. It's also a seriously bad idea on non dma-coherent systems.
 */
#if !defined(CONFIG_DMA_COHERENT) || !defined(CONFIG_DMA_IP27)
#undef CONFIG_CPU_HAS_PREFETCH
#endif
#ifdef CONFIG_MIPS_MALTA
#undef CONFIG_CPU_HAS_PREFETCH
#endif

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 * - src and dst don't overlap
 * - src is readable
 * - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 * copy_to_user
 * - src is readable (no exceptions when reading src)
 * copy_from_user
 * - dst is writable (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * include/asm-mips/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */

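/*
 * Roughly, in C terms (a sketch of the contracts above, not the exact
 * kernel prototypes):
 *
 *   void *memcpy(void *dst, const void *src, size_t len);  -- returns dst
 *
 *   __copy_user(dst, src, len): copies up to len bytes; on return,
 *   len (a2) holds the number of bytes left uncopied (0 on full success).
 */
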
/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contains the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry).
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */

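/*
 * EXC wraps a single (possibly faulting) instruction and records an
 * exception-table entry for it: if the access at local label 9 faults,
 * the kernel's exception fixup transfers control to 'handler'.
 */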
#define EXC(inst_reg,addr,handler) \
9: inst_reg, addr; \
 .section __ex_table,"a"; \
 PTR 9b, handler; \
 .previous

/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD ld
#define LOADL ldl
#define LOADR ldr
#define STOREL sdl
#define STORER sdr
#define STORE sd
#define ADD daddu
#define SUB dsubu
#define SRL dsrl
#define SRA dsra
#define SLL dsll
#define SLLV dsllv
#define SRLV dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we are sharing the code base with the mips32 tree (which uses the o32
 * ABI register definitions), we need to redefine the register definitions
 * from the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0 $8
#define t1 $9
#define t2 $10
#define t3 $11
#define t4 $12
#define t5 $13
#define t6 $14
#define t7 $15

#else

#define LOAD lw
#define LOADL lwl
#define LOADR lwr
#define STOREL swl
#define STORER swr
#define STORE sw
#define ADD addu
#define SUB subu
#define SRL srl
#define SLL sll
#define SRA sra
#define SLLV sllv
#define SRLV srlv
#define NBYTES 4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST LOADL
#define STFIRST STORER
#define STREST STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST LOADR
#define STFIRST STOREL
#define STREST STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit) (FIRST(unit)+NBYTES-1)
#define UNIT(unit) FIRST(unit)

#define ADDRMASK (NBYTES-1)
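/*
 * For reference: with NBYTES == 4, FIRST(1) is offset 4 and REST(1) is
 * offset 7, i.e. the first and last byte of destination unit 1; ADDRMASK
 * selects the low address bits that must be zero for an aligned access.
 */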

 .text
 .set noreorder
 .set noat

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
 .align 5
LEAF(__copy_user_inatomic)
 /*
 * Note: dst & src may be unaligned, len may be 0
 * Temps
 */
#define rem t8

 /*
 * The "issue break"s below are very approximate.
 * Issue delays for dcache fills will perturb the schedule, as will
 * load queue full replay traps, etc.
 *
 * If len < NBYTES use byte operations.
 */
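 /*
 * Dispatch below (in effect): len < NBYTES goes to copy_bytes_checklen,
 * an unaligned dst goes to dst_unaligned, an unaligned src with aligned
 * dst goes to src_unaligned_dst_aligned, and the remaining case falls
 * through to both_aligned.
 */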
 PREF( 0, 0(src) )
 PREF( 1, 0(dst) )
 sltu t2, len, NBYTES
 and t1, dst, ADDRMASK
 PREF( 0, 1*32(src) )
 PREF( 1, 1*32(dst) )
 bnez t2, copy_bytes_checklen
 and t0, src, ADDRMASK
 PREF( 0, 2*32(src) )
 PREF( 1, 2*32(dst) )
 bnez t1, dst_unaligned
 nop
 bnez t0, src_unaligned_dst_aligned
 /*
 * use delay slot for fall-through
 * src and dst are aligned; need to compute rem
 */
both_aligned:
 SRL t0, len, LOG_NBYTES+3 # +3 for 8 units/iter
 beqz t0, cleanup_both_aligned # len < 8*NBYTES
 and rem, len, (8*NBYTES-1) # rem = len % (8*NBYTES)
 PREF( 0, 3*32(src) )
 PREF( 1, 3*32(dst) )
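 /*
 * Main loop: each iteration moves 8*NBYTES bytes, loading into two
 * groups of registers and interleaving the stores so loads and stores
 * can overlap; it exits once len has been reduced to rem.
 */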
 .align 4
1:
EXC( LOAD t0, UNIT(0)(src), l_exc)
EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
 SUB len, len, 8*NBYTES
EXC( LOAD t4, UNIT(4)(src), l_exc_copy)
EXC( LOAD t7, UNIT(5)(src), l_exc_copy)
 STORE t0, UNIT(0)(dst)
 STORE t1, UNIT(1)(dst)
EXC( LOAD t0, UNIT(6)(src), l_exc_copy)
EXC( LOAD t1, UNIT(7)(src), l_exc_copy)
 ADD src, src, 8*NBYTES
 ADD dst, dst, 8*NBYTES
 STORE t2, UNIT(-6)(dst)
 STORE t3, UNIT(-5)(dst)
 STORE t4, UNIT(-4)(dst)
 STORE t7, UNIT(-3)(dst)
 STORE t0, UNIT(-2)(dst)
 STORE t1, UNIT(-1)(dst)
 PREF( 0, 8*32(src) )
 PREF( 1, 8*32(dst) )
 bne len, rem, 1b
 nop

 /*
 * len == rem == the number of bytes left to copy < 8*NBYTES
 */
cleanup_both_aligned:
 beqz len, done
 sltu t0, len, 4*NBYTES
 bnez t0, less_than_4units
 and rem, len, (NBYTES-1) # rem = len % NBYTES
 /*
 * len >= 4*NBYTES
 */
EXC( LOAD t0, UNIT(0)(src), l_exc)
EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
 SUB len, len, 4*NBYTES
 ADD src, src, 4*NBYTES
 STORE t0, UNIT(0)(dst)
 STORE t1, UNIT(1)(dst)
 STORE t2, UNIT(2)(dst)
 STORE t3, UNIT(3)(dst)
 beqz len, done
 ADD dst, dst, 4*NBYTES
less_than_4units:
 /*
 * rem = len % NBYTES
 */
 beq rem, len, copy_bytes
 nop
1:
EXC( LOAD t0, 0(src), l_exc)
 ADD src, src, NBYTES
 SUB len, len, NBYTES
 STORE t0, 0(dst)
 bne rem, len, 1b
 ADD dst, dst, NBYTES

 /*
 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
 * A loop would do only a byte at a time with possible branch
 * mispredicts. We can't do an explicit LOAD dst,mask,or,STORE
 * because we can't assume read access to dst. Instead, use
 * STREST dst, which doesn't require read access to dst.
 *
 * This code should perform better than a simple loop on modern,
 * wide-issue mips processors because the code has fewer branches and
 * more instruction-level parallelism.
 */
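 /*
 * For example (a rough sketch): with NBYTES == 4 and len == 3, rem is
 * 24 bits to keep and bits is 8 bits to discard; SHIFT_DISCARD drops
 * the unwanted byte and STREST then writes just the 3 remaining bytes,
 * ending at the last byte of dst (t1 - 1).
 */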
#define bits t2
 beqz len, done
 ADD t1, dst, len # t1 is just past last byte of dst
 li bits, 8*NBYTES
 SLL rem, len, 3 # rem = number of bits to keep
EXC( LOAD t0, 0(src), l_exc)
 SUB bits, bits, rem # bits = number of bits to discard
 SHIFT_DISCARD t0, t0, bits
 STREST t0, -1(t1)
 jr ra
 move len, zero
dst_unaligned:
 /*
 * dst is unaligned
 * t0 = src & ADDRMASK
 * t1 = dst & ADDRMASK; t1 > 0
 * len >= NBYTES
 *
 * Copy enough bytes to align dst
 * Set match = (src and dst have same alignment)
 */
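 /*
 * For example (sketch): with NBYTES == 4 and dst & ADDRMASK == 1, the
 * LDFIRST/LDREST pair below fetches a full unit from src and STFIRST
 * writes the t2 = 4 - 1 = 3 bytes needed to bring dst up to the next
 * alignment boundary.
 */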
#define match rem
EXC( LDFIRST t3, FIRST(0)(src), l_exc)
 ADD t2, zero, NBYTES
EXC( LDREST t3, REST(0)(src), l_exc_copy)
 SUB t2, t2, t1 # t2 = number of bytes copied
 xor match, t0, t1
 STFIRST t3, FIRST(0)(dst)
 beq len, t2, done
 SUB len, len, t2
 ADD dst, dst, t2
 beqz match, both_aligned
 ADD src, src, t2

src_unaligned_dst_aligned:
 SRL t0, len, LOG_NBYTES+2 # +2 for 4 units/iter
 PREF( 0, 3*32(src) )
 beqz t0, cleanup_src_unaligned
 and rem, len, (4*NBYTES-1) # rem = len % 4*NBYTES
 PREF( 1, 3*32(dst) )
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
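/*
 * Each LDFIRST/LDREST pair below assembles one aligned destination unit
 * from an unaligned source: the two partial loads (lwl/lwr or ldl/ldr,
 * per the LDFIRST/LDREST definitions above) fill the register, which is
 * then written with a single aligned STORE.
 */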
EXC( LDFIRST t0, FIRST(0)(src), l_exc)
EXC( LDFIRST t1, FIRST(1)(src), l_exc_copy)
 SUB len, len, 4*NBYTES
EXC( LDREST t0, REST(0)(src), l_exc_copy)
EXC( LDREST t1, REST(1)(src), l_exc_copy)
EXC( LDFIRST t2, FIRST(2)(src), l_exc_copy)
EXC( LDFIRST t3, FIRST(3)(src), l_exc_copy)
EXC( LDREST t2, REST(2)(src), l_exc_copy)
EXC( LDREST t3, REST(3)(src), l_exc_copy)
 PREF( 0, 9*32(src) ) # 0 is PREF_LOAD (not streamed)
 ADD src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
 nop # improves slotting
#endif
 STORE t0, UNIT(0)(dst)
 STORE t1, UNIT(1)(dst)
 STORE t2, UNIT(2)(dst)
 STORE t3, UNIT(3)(dst)
 PREF( 1, 9*32(dst) ) # 1 is PREF_STORE (not streamed)
 bne len, rem, 1b
 ADD dst, dst, 4*NBYTES

cleanup_src_unaligned:
 beqz len, done
 and rem, len, NBYTES-1 # rem = len % NBYTES
 beq rem, len, copy_bytes
 nop
1:
EXC( LDFIRST t0, FIRST(0)(src), l_exc)
EXC( LDREST t0, REST(0)(src), l_exc_copy)
 ADD src, src, NBYTES
 SUB len, len, NBYTES
 STORE t0, 0(dst)
 bne len, rem, 1b
 ADD dst, dst, NBYTES

copy_bytes_checklen:
 beqz len, done
 nop
copy_bytes:
 /* 0 < len < NBYTES */
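 /*
 * Copy the trailing 1..NBYTES-1 bytes: each COPY_BYTE below exits via
 * 'done' once len reaches 0, and the final possible byte (at offset
 * NBYTES-2) is handled outside the macro so its sb can sit in the jr
 * delay slot.
 */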
#define COPY_BYTE(N) \
EXC( lb t0, N(src), l_exc); \
 SUB len, len, 1; \
 beqz len, done; \
 sb t0, N(dst)

 COPY_BYTE(0)
 COPY_BYTE(1)
#ifdef USE_DOUBLE
 COPY_BYTE(2)
 COPY_BYTE(3)
 COPY_BYTE(4)
 COPY_BYTE(5)
#endif
EXC( lb t0, NBYTES-2(src), l_exc)
 SUB len, len, 1
 jr ra
 sb t0, NBYTES-2(dst)
done:
 jr ra
 nop
 END(__copy_user_inatomic)

l_exc_copy:
 /*
 * Copy bytes from src until faulting load address (or until a
 * lb faults)
 *
 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
 * may be more than a byte beyond the last address.
 * Hence, the lb below may get an exception.
 *
 * Assumes src < THREAD_BUADDR($28)
 */
 LOAD t0, TI_TASK($28)
 nop
 LOAD t0, THREAD_BUADDR(t0)
1:
EXC( lb t1, 0(src), l_exc)
 ADD src, src, 1
 sb t1, 0(dst) # can't fault -- we're copy_from_user
 bne src, t0, 1b
 ADD dst, dst, 1
l_exc:
 LOAD t0, TI_TASK($28)
 nop
 LOAD t0, THREAD_BUADDR(t0) # t0 is just past last good address
 nop
 SUB len, AT, t0 # len = number of uncopied bytes
 jr ra
 nop