Blame - arch/mips/lib/memcpy-inatomic.S - kernel/msm-4.19

blob: 3a534b2baa0f490e092943a40e3b1c67c215ed45 [file] [log] [blame]

Ralf Baechle	e03b526	2007-02-19 16:59:24 +0000	[diff] [blame]	1	/*
				2	* This file is subject to the terms and conditions of the GNU General Public
				3	* License. See the file "COPYING" in the main directory of this archive
				4	* for more details.
				5	*
				6	* Unified implementation of memcpy, memmove and the __copy_user backend.
				7	*
				8	* Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
				9	* Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
				10	* Copyright (C) 2002 Broadcom, Inc.
				11	* memcpy/copy_user author: Mark Vandevoorde
				12	*
				13	* Mnemonic names for arguments to memcpy/__copy_user
				14	*/
				15
				16	/*
				17	* Hack to resolve longstanding prefetch issue
				18	*
				19	* Prefetching may be fatal on some systems if we're prefetching beyond the
				20	* end of memory. It's also a seriously bad idea on non dma-coherent
				21	* systems.
				22	*/
				23	#if !defined(CONFIG_DMA_COHERENT) || !defined(CONFIG_DMA_IP27) /* prefetch survives only when both symbols are defined */
				24	#undef CONFIG_CPU_HAS_PREFETCH
				25	#endif /* !CONFIG_DMA_COHERENT || !CONFIG_DMA_IP27 */
				26	#ifdef CONFIG_MIPS_MALTA /* prefetch is unconditionally disabled for Malta builds */
				27	#undef CONFIG_CPU_HAS_PREFETCH
				28	#endif /* CONFIG_MIPS_MALTA */
				29
				30	#include <asm/asm.h>
				31	#include <asm/asm-offsets.h>
				32	#include <asm/regdef.h>
				33
				34	#define dst a0 /* destination address of the copy */
				35	#define src a1 /* source address of the copy */
				36	#define len a2 /* bytes still to copy; holds the uncopied byte count on return */
				37
				38	/*
				39	* Spec
				40	*
				41	* memcpy copies len bytes from src to dst and sets v0 to dst.
				42	* It assumes that
				43	* - src and dst don't overlap
				44	* - src is readable
				45	* - dst is writable
				46	* memcpy uses the standard calling convention
				47	*
				48	* __copy_user copies up to len bytes from src to dst and sets a2 (len) to
				49	* the number of uncopied bytes due to an exception caused by a read or write.
				50	* __copy_user assumes that src and dst don't overlap, and that the call is
				51	* implementing one of the following:
				52	* copy_to_user
				53	* - src is readable (no exceptions when reading src)
				54	* copy_from_user
				55	* - dst is writable (no exceptions when writing dst)
				56	* __copy_user uses a non-standard calling convention; see
				57	* include/asm-mips/uaccess.h
				58	*
				59	* When an exception happens on a load, the handler must
				60	* ensure that all of the destination buffer is overwritten to prevent
				61	* leaking information to user mode programs.
				62	*/
				63
				64	/*
				65	* Implementation
				66	*/
				67
				68	/*
				69	* The exception handler for loads requires that:
				70	* 1- AT contain the address of the byte just past the end of the source
				71	* of the copy,
				72	* 2- src_entry <= src < AT, and
				73	* 3- (dst - src) == (dst_entry - src_entry),
				74	* The _entry suffix denotes values when __copy_user was called.
				75	*
				76	* (1) is set up by uaccess.h and maintained by not writing AT in copy_user
				77	* (2) is met by incrementing src by the number of bytes copied
				78	* (3) is met by not doing loads between a pair of increments of dst and src
				79	*
				80	* The exception handlers for stores adjust len (if necessary) and return.
				81	* These handlers do not need to overwrite any data.
				82	*
				83	* For __rmemcpy and memmove an exception is always a kernel bug, therefore
				84	* they're not protected.
				85	*/
				86
				87	#define EXC(inst_reg,addr,handler) /* emit one memory access together with an exception table entry naming its fixup handler */ \
				88	9: inst_reg, addr; /* the access itself, marked with local label 9 */ \
				89	.section __ex_table,"a"; /* switch to the __ex_table section */ \
				90	PTR 9b, handler; /* record the pair (address of the access, handler); PTR is defined outside this file */ \
				91	.previous /* switch back to the section that was active before */
				92
				93	/*
				94	* Only on the 64-bit kernel can we make use of 64-bit registers.
				95	*/
				96	#ifdef CONFIG_64BIT /* 64-bit kernel: copy in 8-byte units */
				97	#define USE_DOUBLE
				98	#endif
				99
				100	#ifdef USE_DOUBLE
				101
				102	#define LOAD ld /* whole-unit load */
				103	#define LOADL ldl /* partial-unit load, left variant */
				104	#define LOADR ldr /* partial-unit load, right variant */
				105	#define STOREL sdl /* partial-unit store, left variant */
				106	#define STORER sdr /* partial-unit store, right variant */
				107	#define STORE sd /* whole-unit store */
				108	#define ADD daddu
				109	#define SUB dsubu
				110	#define SRL dsrl
				111	#define SRA dsra
				112	#define SLL dsll
				113	#define SLLV dsllv
				114	#define SRLV dsrlv
				115	#define NBYTES 8 /* bytes per copy unit */
				116	#define LOG_NBYTES 3 /* log2(NBYTES) */
				117
				118	/*
				119	* As we are sharing code base with the mips32 tree (which use the o32 ABI
				120	* register definitions). We need to redefine the register definitions from
				121	* the n64 ABI register naming to the o32 ABI register naming.
				122	*/
				123	#undef t0
				124	#undef t1
				125	#undef t2
				126	#undef t3
				127	#define t0 $8 /* t0-t7 are mapped onto $8-$15 below */
				128	#define t1 $9
				129	#define t2 $10
				130	#define t3 $11
				131	#define t4 $12
				132	#define t5 $13
				133	#define t6 $14
				134	#define t7 $15
				135
				136	#else
				137
				138	#define LOAD lw /* whole-unit load */
				139	#define LOADL lwl /* partial-unit load, left variant */
				140	#define LOADR lwr /* partial-unit load, right variant */
				141	#define STOREL swl /* partial-unit store, left variant */
				142	#define STORER swr /* partial-unit store, right variant */
				143	#define STORE sw /* whole-unit store */
				144	#define ADD addu
				145	#define SUB subu
				146	#define SRL srl
				147	#define SLL sll
				148	#define SRA sra
				149	#define SLLV sllv
				150	#define SRLV srlv
				151	#define NBYTES 4 /* bytes per copy unit */
				152	#define LOG_NBYTES 2 /* log2(NBYTES) */
				153
				154	#endif /* USE_DOUBLE */
				155
				156	#ifdef CONFIG_CPU_LITTLE_ENDIAN /* little endian: FIRST maps to the right variants, REST to the left variants */
				157	#define LDFIRST LOADR
				158	#define LDREST LOADL
				159	#define STFIRST STORER
				160	#define STREST STOREL
				161	#define SHIFT_DISCARD SLLV /* variable shift applied to the last partial unit before STREST */
				162	#else /* big endian: FIRST maps to the left variants, REST to the right variants */
				163	#define LDFIRST LOADL
				164	#define LDREST LOADR
				165	#define STFIRST STOREL
				166	#define STREST STORER
				167	#define SHIFT_DISCARD SRLV /* variable shift applied to the last partial unit before STREST */
				168	#endif
				169
				170	#define FIRST(unit) ((unit)*NBYTES) /* byte offset of the first byte of the given copy unit */
				171	#define REST(unit) (FIRST(unit)+NBYTES-1) /* byte offset of the last byte of the given copy unit */
				172	#define UNIT(unit) FIRST(unit) /* offset used by whole-unit LOAD/STORE */
				173
				174	#define ADDRMASK (NBYTES-1) /* mask selecting the within-unit misalignment bits of an address */
				175
				176	.text # everything below is emitted into the text section
				177	.set noreorder # instruction order is as written, so branch delay slots are filled by hand
				178	.set noat # AT must stay untouched; the load exception handler reads it
				179
				180	/*
				181	* A combined memcpy/__copy_user
				182	* __copy_user sets len to 0 for success; else to an upper bound of
				183	* the number of uncopied bytes.
				184	* memcpy sets v0 to dst.
				185	*/
				186	.align 5
				187	LEAF(__copy_user_inatomic)
				188	/*
				189	* Note: dst & src may be unaligned, len may be 0
				190	* Temps
				191	*/
				192	#define rem t8 /* scratch register holding the remainder byte count used as a loop bound */
				193
				194	/*
				195	* The "issue break"s below are very approximate.
				196	* Issue delays for dcache fills will perturb the schedule, as will
				197	* load queue full replay traps, etc.
				198	*
				199	* If len < NBYTES use byte operations.
				200	*/
				201	PREF( 0, 0(src) )
				202	PREF( 1, 0(dst) )
				203	sltu t2, len, NBYTES # t2 = (len < NBYTES)
				204	and t1, dst, ADDRMASK # t1 = misalignment of dst within a unit
				205	PREF( 0, 1*32(src) )
				206	PREF( 1, 1*32(dst) )
				207	bnez t2, copy_bytes_checklen # fewer than NBYTES bytes: take the byte-wise path
				208	and t0, src, ADDRMASK # delay slot: t0 = misalignment of src within a unit
				209	PREF( 0, 2*32(src) )
				210	PREF( 1, 2*32(dst) )
				211	bnez t1, dst_unaligned # dst is not unit aligned
				212	nop # delay slot
				213	bnez t0, src_unaligned_dst_aligned # dst aligned, src not; delay slot is the first instruction after both_aligned
				214	/*
				215	* use delay slot for fall-through
				216	* src and dst are aligned; need to compute rem
				217	*/
				218	both_aligned:
				219	SRL t0, len, LOG_NBYTES+3 # +3 for 8 units/iter
				220	beqz t0, cleanup_both_aligned # len < 8*NBYTES
				221	and rem, len, (8*NBYTES-1) # rem = len % (8*NBYTES)
				222	PREF( 0, 3*32(src) )
				223	PREF( 1, 3*32(dst) )
				224	.align 4
				225	1: # main loop: 8 units are copied per iteration
				226	EXC( LOAD t0, UNIT(0)(src), l_exc)
				227	EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
				228	EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
				229	EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
				230	SUB len, len, 8*NBYTES # account for the 8 units of this iteration
				231	EXC( LOAD t4, UNIT(4)(src), l_exc_copy)
				232	EXC( LOAD t7, UNIT(5)(src), l_exc_copy)
				233	STORE t0, UNIT(0)(dst)
				234	STORE t1, UNIT(1)(dst)
				235	EXC( LOAD t0, UNIT(6)(src), l_exc_copy) # t0 and t1 are reused now that units 0 and 1 are stored
				236	EXC( LOAD t1, UNIT(7)(src), l_exc_copy)
				237	ADD src, src, 8*NBYTES
				238	ADD dst, dst, 8*NBYTES # pointers already advanced, so the remaining stores use negative unit offsets
				239	STORE t2, UNIT(-6)(dst)
				240	STORE t3, UNIT(-5)(dst)
				241	STORE t4, UNIT(-4)(dst)
				242	STORE t7, UNIT(-3)(dst)
				243	STORE t0, UNIT(-2)(dst)
				244	STORE t1, UNIT(-1)(dst)
				245	PREF( 0, 8*32(src) )
				246	PREF( 1, 8*32(dst) )
				247	bne len, rem, 1b # repeat until only rem bytes are left
				248	nop # delay slot
				249
				250	/*
				251	* len == rem == the number of bytes left to copy < 8*NBYTES
				252	*/
				253	cleanup_both_aligned:
				254	beqz len, done # nothing left to copy
				255	sltu t0, len, 4*NBYTES # delay slot: t0 = (len < 4*NBYTES)
				256	bnez t0, less_than_4units # fewer than 4 whole units remain
				257	and rem, len, (NBYTES-1) # rem = len % NBYTES
				258	/*
				259	* len >= 4*NBYTES
				260	*/
				261	EXC( LOAD t0, UNIT(0)(src), l_exc)
				262	EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
				263	EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
				264	EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
				265	SUB len, len, 4*NBYTES # 4 units are consumed by this step
				266	ADD src, src, 4*NBYTES
				267	STORE t0, UNIT(0)(dst)
				268	STORE t1, UNIT(1)(dst)
				269	STORE t2, UNIT(2)(dst)
				270	STORE t3, UNIT(3)(dst)
				271	beqz len, done # exactly 4 units were left
				272	ADD dst, dst, 4*NBYTES # delay slot: advance dst past the 4 stored units
				273	less_than_4units:
				274	/*
				275	* rem = len % NBYTES
				276	*/
				277	beq rem, len, copy_bytes # len == rem means no whole unit is left: go byte-wise
				278	nop # delay slot
				279	1: # copy one whole unit per iteration
				280	EXC( LOAD t0, 0(src), l_exc)
				281	ADD src, src, NBYTES
				282	SUB len, len, NBYTES
				283	STORE t0, 0(dst)
				284	bne rem, len, 1b # repeat until only the sub-unit remainder is left
				285	ADD dst, dst, NBYTES # delay slot: advance dst past the stored unit
				286
				287	/*
				288	* src and dst are aligned, need to copy rem bytes (rem < NBYTES)
				289	* A loop would do only a byte at a time with possible branch
				290	* mispredicts. Can't do an explicit LOAD dst,mask,or,STORE
				291	* because can't assume read-access to dst. Instead, use
				292	* STREST dst, which doesn't require read access to dst.
				293	*
				294	* This code should perform better than a simple loop on modern,
				295	* wide-issue mips processors because the code has fewer branches and
				296	* more instruction-level parallelism.
				297	*/
				298	#define bits t2 /* scratch register: bit count for SHIFT_DISCARD */
				299	beqz len, done # no remainder bytes
				300	ADD t1, dst, len # t1 is just past last byte of dst
				301	li bits, 8*NBYTES # bits = width of one unit in bits
				302	SLL rem, len, 3 # rem = number of bits to keep
				303	EXC( LOAD t0, 0(src), l_exc)
				304	SUB bits, bits, rem # bits = number of bits to discard
				305	SHIFT_DISCARD t0, t0, bits # shift the unwanted bytes out of t0
				306	STREST t0, -1(t1) # partial store ending at the last destination byte
				307	jr ra
				308	move len, zero # delay slot: report zero uncopied bytes
				309	dst_unaligned:
				310	/*
				311	* dst is unaligned
				312	* t0 = src & ADDRMASK
				313	* t1 = dst & ADDRMASK; t1 > 0
				314	* len >= NBYTES
				315	*
				316	* Copy enough bytes to align dst
				317	* Set match = (src and dst have same alignment)
				318	*/
				319	#define match rem /* match reuses the register named rem */
				320	EXC( LDFIRST t3, FIRST(0)(src), l_exc)
				321	ADD t2, zero, NBYTES # t2 = NBYTES
				322	EXC( LDREST t3, REST(0)(src), l_exc_copy)
				323	SUB t2, t2, t1 # t2 = number of bytes copied
				324	xor match, t0, t1 # match == 0 exactly when src and dst share the same misalignment
				325	STFIRST t3, FIRST(0)(dst) # partial store that fills dst up to the next unit boundary
				326	beq len, t2, done # the partial store covered every requested byte
				327	SUB len, len, t2 # delay slot: len -= bytes copied (becomes 0 when the branch is taken)
				328	ADD dst, dst, t2 # dst is unit aligned from here on
				329	beqz match, both_aligned # src became aligned together with dst
				330	ADD src, src, t2 # delay slot: advance src past the copied bytes
				331
				332	src_unaligned_dst_aligned:
				333	SRL t0, len, LOG_NBYTES+2 # +2 for 4 units/iter
				334	PREF( 0, 3*32(src) )
				335	beqz t0, cleanup_src_unaligned # len < 4*NBYTES
				336	and rem, len, (4*NBYTES-1) # rem = len % (4*NBYTES)
				337	PREF( 1, 3*32(dst) )
				338	1:
				339	/*
				340	* Avoid consecutive LD*'s to the same register since some mips
				341	* implementations can't issue them in the same cycle.
				342	* It's OK to load FIRST(N+1) before REST(N) because the two addresses
				343	* are to the same unit (unless src is aligned, but it's not).
				344	*/
				345	EXC( LDFIRST t0, FIRST(0)(src), l_exc)
				346	EXC( LDFIRST t1, FIRST(1)(src), l_exc_copy)
				347	SUB len, len, 4*NBYTES # account for the 4 units of this iteration
				348	EXC( LDREST t0, REST(0)(src), l_exc_copy)
				349	EXC( LDREST t1, REST(1)(src), l_exc_copy)
				350	EXC( LDFIRST t2, FIRST(2)(src), l_exc_copy)
				351	EXC( LDFIRST t3, FIRST(3)(src), l_exc_copy)
				352	EXC( LDREST t2, REST(2)(src), l_exc_copy)
				353	EXC( LDREST t3, REST(3)(src), l_exc_copy)
				354	PREF( 0, 9*32(src) ) # 0 is PREF_LOAD (not streamed)
				355	ADD src, src, 4*NBYTES
				356	#ifdef CONFIG_CPU_SB1
				357	nop # improves slotting
				358	#endif
				359	STORE t0, UNIT(0)(dst) # dst is aligned, so whole-unit stores are used
				360	STORE t1, UNIT(1)(dst)
				361	STORE t2, UNIT(2)(dst)
				362	STORE t3, UNIT(3)(dst)
				363	PREF( 1, 9*32(dst) ) # 1 is PREF_STORE (not streamed)
				364	bne len, rem, 1b # repeat until only rem bytes are left
				365	ADD dst, dst, 4*NBYTES # delay slot: advance dst past the 4 stored units
				366
				367	cleanup_src_unaligned:
				368	beqz len, done # nothing left to copy
				369	and rem, len, NBYTES-1 # rem = len % NBYTES
				370	beq rem, len, copy_bytes # len == rem means no whole unit is left: go byte-wise
				371	nop # delay slot
				372	1: # copy one unit per iteration: unaligned load pair, aligned store
				373	EXC( LDFIRST t0, FIRST(0)(src), l_exc)
				374	EXC( LDREST t0, REST(0)(src), l_exc_copy)
				375	ADD src, src, NBYTES
				376	SUB len, len, NBYTES
				377	STORE t0, 0(dst)
				378	bne len, rem, 1b # repeat until only the sub-unit remainder is left
				379	ADD dst, dst, NBYTES # delay slot: advance dst past the stored unit
				380
				381	copy_bytes_checklen:
				382	beqz len, done # len == 0: nothing to copy
				383	nop # delay slot
				384	copy_bytes:
				385	/* 0 < len < NBYTES */
				386	#define COPY_BYTE(N) /* copy the byte at offset N and leave through done if it was the last one */ \
				387	EXC( lb t0, N(src), l_exc); \
				388	SUB len, len, 1; \
				389	beqz len, done; /* the store below sits in the delay slot, so it runs whether or not the branch is taken */ \
				390	sb t0, N(dst)
				391
				392	COPY_BYTE(0)
				393	COPY_BYTE(1)
				394	#ifdef USE_DOUBLE /* 8-byte units can leave up to 7 bytes, so four more checked copies are needed */
				395	COPY_BYTE(2)
				396	COPY_BYTE(3)
				397	COPY_BYTE(4)
				398	COPY_BYTE(5)
				399	#endif
				400	EXC( lb t0, NBYTES-2(src), l_exc) # last possible byte (offset NBYTES-2); no zero test needed
				401	SUB len, len, 1 # len was 1 here, so it becomes 0
				402	jr ra
				403	sb t0, NBYTES-2(dst) # delay slot: store the final byte
				404	done:
				405	jr ra # return to the caller with len as it stands
				406	nop # delay slot
				407	END(__copy_user_inatomic)
				408
				409	l_exc_copy:
				410	/*
				411	* Copy bytes from src until faulting load address (or until a
				412	* lb faults)
				413	*
				414	* When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
				415	* may be more than a byte beyond the last address.
				416	* Hence, the lb below may get an exception.
				417	*
				418	* Assumes src < THREAD_BUADDR($28)
				419	*/
				420	LOAD t0, TI_TASK($28) # t0 = value stored at offset TI_TASK from $28
				421	nop # NOTE(review): presumably covers the load delay before t0 is used; confirm
				422	LOAD t0, THREAD_BUADDR(t0) # t0 = recorded fault address; the byte loop below stops there
				423	1: # byte-wise copy of the data that precedes the fault address
				424	EXC( lb t1, 0(src), l_exc)
				425	ADD src, src, 1
				426	sb t1, 0(dst) # can't fault -- we're copy_from_user
				427	bne src, t0, 1b # continue until src reaches the fault address
				428	ADD dst, dst, 1 # delay slot: advance dst
				429	l_exc: # entered directly from a faulting load, or by falling through from the loop above
				430	LOAD t0, TI_TASK($28) # t0 = value stored at offset TI_TASK from $28
				431	nop # NOTE(review): presumably covers the load delay before t0 is used; confirm
				432	LOAD t0, THREAD_BUADDR(t0) # t0 is just past last good address
				433	nop # NOTE(review): presumably covers the load delay before t0 is used; confirm
				434	SUB len, AT, t0 # len = AT - t0 = number of uncopied bytes (AT is just past the end of the source)
				435	jr ra # return to the caller with the uncopied count in len
				436	nop # delay slot