/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * arch/mips/include/asm/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
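
/*
 * Illustration only (a sketch of the C-level contract, not the actual
 * code in uaccess.h): a caller built on top of __copy_user sees the
 * residual byte count that comes back in a2, e.g.
 *
 *	left = copy_from_user(kbuf, ubuf, len);
 *	if (left != 0)		// left bytes could not be copied
 *		return -EFAULT;
 *
 * so a zero result means the whole buffer was transferred.
 */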

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contains the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */
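
/*
 * Worked example of the invariants above (illustrative values only):
 * entering __copy_user with src_entry = 0x1000, dst_entry = 0x8000 and
 * len = 0x40 gives AT = 0x1040, per (1).  If a load faults after src has
 * advanced to 0x1020, then (2) holds, and by (3) dst is 0x8020; the
 * handler can compute the uncopied byte count as AT minus the fault
 * address and knows where the destination bytes to clear begin.
 */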

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous
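
/*
 * For example, on a 64-bit kernel EXC(LOAD t0, UNIT(0)(src), l_exc)
 * expands to roughly
 *
 *	9:	ld	t0, 0(a1)
 *		.section __ex_table, "a"
 *		PTR	9b, l_exc
 *		.previous
 *
 * i.e. the access itself plus an exception-table entry that sends a
 * fault on that one instruction to the l_exc handler.
 */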

/*
 * Only the 64-bit kernel can make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD	ld
#define LOADL	ldl
#define LOADR	ldr
#define STOREL	sdl
#define STORER	sdr
#define STORE	sd
#define ADD	daddu
#define SUB	dsubu
#define SRL	dsrl
#define SRA	dsra
#define SLL	dsll
#define SLLV	dsllv
#define SRLV	dsrlv
#define NBYTES	8
#define LOG_NBYTES 3

/*
 * As we are sharing the code base with the mips32 tree (which uses the
 * o32 ABI register definitions), we need to redefine the register
 * definitions from the n64 ABI register naming to the o32 ABI register
 * naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#else

#define LOAD	lw
#define LOADL	lwl
#define LOADR	lwr
#define STOREL	swl
#define STORER	swr
#define STORE	sw
#define ADD	addu
#define SUB	subu
#define SRL	srl
#define SLL	sll
#define SRA	sra
#define SLLV	sllv
#define SRLV	srlv
#define NBYTES	4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST	LOADR
#define LDREST	LOADL
#define STFIRST	STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST	LOADL
#define LDREST	LOADR
#define STFIRST	STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
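
/*
 * Example (little-endian, 32-bit case): one word is fetched from an
 * unaligned src with the pair
 *
 *	LDFIRST	t0, FIRST(0)(src)	# lwr	t0, 0(src)
 *	LDREST	t0, REST(0)(src)	# lwl	t0, 3(src)
 *
 * which fill in the low-order and high-order bytes of t0 respectively;
 * a big-endian kernel swaps the roles of lwl/lwr, which is exactly what
 * the LDFIRST/LDREST selection above hides.
 */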

	.text
	.set	noreorder
	.set	noat

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
	.align	5
LEAF(memcpy)					/* a0=dst a1=src a2=len */
	move	v0, dst				/* return value */
__memcpy:
FEXPORT(__copy_user)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
	#
	# Octeon doesn't care if the destination is unaligned.  The hardware
	# can fix it faster than we can special-case the assembly.
	#
	pref	0, 0(src)
	sltu	t0, len, NBYTES		# Check if < 1 word
	bnez	t0, copy_bytes_checklen
	and	t0, src, ADDRMASK	# Check if src unaligned
	bnez	t0, src_unaligned
	sltu	t0, len, 4*NBYTES	# Check if < 4 words
	bnez	t0, less_than_4units
	sltu	t0, len, 8*NBYTES	# Check if < 8 words
	bnez	t0, less_than_8units
	sltu	t0, len, 16*NBYTES	# Check if < 16 words
	bnez	t0, cleanup_both_aligned
	sltu	t0, len, 128+1		# Check if len < 129
	bnez	t0, 1f			# Skip prefetch if len is too short
	sltu	t0, len, 256+1		# Check if len < 257
	bnez	t0, 1f			# Skip prefetch if len is too short
	pref	0, 128(src)		# We must not prefetch invalid addresses
	#
	# This is where we loop if there are more than 128 bytes left
2:	pref	0, 256(src)		# We must not prefetch invalid addresses
	#
	# This is where we loop if we can't prefetch anymore
1:
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 16*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p16u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p15u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p14u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p13u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p12u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p11u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p10u)
	ADD	src, src, 16*NBYTES
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p9u)
	ADD	dst, dst, 16*NBYTES
EXC(	LOAD	t0, UNIT(-8)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(-7)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(-6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(-5)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(-8)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(-7)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(-6)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(-5)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(-4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(-3)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(-2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(-1)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(-4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(-3)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(-2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(-1)(dst),	s_exc_p1u)
	sltu	t0, len, 256+1		# See if we can prefetch more
	beqz	t0, 2b
	sltu	t0, len, 128		# See if we can loop one more time
	beqz	t0, 1b
	nop
	#
	# Jump here if there are less than 16*NBYTES left.
	#
cleanup_both_aligned:
	beqz	len, done
	sltu	t0, len, 8*NBYTES
	bnez	t0, less_than_8units
	nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p1u)
	ADD	src, src, 8*NBYTES
	beqz	len, done
	ADD	dst, dst, 8*NBYTES
	#
	# Jump here if there are less than 8*NBYTES left.
	#
less_than_8units:
	sltu	t0, len, 4*NBYTES
	bnez	t0, less_than_4units
	nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	ADD	src, src, 4*NBYTES
	beqz	len, done
	ADD	dst, dst, 4*NBYTES
	#
	# Jump here if there are less than 4*NBYTES left.  This means
	# we may need to copy up to 3 NBYTES words.
	#
less_than_4units:
	sltu	t0, len, 1*NBYTES
	bnez	t0, copy_bytes_checklen
	nop
	#
	# 1) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	ADD	dst, dst, NBYTES
	#
	# 2) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	ADD	dst, dst, NBYTES
	#
	# 3) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	ADD	src, src, NBYTES
	ADD	dst, dst, NBYTES
	b	copy_bytes_checklen
EXC(	STORE	t0, -8(dst),		s_exc_p1u)

src_unaligned:
#define rem t8
	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
	beqz	t0, cleanup_src_unaligned
	and	rem, len, (4*NBYTES-1)	# rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
EXC(	LDFIRST	t1, FIRST(1)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	l_exc_copy)
EXC(	LDFIRST	t2, FIRST(2)(src),	l_exc_copy)
EXC(	LDFIRST	t3, FIRST(3)(src),	l_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	l_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
	ADD	src, src, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	bne	len, rem, 1b
	ADD	dst, dst, 4*NBYTES

cleanup_src_unaligned:
	beqz	len, done
	and	rem, len, NBYTES-1	# rem = len % NBYTES
	beq	rem, len, copy_bytes
	nop
1:
EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bne	len, rem, 1b
	ADD	dst, dst, NBYTES

copy_bytes_checklen:
	beqz	len, done
	nop
copy_bytes:
	/* 0 < len < NBYTES */
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), l_exc);	\
	SUB	len, len, 1;		\
	beqz	len, done;		\
EXC(	sb	t0, N(dst), s_exc_p1)
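
/*
 * COPY_BYTE(0), for instance, expands to a guarded single-byte copy
 * (sketch; each EXC line also emits its __ex_table entry):
 *
 *	lb	t0, 0(src)	# faults are routed to l_exc
 *	SUB	len, len, 1
 *	beqz	len, done
 *	sb	t0, 0(dst)	# delay slot; faults go to s_exc_p1
 */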

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
EXC(	lb	t0, NBYTES-2(src), l_exc)
	SUB	len, len, 1
	jr	ra
EXC(	sb	t0, NBYTES-2(dst), s_exc_p1)
done:
	jr	ra
	nop
	END(memcpy)

l_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	nop
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src),	l_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	bne	src, t0, 1b
	ADD	dst, dst, 1
l_exc:
	LOAD	t0, TI_TASK($28)
	nop
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	nop
	SUB	len, AT, t0		# len = number of uncopied bytes
	/*
	 * Here's where we rely on src and dst being incremented in tandem,
	 * see (3) above.
	 * dst += (fault addr - src) to put dst at first byte to clear
	 */
	ADD	dst, t0			# compute start address in a1
	SUB	dst, src
	/*
	 * Clear len bytes starting at dst.  Can't call __bzero because it
	 * might modify len.  An inefficient loop for these rare times...
	 */
	beqz	len, done
	SUB	src, len, 1
1:	sb	zero, 0(dst)
	ADD	dst, dst, 1
	bnez	src, 1b
	SUB	src, src, 1
	jr	ra
	nop


#define SEXC(n)				\
s_exc_p ## n ## u:			\
	jr	ra;			\
	ADD	len, len, n*NBYTES
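
/*
 * SEXC(4), for instance, expands to
 *
 *	s_exc_p4u:
 *		jr	ra
 *		ADD	len, len, 4*NBYTES
 *
 * adding back the 4 units that had not yet been stored (including the
 * faulting one), since len was decremented by the whole chunk up front;
 * len is then again an upper bound on the number of uncopied bytes.
 */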

SEXC(16)
SEXC(15)
SEXC(14)
SEXC(13)
SEXC(12)
SEXC(11)
SEXC(10)
SEXC(9)
SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

s_exc_p1:
	jr	ra
	ADD	len, len, 1
s_exc:
	jr	ra
	nop

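/*
 * memmove: when [src, src+len) and [dst, dst+len) do not overlap, the
 * forward memcpy above can be used.  The overlap test below is the usual
 * interval check; roughly, in C:
 *
 *	if (src + len <= dst || dst + len <= src)
 *		return memcpy(dst, src, len);	// no overlap
 *	// otherwise copy backwards via __rmemcpy
 */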
	.align	5
LEAF(memmove)
	ADD	t0, a0, a2
	ADD	t1, a1, a2
	sltu	t0, a1, t0		# dst + len <= src -> memcpy
	sltu	t1, a0, t1		# dst >= src + len -> memcpy
	and	t0, t1
	beqz	t0, __memcpy
	move	v0, a0			/* return value */
	beqz	a2, r_out
	END(memmove)

	/* fall through to __rmemcpy */
LEAF(__rmemcpy)				/* a0=dst a1=src a2=len */
	sltu	t0, a1, a0
	beqz	t0, r_end_bytes_up	# src >= dst
	nop
	ADD	a0, a2			# dst = dst + len
	ADD	a1, a2			# src = src + len

r_end_bytes:
	lb	t0, -1(a1)
	SUB	a2, a2, 0x1
	sb	t0, -1(a0)
	SUB	a1, a1, 0x1
	bnez	a2, r_end_bytes
	SUB	a0, a0, 0x1

r_out:
	jr	ra
	move	a2, zero

r_end_bytes_up:
	lb	t0, (a1)
	SUB	a2, a2, 0x1
	sb	t0, (a0)
	ADD	a1, a1, 0x1
	bnez	a2, r_end_bytes_up
	ADD	a0, a0, 0x1

	jr	ra
	move	a2, zero
	END(__rmemcpy)