Blame - arch/mips/cavium-octeon/octeon-memcpy.S - kernel/msm-4.9

blob: 64e08df51d65f76b79310b1df6d52e2aa78e2a97 [file] [log] [blame]

David Daney	5b3b168	2009-01-08 16:46:40 -0800	[diff] [blame]	1	/*
				2	* This file is subject to the terms and conditions of the GNU General Public
				3	* License. See the file "COPYING" in the main directory of this archive
				4	* for more details.
				5	*
				6	* Unified implementation of memcpy, memmove and the __copy_user backend.
				7	*
				8	* Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
				9	* Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
				10	* Copyright (C) 2002 Broadcom, Inc.
				11	* memcpy/copy_user author: Mark Vandevoorde
				12	*
				13	* Mnemonic names for arguments to memcpy/__copy_user
				14	*/
				15
				16	#include <asm/asm.h>
				17	#include <asm/asm-offsets.h>
				18	#include <asm/regdef.h>
				19
				20	#define dst a0
				21	#define src a1
				22	#define len a2
				23
				24	/*
				25	* Spec
				26	*
				27	* memcpy copies len bytes from src to dst and sets v0 to dst.
				28	* It assumes that
				29	* - src and dst don't overlap
				30	* - src is readable
				31	* - dst is writable
				32	* memcpy uses the standard calling convention
				33	*
				34	* __copy_user copies up to len bytes from src to dst and sets a2 (len) to
				35	* the number of uncopied bytes due to an exception caused by a read or write.
				36	* __copy_user assumes that src and dst don't overlap, and that the call is
				37	* implementing one of the following:
				38	* copy_to_user
				39	* - src is readable (no exceptions when reading src)
				40	* copy_from_user
				41	* - dst is writable (no exceptions when writing dst)
				42	* __copy_user uses a non-standard calling convention; see
				43	* arch/mips/include/asm/uaccess.h
				44	*
				45	* When an exception happens on a load, the handler must
				46	* ensure that all of the destination buffer is overwritten to prevent
				47	* leaking information to user mode programs.
				48	*/
				49
				50	/*
				51	* Implementation
				52	*/
				53
				54	/*
				55	* The exception handler for loads requires that:
				56	* 1- AT contain the address of the byte just past the end of the source
				57	* of the copy,
				58	* 2- src_entry <= src < AT, and
				59	* 3- (dst - src) == (dst_entry - src_entry),
				60	* The _entry suffix denotes values when __copy_user was called.
				61	*
				62	* (1) is set up by uaccess.h and maintained by not writing AT in copy_user
				63	* (2) is met by incrementing src by the number of bytes copied
				64	* (3) is met by not doing loads between a pair of increments of dst and src
				65	*
				66	* The exception handlers for stores adjust len (if necessary) and return.
				67	* These handlers do not need to overwrite any data.
				68	*
				69	* For __rmemcpy and memmove an exception is always a kernel bug, therefore
				70	* they're not protected.
				71	*/
				72
				73	#define EXC(inst_reg,addr,handler) /* emit "inst_reg, addr" and pair it with an exception handler */ \
				74	9: inst_reg, addr; /* local label 9 marks the instruction that may fault */ \
				75	.section __ex_table,"a"; /* switch to the __ex_table section */ \
				76	PTR 9b, handler; /* table entry: address of the instruction above, then address of handler */ \
				77	.previous /* switch back to the section that was active before */
				78
				79	/*
				80	* Only on the 64-bit kernel can we make use of 64-bit registers.
				81	*/
David Daney	5b3b168	2009-01-08 16:46:40 -0800	[diff] [blame]	82
				83	#define LOAD ld
				84	#define LOADL ldl
				85	#define LOADR ldr
				86	#define STOREL sdl
				87	#define STORER sdr
				88	#define STORE sd
				89	#define ADD daddu
				90	#define SUB dsubu
				91	#define SRL dsrl
				92	#define SRA dsra
				93	#define SLL dsll
				94	#define SLLV dsllv
				95	#define SRLV dsrlv
				96	#define NBYTES 8
				97	#define LOG_NBYTES 3
				98
				99	/*
				100	* As we are sharing the code base with the mips32 tree (which uses the o32
				101	* ABI register definitions), we need to redefine the register definitions
				102	* from the n64 ABI register naming to the o32 ABI register naming.
				103	*/
				104	#undef t0
				105	#undef t1
				106	#undef t2
				107	#undef t3
				108	#define t0 $8
				109	#define t1 $9
				110	#define t2 $10
				111	#define t3 $11
				112	#define t4 $12
				113	#define t5 $13
				114	#define t6 $14
				115	#define t7 $15
				116
David Daney	5b3b168	2009-01-08 16:46:40 -0800	[diff] [blame]	117	#ifdef CONFIG_CPU_LITTLE_ENDIAN
				118	#define LDFIRST LOADR
Ralf Baechle	7034228	2013-01-22 12:59:30 +0100	[diff] [blame]	119	#define LDREST LOADL
David Daney	5b3b168	2009-01-08 16:46:40 -0800	[diff] [blame]	120	#define STFIRST STORER
Ralf Baechle	7034228	2013-01-22 12:59:30 +0100	[diff] [blame]	121	#define STREST STOREL
David Daney	5b3b168	2009-01-08 16:46:40 -0800	[diff] [blame]	122	#define SHIFT_DISCARD SLLV
				123	#else
				124	#define LDFIRST LOADL
Ralf Baechle	7034228	2013-01-22 12:59:30 +0100	[diff] [blame]	125	#define LDREST LOADR
David Daney	5b3b168	2009-01-08 16:46:40 -0800	[diff] [blame]	126	#define STFIRST STOREL
Ralf Baechle	7034228	2013-01-22 12:59:30 +0100	[diff] [blame]	127	#define STREST STORER
David Daney	5b3b168	2009-01-08 16:46:40 -0800	[diff] [blame]	128	#define SHIFT_DISCARD SRLV
				129	#endif
				130
				131	#define FIRST(unit) ((unit)*NBYTES)
				132	#define REST(unit) (FIRST(unit)+NBYTES-1)
				133	#define UNIT(unit) FIRST(unit)
				134
				135	#define ADDRMASK (NBYTES-1)
				136
				137	.text
				138	.set noreorder
				139	.set noat
				140
				141	/*
David Daney	914f848	2012-06-06 14:50:17 -0700	[diff] [blame]	142	* t7 is used as a flag to note inatomic mode.
				143	*/
				144	LEAF(__copy_user_inatomic)
				145	b __copy_user_common # join the shared copy code, bypassing the t7 = 0 setup at __copy_user
				146	li t7, 1 # branch delay slot: t7 = 1 flags inatomic mode
				147	END(__copy_user_inatomic)
				148
				149	/*
David Daney	5b3b168	2009-01-08 16:46:40 -0800	[diff] [blame]	150	* A combined memcpy/__copy_user
				151	* __copy_user sets len to 0 for success; else to an upper bound of
				152	* the number of uncopied bytes.
				153	* memcpy sets v0 to dst.
				154	*/
				155	.align 5
				156	LEAF(memcpy) /* a0=dst a1=src a2=len */
				157	move v0, dst /* return value */
				158	__memcpy: /* memmove branches here when the two ranges do not overlap */
				159	FEXPORT(__copy_user)
David Daney	914f848	2012-06-06 14:50:17 -0700	[diff] [blame]	160	li t7, 0 /* not inatomic */
				161	__copy_user_common: /* __copy_user_inatomic joins here with t7 = 1 */
David Daney	5b3b168	2009-01-08 16:46:40 -0800	[diff] [blame]	162	/*
				163	* Note: dst & src may be unaligned, len may be 0
				164	* Temps: t0-t3 carry the data being copied; t7 is the inatomic flag
				165	*/
				166	#
				167	# Octeon doesn't care if the destination is unaligned. The hardware
				168	# can fix it faster than we can special case the assembly.
				169	#
				170	pref 0, 0(src) # prefetch the first bytes of the source
				171	sltu t0, len, NBYTES # Check if < 1 word
				172	bnez t0, copy_bytes_checklen # less than one word: byte-wise copy
				173	and t0, src, ADDRMASK # Check if src unaligned (runs in the delay slot of the branch above)
				174	bnez t0, src_unaligned # unaligned source: use the LDFIRST/LDREST path
				175	sltu t0, len, 4*NBYTES # Check if < 4 words
				176	bnez t0, less_than_4units
				177	sltu t0, len, 8*NBYTES # Check if < 8 words
				178	bnez t0, less_than_8units
				179	sltu t0, len, 16*NBYTES # Check if < 16 words
				180	bnez t0, cleanup_both_aligned
				181	sltu t0, len, 128+1 # Check if len < 129
				182	bnez t0, 1f # Skip prefetch if len is too short
				183	sltu t0, len, 256+1 # Check if len < 257
				184	bnez t0, 1f # Skip prefetch if len is too short
				185	pref 0, 128(src) # Delay slot, so it runs on both paths; len >= 129 here, so src+128 is inside the buffer. We must not prefetch invalid addresses
				186	#
				187	# Loop entry while more than 256 bytes remain: prefetch 256 bytes ahead
				188	2: pref 0, 256(src) # We must not prefetch invalid addresses
				189	#
				190	# Branch target when 128 <= len <= 256: copy 16 units without prefetching
				191	1:
				192	EXC( LOAD t0, UNIT(0)(src), l_exc) # a fault on the first load goes straight to l_exc; later loads use l_exc_copy
				193	EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
				194	EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
				195	EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
				196	SUB len, len, 16*NBYTES # charge the whole 16-unit batch up front; s_exc_pNu adds N units back on a store fault
				197	EXC( STORE t0, UNIT(0)(dst), s_exc_p16u)
				198	EXC( STORE t1, UNIT(1)(dst), s_exc_p15u)
				199	EXC( STORE t2, UNIT(2)(dst), s_exc_p14u)
				200	EXC( STORE t3, UNIT(3)(dst), s_exc_p13u)
				201	EXC( LOAD t0, UNIT(4)(src), l_exc_copy)
				202	EXC( LOAD t1, UNIT(5)(src), l_exc_copy)
				203	EXC( LOAD t2, UNIT(6)(src), l_exc_copy)
				204	EXC( LOAD t3, UNIT(7)(src), l_exc_copy)
				205	EXC( STORE t0, UNIT(4)(dst), s_exc_p12u)
				206	EXC( STORE t1, UNIT(5)(dst), s_exc_p11u)
				207	EXC( STORE t2, UNIT(6)(dst), s_exc_p10u)
				208	ADD src, src, 16*NBYTES # advance src; the remaining units use negative offsets
				209	EXC( STORE t3, UNIT(7)(dst), s_exc_p9u)
				210	ADD dst, dst, 16*NBYTES # advance dst before the next load, keeping (dst - src) constant at every load
				211	EXC( LOAD t0, UNIT(-8)(src), l_exc_copy)
				212	EXC( LOAD t1, UNIT(-7)(src), l_exc_copy)
				213	EXC( LOAD t2, UNIT(-6)(src), l_exc_copy)
				214	EXC( LOAD t3, UNIT(-5)(src), l_exc_copy)
				215	EXC( STORE t0, UNIT(-8)(dst), s_exc_p8u)
				216	EXC( STORE t1, UNIT(-7)(dst), s_exc_p7u)
				217	EXC( STORE t2, UNIT(-6)(dst), s_exc_p6u)
				218	EXC( STORE t3, UNIT(-5)(dst), s_exc_p5u)
				219	EXC( LOAD t0, UNIT(-4)(src), l_exc_copy)
				220	EXC( LOAD t1, UNIT(-3)(src), l_exc_copy)
				221	EXC( LOAD t2, UNIT(-2)(src), l_exc_copy)
				222	EXC( LOAD t3, UNIT(-1)(src), l_exc_copy)
				223	EXC( STORE t0, UNIT(-4)(dst), s_exc_p4u)
				224	EXC( STORE t1, UNIT(-3)(dst), s_exc_p3u)
				225	EXC( STORE t2, UNIT(-2)(dst), s_exc_p2u)
				226	EXC( STORE t3, UNIT(-1)(dst), s_exc_p1u)
				227	sltu t0, len, 256+1 # See if we can prefetch more
				228	beqz t0, 2b # len >= 257: loop with prefetch
				229	sltu t0, len, 128 # See if we can loop more time
				230	beqz t0, 1b # len >= 128: one more 16-unit pass without prefetch
				231	nop # delay slot
				232	#
				233	# Jump here if there are less than 16*NBYTES left.
				234	#
				235	cleanup_both_aligned: /* aligned src, fewer than 16 units left */
				236	beqz len, done
				237	sltu t0, len, 8*NBYTES # delay slot: t0 = (len < 8 units)
				238	bnez t0, less_than_8units
				239	nop
				240	EXC( LOAD t0, UNIT(0)(src), l_exc)
				241	EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
				242	EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
				243	EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
				244	SUB len, len, 8*NBYTES # charge all 8 units up front; s_exc_pNu adds N units back on a store fault
				245	EXC( STORE t0, UNIT(0)(dst), s_exc_p8u)
				246	EXC( STORE t1, UNIT(1)(dst), s_exc_p7u)
				247	EXC( STORE t2, UNIT(2)(dst), s_exc_p6u)
				248	EXC( STORE t3, UNIT(3)(dst), s_exc_p5u)
				249	EXC( LOAD t0, UNIT(4)(src), l_exc_copy)
				250	EXC( LOAD t1, UNIT(5)(src), l_exc_copy)
				251	EXC( LOAD t2, UNIT(6)(src), l_exc_copy)
				252	EXC( LOAD t3, UNIT(7)(src), l_exc_copy)
				253	EXC( STORE t0, UNIT(4)(dst), s_exc_p4u)
				254	EXC( STORE t1, UNIT(5)(dst), s_exc_p3u)
				255	EXC( STORE t2, UNIT(6)(dst), s_exc_p2u)
				256	EXC( STORE t3, UNIT(7)(dst), s_exc_p1u)
				257	ADD src, src, 8*NBYTES
				258	beqz len, done
				259	ADD dst, dst, 8*NBYTES # delay slot: dst advances on both paths
				260	#
				261	# Jump here if there are less than 8*NBYTES left.
				262	#
				263	less_than_8units: /* aligned src, fewer than 8 units left */
				264	sltu t0, len, 4*NBYTES # t0 = (len < 4 units)
				265	bnez t0, less_than_4units
				266	nop
				267	EXC( LOAD t0, UNIT(0)(src), l_exc)
				268	EXC( LOAD t1, UNIT(1)(src), l_exc_copy)
				269	EXC( LOAD t2, UNIT(2)(src), l_exc_copy)
				270	EXC( LOAD t3, UNIT(3)(src), l_exc_copy)
				271	SUB len, len, 4*NBYTES # charge all 4 units up front; s_exc_pNu adds N units back on a store fault
				272	EXC( STORE t0, UNIT(0)(dst), s_exc_p4u)
				273	EXC( STORE t1, UNIT(1)(dst), s_exc_p3u)
				274	EXC( STORE t2, UNIT(2)(dst), s_exc_p2u)
				275	EXC( STORE t3, UNIT(3)(dst), s_exc_p1u)
				276	ADD src, src, 4*NBYTES
				277	beqz len, done
				278	ADD dst, dst, 4*NBYTES # delay slot: dst advances on both paths
				279	#
				280	# Jump here if there are less than 4*NBYTES left. This means
				281	# we may need to copy up to 3 NBYTES words.
				282	#
				283	less_than_4units: /* aligned src, fewer than 4 units left */
				284	sltu t0, len, 1*NBYTES # t0 = (len < 1 unit)
				285	bnez t0, copy_bytes_checklen
				286	nop
				287	#
				288	# 1) Copy NBYTES, then check length again
				289	#
				290	EXC( LOAD t0, 0(src), l_exc)
				291	SUB len, len, NBYTES
				292	sltu t1, len, 8 # t1 = (less than one 8-byte unit left)
				293	EXC( STORE t0, 0(dst), s_exc_p1u)
				294	ADD src, src, NBYTES
				295	bnez t1, copy_bytes_checklen
				296	ADD dst, dst, NBYTES # delay slot: dst advances on both paths
				297	#
				298	# 2) Copy NBYTES, then check length again
				299	#
				300	EXC( LOAD t0, 0(src), l_exc)
				301	SUB len, len, NBYTES
				302	sltu t1, len, 8 # t1 = (less than one 8-byte unit left)
				303	EXC( STORE t0, 0(dst), s_exc_p1u)
				304	ADD src, src, NBYTES
				305	bnez t1, copy_bytes_checklen
				306	ADD dst, dst, NBYTES # delay slot: dst advances on both paths
				307	#
				308	# 3) Copy NBYTES, then finish with the byte-wise tail
				309	#
				310	EXC( LOAD t0, 0(src), l_exc)
				311	SUB len, len, NBYTES
				312	ADD src, src, NBYTES
				313	ADD dst, dst, NBYTES
				314	b copy_bytes_checklen
				315	EXC( STORE t0, -8(dst), s_exc_p1u) # delay slot; dst was already advanced, hence the -8 offset
				316
				317	src_unaligned: /* src is not NBYTES aligned and len >= NBYTES; copies 4 units per iteration */
				318	#define rem t8
Ralf Baechle	7034228	2013-01-22 12:59:30 +0100	[diff] [blame]	319	SRL t0, len, LOG_NBYTES+2 # +2 for 4 units/iter
David Daney	5b3b168	2009-01-08 16:46:40 -0800	[diff] [blame]	320	beqz t0, cleanup_src_unaligned # fewer than 4 units: go to the one-unit loop
Ralf Baechle	7034228	2013-01-22 12:59:30 +0100	[diff] [blame]	321	and rem, len, (4*NBYTES-1) # rem = len % (4*NBYTES); runs in the delay slot
David Daney	5b3b168	2009-01-08 16:46:40 -0800	[diff] [blame]	322	1:
				323	/*
				324	* Avoid consecutive LD*'s to the same register since some mips
				325	* implementations can't issue them in the same cycle.
				326	* It's OK to load FIRST(N+1) before REST(N) because the two addresses
				327	* are to the same unit (unless src is aligned, but it's not).
				328	*/
Ralf Baechle	7034228	2013-01-22 12:59:30 +0100	[diff] [blame]	329	EXC( LDFIRST t0, FIRST(0)(src), l_exc)
				330	EXC( LDFIRST t1, FIRST(1)(src), l_exc_copy)
				331	SUB len, len, 4*NBYTES # charge all 4 units up front; s_exc_pNu adds N units back on a store fault
David Daney	5b3b168	2009-01-08 16:46:40 -0800	[diff] [blame]	332	EXC( LDREST t0, REST(0)(src), l_exc_copy)
				333	EXC( LDREST t1, REST(1)(src), l_exc_copy)
Ralf Baechle	7034228	2013-01-22 12:59:30 +0100	[diff] [blame]	334	EXC( LDFIRST t2, FIRST(2)(src), l_exc_copy)
				335	EXC( LDFIRST t3, FIRST(3)(src), l_exc_copy)
David Daney	5b3b168	2009-01-08 16:46:40 -0800	[diff] [blame]	336	EXC( LDREST t2, REST(2)(src), l_exc_copy)
				337	EXC( LDREST t3, REST(3)(src), l_exc_copy)
				338	ADD src, src, 4*NBYTES
				339	EXC( STORE t0, UNIT(0)(dst), s_exc_p4u)
				340	EXC( STORE t1, UNIT(1)(dst), s_exc_p3u)
				341	EXC( STORE t2, UNIT(2)(dst), s_exc_p2u)
				342	EXC( STORE t3, UNIT(3)(dst), s_exc_p1u)
				343	bne len, rem, 1b # loop until only the sub-4-unit remainder is left
				344	ADD dst, dst, 4*NBYTES # delay slot: dst advances on both paths
				345
				346	cleanup_src_unaligned: /* unaligned src, fewer than 4 units left: copy one unit per iteration */
				347	beqz len, done
				348	and rem, len, NBYTES-1 # rem = len % NBYTES
				349	beq rem, len, copy_bytes # len < NBYTES (and nonzero): only the byte tail remains
				350	nop
				351	1:
				352	EXC( LDFIRST t0, FIRST(0)(src), l_exc)
				353	EXC( LDREST t0, REST(0)(src), l_exc_copy)
				354	SUB len, len, NBYTES
				355	EXC( STORE t0, 0(dst), s_exc_p1u)
				356	ADD src, src, NBYTES
				357	bne len, rem, 1b # loop until only the sub-unit remainder is left
				358	ADD dst, dst, NBYTES # delay slot: dst advances on both paths
				359
				360	copy_bytes_checklen: /* byte tail entry for callers that may arrive with len == 0 */
				361	beqz len, done
				362	nop
				363	copy_bytes:
				364	/* 0 < len < NBYTES: copy the remaining bytes one at a time */
				365	#define COPY_BYTE(N) \
				366	EXC( lb t0, N(src), l_exc); \
				367	SUB len, len, 1; \
				368	beqz len, done; \
				369	EXC( sb t0, N(dst), s_exc_p1) /* delay slot of the beqz: the byte is stored on both paths */
				370
				371	COPY_BYTE(0)
				372	COPY_BYTE(1)
David Daney	5b3b168	2009-01-08 16:46:40 -0800	[diff] [blame]	373	COPY_BYTE(2)
				374	COPY_BYTE(3)
				375	COPY_BYTE(4)
				376	COPY_BYTE(5)
David Daney	5b3b168	2009-01-08 16:46:40 -0800	[diff] [blame]	377	EXC( lb t0, NBYTES-2(src), l_exc) # seventh byte: the last one possible since len < NBYTES
				378	SUB len, len, 1
				379	jr ra # no length test needed after the last possible byte
				380	EXC( sb t0, NBYTES-2(dst), s_exc_p1) # delay slot: the store still executes before control returns
				381	done:
				382	jr ra
				383	nop
				384	END(memcpy)
				385
				386	l_exc_copy:
				387	/*
				388	* Copy bytes from src until faulting load address (or until a
				389	* lb faults)
				390	*
				391	* When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
				392	* may be more than a byte beyond the last address.
				393	* Hence, the lb below may get an exception.
				394	*
				395	* Assumes src < THREAD_BUADDR($28)
				396	*/
				397	LOAD t0, TI_TASK($28) # load the task pointer stored at offset TI_TASK from $28
David Daney	5b3b168	2009-01-08 16:46:40 -0800	[diff] [blame]	398	LOAD t0, THREAD_BUADDR(t0) # t0 = recorded faulting address
				399	1:
				400	EXC( lb t1, 0(src), l_exc)
				401	ADD src, src, 1
				402	sb t1, 0(dst) # can't fault -- we're copy_from_user
				403	bne src, t0, 1b # stop once src reaches the faulting address
				404	ADD dst, dst, 1 # delay slot: dst advances in step with src
				405	l_exc: /* l_exc_copy falls through to here */
				406	LOAD t0, TI_TASK($28)
David Daney	5b3b168	2009-01-08 16:46:40 -0800	[diff] [blame]	407	LOAD t0, THREAD_BUADDR(t0) # t0 is just past last good address
David Daney	5b3b168	2009-01-08 16:46:40 -0800	[diff] [blame]	408	SUB len, AT, t0 # len number of uncopied bytes
David Daney	914f848	2012-06-06 14:50:17 -0700	[diff] [blame]	409	bnez t7, 2f /* Skip the zeroing out part if inatomic */
David Daney	5b3b168	2009-01-08 16:46:40 -0800	[diff] [blame]	410	/*
				411	* Here's where we rely on src and dst being incremented in tandem,
				412	* See (3) above.
				413	* dst += (fault addr - src) to put dst at first byte to clear
				414	*/
				415	ADD dst, t0 # dst (a0) += fault address; sits in the bnez delay slot, so it also runs when inatomic
				416	SUB dst, src # dst -= src: dst now points at the first byte to clear
				417	/*
				418	* Clear len bytes starting at dst. Can't call __bzero because it
				419	* might modify len. An inefficient loop for these rare times...
				420	*/
				421	beqz len, done
				422	SUB src, len, 1 # delay slot: src is reused as the loop counter, starting at len - 1
				423	1: sb zero, 0(dst)
				424	ADD dst, dst, 1
				425	bnez src, 1b
				426	SUB src, src, 1 # delay slot: count down
David Daney	914f848	2012-06-06 14:50:17 -0700	[diff] [blame]	427	2: jr ra
David Daney	5b3b168	2009-01-08 16:46:40 -0800	[diff] [blame]	428	nop
				429
				430
				431	#define SEXC(n) /* store-fault handler s_exc_p<n>u: give back the n units that were charged to len but not stored */ \
				432	s_exc_p ## n ## u: \
				433	jr ra; \
				434	ADD len, len, n*NBYTES /* delay slot of the jr: len += n units */
				435
				436	SEXC(16)
				437	SEXC(15)
				438	SEXC(14)
				439	SEXC(13)
				440	SEXC(12)
				441	SEXC(11)
				442	SEXC(10)
				443	SEXC(9)
				444	SEXC(8)
				445	SEXC(7)
				446	SEXC(6)
				447	SEXC(5)
				448	SEXC(4)
				449	SEXC(3)
				450	SEXC(2)
				451	SEXC(1)
				452
				453	s_exc_p1: /* store-fault handler for the byte-wise copy */
				454	jr ra
				455	ADD len, len, 1 # delay slot: give back the one byte that was not stored
				456	s_exc: /* returns with len unchanged; not referenced in the code shown here */
				457	jr ra
				458	nop
				459
				460	.align 5
				461	LEAF(memmove)
				462	ADD t0, a0, a2 # t0 = dst + len
				463	ADD t1, a1, a2 # t1 = src + len
				464	sltu t0, a1, t0 # dst + len <= src -> memcpy
				465	sltu t1, a0, t1 # dst >= src + len -> memcpy
				466	and t0, t1 # t0 != 0 only when the two ranges overlap
				467	beqz t0, __memcpy # no overlap: use the ordinary copy
				468	move v0, a0 /* return value; delay slot, so it is set on both paths */
				469	beqz a2, r_out # len == 0: nothing to move. NOTE(review): the delay slot appears to be the first instruction of __rmemcpy below - confirm
				470	END(memmove)
				471
				472	/* fall through to __rmemcpy */
				473	LEAF(__rmemcpy) /* a0=dst a1=src a2=len */
				474	sltu t0, a1, a0 # t0 = (src < dst)
				475	beqz t0, r_end_bytes_up # src >= dst
				476	nop
				477	ADD a0, a2 # dst = dst + len
				478	ADD a1, a2 # src = src + len
				479
				480	r_end_bytes: /* src < dst: copy byte by byte from the end towards the start */
				481	lb t0, -1(a1)
				482	SUB a2, a2, 0x1
				483	sb t0, -1(a0)
				484	SUB a1, a1, 0x1
				485	bnez a2, r_end_bytes
				486	SUB a0, a0, 0x1 # delay slot: dst steps back on both paths
				487
				488	r_out:
				489	jr ra
				490	move a2, zero # delay slot: return with a2 (len) = 0
				491
				492	r_end_bytes_up: /* src >= dst: copy byte by byte from the start upwards */
				493	lb t0, (a1)
				494	SUB a2, a2, 0x1
				495	sb t0, (a0)
				496	ADD a1, a1, 0x1
				497	bnez a2, r_end_bytes_up
				498	ADD a0, a0, 0x1 # delay slot: dst steps forward on both paths
				499
				500	jr ra
				501	move a2, zero # delay slot: return with a2 (len) = 0
				502	END(__rmemcpy)