Blame - arch/alpha/lib/ev6-strncpy_from_user.S - kernel/msm-4.19

blob: d2e28178caccc8b59fb2aea43b88e2934e54a7db [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* arch/alpha/lib/ev6-strncpy_from_user.S
				3	* 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
				4	*
				5	* Just like strncpy except in the return value:
				6	*
				7	* -EFAULT if an exception occurs before the terminator is copied.
				8	* N if the buffer filled.
				9	*
				10	* Otherwise the length of the string is returned.
				11	*
				12	* Much of the information about 21264 scheduling/coding comes from:
				13	* Compiler Writer's Guide for the Alpha 21264
				14	* abbreviated as 'CWG' in other comments here
				15	* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
				16	* Scheduling notation:
				17	* E - either cluster
				18	* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
				19	* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
				20	* A bunch of instructions got moved and temp registers were changed
				21	* to aid in scheduling. Control flow was also re-arranged to eliminate
				22	* branches, and to provide longer code sequences to enable better scheduling.
				23	* Using byte loads/stores for the start & tail sequences
				24	* is desirable, but very difficult to do without a from-scratch rewrite.
				25	* Save that for the future.
				26	*/
				27
				28
				29	#include <asm/errno.h>
				30	#include <asm/regdef.h>
				31
				32
				33	/* Allow an exception for an insn; exit if we get one.
					 *
					 * EX(insn) emits insn at local label 99 and records it in the
					 * __ex_table section: a self-relative .long that points back at the
					 * insn, followed by an lda whose displacement is the distance from
					 * the insn to $exception.
					 * NOTE(review): presumably the fault handler decodes that lda to find
					 * the resume address and to place -EFAULT in $0 (v0) -- confirm
					 * against the alpha exception-table fixup code.
					 */
				34	#define EX(x,y...) \
				35	99: x,##y; \
				36	.section __ex_table,"a"; \
				37	.long 99b - .; \
				38	lda $31, $exception-99b($0); \
				39	.previous
				40
				41
				42	.set noat
				43	.set noreorder
				44	.text
				45
					/*
					 * __strncpy_from_user
					 *
					 * Register usage as seen in the code below:
					 *   a0 - destination buffer
					 *   a1 - source string; every load from it is wrapped in EX()
					 *   a2 - maximum number of bytes to copy
					 *   v0 - return value (see the header comment of this file);
					 *        holds the original a0 until $finish_up computes the length
					 *
					 * The copy works on whole quadwords: the first and last destination
					 * quadwords are merged with the bytes already in memory so that
					 * nothing outside the requested range is modified.
					 */
				46	.globl __strncpy_from_user
				47	.ent __strncpy_from_user
				48	.frame $30, 0, $26
				49	.prologue 0
				50
				51	.align 4
				52	__strncpy_from_user:
				53	and a0, 7, t3 # E : find dest misalignment
				54	beq a2, $zerolength # U : count == 0 -> return 0
				55
				56	/* Are source and destination co-aligned? */
				57	mov a0, v0 # E : save the string start
				58	xor a0, a1, t4 # E : t4 = dest ^ src; low 3 bits tested below
				59	EX( ldq_u t1, 0(a1) ) # L : Latency=3 load first quadword
				60	ldq_u t0, 0(a0) # L : load first (partial) aligned dest quadword
				61
				62	addq a2, t3, a2 # E : bias count by dest misalignment
				63	subq a2, 1, a3 # E : a3 = biased count - 1
				64	addq zero, 1, t10 # E : t10 = 1, shifted into position below
				65	and t4, 7, t4 # E : misalignment between the two
				66
				67	and a3, 7, t6 # E : number of tail bytes
				68	sll t10, t6, t10 # E : t10 = bitmask of last count byte
				69	bne t4, $unaligned # U : src/dest misaligned relative to each other
				70	lda t2, -1 # E : build a mask against false zero
				71
				72	/*
				73	* We are co-aligned; take care of a partial first word.
				74	* On entry to this basic block:
				75	* t0 == the first destination word for masking back in
				76	* t1 == the first source word.
				77	*/
				78
				79	srl a3, 3, a2 # E : a2 = loop counter = (count - 1)/8
				80	addq a1, 8, a1 # E : a1 += 8; its low 3 bits (used by msk below) are unchanged
				81	mskqh t2, a1, t2 # U : detection in the src word
				82	nop
				83
				84	/* Create the 1st output word and detect 0's in the 1st input word. */
				85	mskqh t1, a1, t3 # U : t3 = src word with bytes below the start offset cleared
				86	mskql t0, a1, t0 # U : assemble the first output word
				87	ornot t1, t2, t2 # E : t2 = src word with bytes below the start forced non-zero
				88	nop
				89
				90	cmpbge zero, t2, t8 # E : bits set iff null found
				91	or t0, t3, t0 # E : t0 = first output word (old dest bytes | src bytes)
				92	beq a2, $a_eoc # U : count ends within this first word
				93	bne t8, $a_eos # U : 2nd branch in a quad. Bad.
				94
				95	/* On entry to this basic block:
				96	* t0 == a source quad not containing a null.
				97	* a0 - current aligned destination address
				98	* a1 - current aligned source address
				99	* a2 - count of quadwords to move.
				100	* NOTE: Loop improvement - unrolling this is going to be
				101	* a huge win, since we're going to stall otherwise.
				102	* Fix this later. For _really_ large copies, look
				103	* at using wh64 on a look-ahead basis. See the code
				104	* in clear_user.S and copy_user.S.
				105	* Presumably (a0) and (a1) do not overlap (by C definition).
				106	* Lots of nops here:
				107	* - Separate loads from stores
				108	* - Keep it to 1 branch/quadpack so the branch predictor
				109	* can train.
				110	*/
				111	$a_loop:
				112	stq_u t0, 0(a0) # L : store the completed word
				113	addq a0, 8, a0 # E : advance dest
				114	nop
				115	subq a2, 1, a2 # E : one fewer quadword to move
				116
				117	EX( ldq_u t0, 0(a1) ) # L : load next source quadword
				118	addq a1, 8, a1 # E : advance src
				119	cmpbge zero, t0, t8 # E : Stall 2 cycles on t0
				120	beq a2, $a_eoc # U : count exhausted -> final word
				121
				122	beq t8, $a_loop # U : no null in t0 -> keep copying
				123	nop
				124	nop
				125	nop
				126
				127	/* Take care of the final (partial) word store. At this point
				128	* the end-of-count bit is set in t8 iff it applies.
				129	*
				130	* On entry to this basic block we have:
				131	* t0 == the source word containing the null
				132	* t8 == the cmpbge mask that found it.
				133	*
					* On exit t12 holds the single bit for the last byte written;
					* $finish_up reads it.
					*/
				134	$a_eos:
				135	negq t8, t12 # E : find low bit set
				136	and t8, t12, t12 # E : t12 = lowest set bit of t8
				137
				138	/* We're doing a partial word store and so need to combine
				139	our source and original destination words. */
				140	ldq_u t1, 0(a0) # L : fetch the current dest word
				141	subq t12, 1, t6 # E : t6 = all bits below that lowest bit
				142
				143	or t12, t6, t8 # E : t8 = byte mask of bytes <= the last byte
				144	zapnot t0, t8, t0 # U : clear src bytes > null
				145	zap t1, t8, t1 # U : clear dst bytes <= null
				146	or t0, t1, t0 # E : merged final word
				147
				148	stq_u t0, 0(a0) # L : store the final word
				149	br $finish_up # L0 :
				150	nop
				151	nop
				152
				153	/* Add the end-of-count bit to the eos detection bitmask. */
				154	.align 4
				155	$a_eoc:
				156	or t10, t8, t8 # t8 |= bit of the last byte allowed by the count
				157	br $a_eos # finish through the common partial-word store
				158	nop
				159	nop
				160
				161
				162	/* The source and destination are not co-aligned. Align the destination
				163	and cope. We have to be very careful about not reading too much and
				164	causing a SEGV. */
				165
				166	.align 4
				167	$u_head:
				168	/* We know just enough now to be able to assemble the first
				169	full source word. We can still find a zero at the end of it
				170	that prevents us from outputting the whole thing.
				171
				172	On entry to this basic block:
				173	t0 == the first dest word, unmasked
				174	t1 == the shifted low bits of the first source word
				175	t6 == bytemask that is -1 in dest word bytes */
				176
				177	EX( ldq_u t2, 8(a1) ) # L : load second src word
				178	addq a1, 8, a1 # E : a1 now addresses that second word
				179	mskql t0, a0, t0 # U : mask trailing garbage in dst
				180	extqh t2, a1, t4 # U : high part of the first aligned src word
				181
				182	or t1, t4, t1 # E : first aligned src word complete
				183	mskqh t1, a0, t1 # U : mask leading garbage in src
				184	or t0, t1, t0 # E : first output word complete
				185	or t0, t6, t6 # E : mask original data for zero test
				186
				187	cmpbge zero, t6, t8 # E : null among the new bytes of the first word?
				188	beq a2, $u_eocfin # U : count ends within the first word
				189	bne t8, $u_final # U : bad news - 2nd branch in a quad
				190	lda t6, -1 # E : mask out the bits we have
				191
				192	mskql t6, a1, t6 # U : already seen
				193	stq_u t0, 0(a0) # L : store first output word
				194	or t6, t2, t2 # E : force the already-consumed bytes non-zero
				195	cmpbge zero, t2, t8 # E : find nulls in second partial
				196
				197	addq a0, 8, a0 # E : advance dest
				198	subq a2, 1, a2 # E : one fewer quadword to move
				199	bne t8, $u_late_head_exit # U : null in the rest of the second src word
				200	nop
				201
				202	/* Finally, we've got all the stupid leading edge cases taken care
				203	of and we can set up to enter the main loop. */
				204
				205	extql t2, a1, t1 # U : position hi-bits of lo word
				206	EX( ldq_u t2, 8(a1) ) # L : read next high-order source word
				207	addq a1, 8, a1 # E : a1 now addresses the word just loaded
				208	cmpbge zero, t2, t8 # E : null in the new src word?
				209
				210	beq a2, $u_eoc # U : count exhausted -> final word
				211	bne t8, $u_eos # U : null found -> tail handling
				212	nop
				213	nop
				214
				215	/* Unaligned copy main loop. In order to avoid reading too much,
				216	the loop is structured to detect zeros in aligned source words.
				217	This has, unfortunately, effectively pulled half of a loop
				218	iteration out into the head and half into the tail, but it does
				219	prevent nastiness from accumulating in the very thing we want
				220	to run as fast as possible.
				221
				222	On entry to this basic block:
				223	t1 == the shifted high-order bits from the previous source word
				224	t2 == the unshifted current source word
				225
				226	We further know that t2 does not contain a null terminator. */
				227
				228	/*
				229	* Extra nops here:
				230	* separate load quads from store quads
				231	* only one branch/quad to permit predictor training
				232	*/
				233
				234	.align 4
				235	$u_loop:
				236	extqh t2, a1, t0 # U : extract high bits for current word
				237	addq a1, 8, a1 # E : advance src
				238	extql t2, a1, t3 # U : extract low bits for next time
				239	addq a0, 8, a0 # E : advance dest (the store below uses -8)
				240
				241	or t0, t1, t0 # E : current dst word now complete
				242	EX( ldq_u t2, 0(a1) ) # L : load high word for next time
				243	subq a2, 1, a2 # E : one fewer quadword to move
				244	nop
				245
				246	stq_u t0, -8(a0) # L : save the current word
				247	mov t3, t1 # E : carry the low bits into the next iteration
				248	cmpbge zero, t2, t8 # E : test new word for eos
				249	beq a2, $u_eoc # U : count exhausted -> final word
				250
				251	beq t8, $u_loop # U : no null -> next word
				252	nop
				253	nop
				254	nop
				255
				256	/* We've found a zero somewhere in the source word we just read.
				257	If it resides in the lower half, we have one (probably partial)
				258	word to write out, and if it resides in the upper half, we
				259	have one full and one partial word left to write out.
				260
				261	On entry to this basic block:
				262	t1 == the shifted high-order bits from the previous source word
				263	t2 == the unshifted current source word. */
				264	.align 4
				265	$u_eos:
				266	extqh t2, a1, t0 # U : high bits for the current dst word
				267	or t0, t1, t0 # E : first (partial) source word complete
				268	cmpbge zero, t0, t8 # E : is the null in this first bit?
				269	nop
				270
				271	bne t8, $u_final # U : yes -> this is the last word
				272	stq_u t0, 0(a0) # L : the null was in the high-order bits
				273	addq a0, 8, a0 # E : advance dest
				274	subq a2, 1, a2 # E : one fewer quadword to move; falls through
				275
					/* Build the last dst word from the bytes of t2 that are still
					unwritten, then fall through into $u_final. */
				276	.align 4
				277	$u_late_head_exit:
				278	extql t2, a1, t0 # U : remaining src bytes, shifted into place
				279	cmpbge zero, t0, t8 # E : t8 = mask of zero bytes in that word
				280	or t8, t10, t6 # E : t6 = same mask plus the end-of-count bit
				281	cmoveq a2, t6, t8 # E : use t6 when the quadword count is exhausted
				282
				283	/* Take care of a final (probably partial) result word.
				284	On entry to this basic block:
				285	t0 == assembled source word
				286	t8 == cmpbge mask that found the null.
					On exit t12 holds the single bit for the last byte written;
					$finish_up reads it. */
				287	.align 4
				288	$u_final:
				289	negq t8, t6 # E : isolate low bit set
				290	and t6, t8, t12 # E : t12 = lowest set bit of t8
				291	ldq_u t1, 0(a0) # L : fetch the current dest word
				292	subq t12, 1, t6 # E : t6 = all bits below that lowest bit
				293
				294	or t6, t12, t8 # E : t8 = byte mask of bytes <= the last byte
				295	zapnot t0, t8, t0 # U : kill source bytes > null
				296	zap t1, t8, t1 # U : kill dest bytes <= null
				297	or t0, t1, t0 # E : merged final word
				298
				299	stq_u t0, 0(a0) # L : store the final word
				300	br $finish_up # U :
				301	nop
				302	nop
				303
					/* The quadword count ran out: assemble the last dst word, add the
					end-of-count bit to the null mask and finish through $u_final. */
				304	.align 4
				305	$u_eoc: # end-of-count
				306	extqh t2, a1, t0 # U : high bits for the final dst word
				307	or t0, t1, t0 # E : final dst word assembled
				308	cmpbge zero, t0, t8 # E : any null within it?
				309	nop
				310
				311	.align 4
				312	$u_eocfin: # end-of-count, final word
				313	or t10, t8, t8 # E : add the end-of-count bit
				314	br $u_final # U :
				315	nop
				316	nop
				317
				318	/* Unaligned copy entry point. */
				319	.align 4
				320	$unaligned:
				321
				322	srl a3, 3, a2 # U : a2 = loop counter = (count - 1)/8
				323	and a0, 7, t4 # E : find dest misalignment
				324	and a1, 7, t5 # E : find src misalignment
				325	mov zero, t0 # E : default when dest is aligned: no old dest bytes
				326
				327	/* Conditionally load the first destination word and a bytemask
				328	with 0xff indicating that the destination byte is sacrosanct. */
				329
				330	mov zero, t6 # E : default: no sacrosanct bytes
				331	beq t4, 1f # U : dest aligned -> skip the dest load
				332	ldq_u t0, 0(a0) # L : first dest word
				333	lda t6, -1 # E : all ones, trimmed by the mskql below
				334
				335	mskql t6, a0, t6 # E : t6 = 0xff in bytes below the dest start offset
				336	nop
				337	nop
				338	nop
				339
				340	.align 4
				341	1:
				342	subq a1, t4, a1 # E : sub dest misalignment from src addr
				343	/* If source misalignment is larger than dest misalignment, we need
				344	extra startup checks to avoid SEGV. */
				345	cmplt t4, t5, t12 # E : t12 = (dest misalignment < src misalignment)
				346	extql t1, a1, t1 # U : shift src into place
				347	lda t2, -1 # E : for creating masks later
				348
				349	beq t12, $u_head # U : not larger -> normal head
				350	mskqh t2, t5, t2 # U : begin src byte validity mask
				351	cmpbge zero, t1, t8 # E : is there a zero?
				352	nop
				353
				354	extql t2, a1, t2 # U : shift the validity mask the same way as the src
				355	or t8, t10, t5 # E : test for end-of-count too
				356	cmpbge zero, t2, t3 # E : t3 = bytes that are not valid src data
				357	cmoveq a2, t5, t8 # E : Latency=2, extra map slot
				358
				359	nop # E : goes with cmov
				360	andnot t8, t3, t8 # E : ignore hits in bytes that are not valid
				361	beq t8, $u_head # U : nothing found -> normal head
				362	nop
				363
				364	/* At this point we've found a zero in the first partial word of
				365	the source. We need to isolate the valid source data and mask
				366	it into the original destination data. (Incidentally, we know
				367	that we'll need at least one byte of that original dest word.) */
				368
				369	ldq_u t0, 0(a0) # L : (re)load the dest word
				370	negq t8, t6 # E : build bitmask of bytes <= zero
				371	mskqh t1, t4, t1 # U : drop src bytes below the dest start offset
				372	and t6, t8, t12 # E : t12 = lowest set bit of t8 (read by $finish_up)
				373
				374	subq t12, 1, t6 # E : t6 = all bits below that lowest bit
				375	or t6, t12, t8 # E : t8 = byte mask of bytes <= the last byte
				376	zapnot t2, t8, t2 # U : prepare source word; mirror changes
				377	zapnot t1, t8, t1 # U : to source validity mask
				378
				379	andnot t0, t2, t0 # E : zero place for source to reside
				380	or t0, t1, t0 # E : and put it there
				381	stq_u t0, 0(a0) # L : store the only word; falls through to $finish_up
				382	nop
				383
					/* Compute the return value.
					On entry to this basic block:
					a0 == address of the last quadword stored
					t0 == the word that was stored there
					t12 == single bit selecting the last byte written
					v0 == the original destination address */
				384	.align 4
				385	$finish_up:
				386	zapnot t0, t12, t4 # U : was last byte written null?
				387	and t12, 0xf0, t3 # E : binary search for the address of the
				388	cmovne t4, 1, t4 # E : Latency=2, extra map slot
				389	nop # E : with cmovne
				390
				391	and t12, 0xcc, t2 # E : last byte written
				392	and t12, 0xaa, t1 # E : (t3/t2/t1 select bits 2/1/0 of the byte index)
				393	cmovne t3, 4, t3 # E : Latency=2, extra map slot
				394	nop # E : with cmovne
				395
				396	bic a0, 7, t0 # t0 = aligned address of the last quadword stored
				397	cmovne t2, 2, t2 # E : Latency=2, extra map slot
				398	nop # E : with cmovne
				399	nop
				400
				401	cmovne t1, 1, t1 # E : Latency=2, extra map slot
				402	nop # E : with cmovne
				403	addq t0, t3, t0 # E : t0 += 4 if the byte is in the upper half
				404	addq t1, t2, t1 # E : t1 = low two bits of the byte index
				405
				406	addq t0, t1, t0 # E : t0 = address of the last byte written
				407	addq t0, t4, t0 # add one if we filled the buffer
				408	subq t0, v0, v0 # find string length
				409	ret # L0 :
				410
					/* Zero-length request: return 0 by falling through into the
					common return at $exception. */
				411	.align 4
				412	$zerolength:
				413	nop
				414	nop
				415	nop
				416	clr v0 # count was zero: result is 0
				417
					/* Target named by every EX() fixup in this file; v0 is returned
					unchanged from here.
					NOTE(review): presumably v0 already holds -EFAULT when a fault
					brings us here -- confirm against the alpha fault handler. */
				418	$exception:
				419	nop
				420	nop
				421	nop
				422	ret # return with v0 as it stands
				423
				424	.end __strncpy_from_user