Blame - arch/alpha/lib/ev6-clear_user.S - kernel/msm

blob: 4f42a16b7f53d18cfc08f076d540011ad6ca5215 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* arch/alpha/lib/ev6-clear_user.S
				3	* 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
				4	*
				5	* Zero user space, handling exceptions as we go.
				6	*
				7	* We have to make sure that $0 is always up-to-date and contains the
				8	* right "bytes left to zero" value (and that it is updated only _after_
				9	* a successful copy). There is also some rather minor exception setup
				10	* stuff.
				11	*
				12	* NOTE! This is not directly C-callable, because the calling semantics
				13	* are different:
				14	*
				15	* Inputs:
				16	* length in $0
				17	* destination address in $6
				18	* exception pointer in $7
				19	* return address in $28 (exceptions expect it there)
				20	*
				21	* Outputs:
				22	* bytes left to zero in $0
				23	*
				24	* Clobbers:
				25	* $1,$2,$3,$4,$5,$6
				26	*
				27	* Much of the information about 21264 scheduling/coding comes from:
				28	* Compiler Writer's Guide for the Alpha 21264
				29	* abbreviated as 'CWG' in other comments here
				30	* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
				31	* Scheduling notation:
				32	* E - either cluster
				33	* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
				34	* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
				35	* Try not to change the actual algorithm if possible for consistency.
				36	* Determining actual stalls (other than slotting) doesn't appear to be easy to do.
				37	* From perusing the source code context where this routine is called, it is
				38	* a fair assumption that significant fractions of entire pages are zeroed, so
				39	* it's going to be worth the effort to hand-unroll a big loop, and use wh64.
				40	* ASSUMPTION:
				41	* The believed purpose of only updating $0 after a store is that a signal
				42	* may come along during the execution of this chunk of code, and we don't
				43	* want to leave a hole (and we also want to avoid repeating lots of work)
				44	*/
				45
				46	/*
					* EX(insn): allow an exception for an insn; exit if we get one.
					*
					* The insn is emitted at local label 99 and an entry for it is placed
					* in the __ex_table section: a .long holding the offset from the entry
					* to the insn, followed by an lda word whose displacement field is
					* $exception-99b, i.e. the distance from the insn to the $exception
					* label of the routine that uses the macro.
					* NOTE(review): how the fault handler decodes that lda word is not
					* visible in this file - confirm against the exception fixup code.
					*/
				47	#define EX(x,y...) \
				48	99: x,##y; \
				49	.section __ex_table,"a"; \
				50	.long 99b - .; \
				51	lda $31, $exception-99b($31); \
				52	.previous
				53
				54	.set noat
				55	.set noreorder
				56	.align 4
				57
				58	.globl __do_clear_user
				59	.ent __do_clear_user
				60	.frame $30, 0, $28
				61	.prologue 0
				62
				63	# Pipeline info : Slotting & Comments
				64	__do_clear_user:
				65	and $6, 7, $4 # .. E .. .. : find dest head misalignment
				66	beq $0, $zerolength # U .. .. .. : U L U L
				67
				68	addq $0, $4, $1 # .. .. .. E : bias counter
				69	and $1, 7, $2 # .. .. E .. : number of misaligned bytes in tail
				70	# Note - we never actually use $2, so this is a moot computation
				71	# and we can rewrite this later...
				72	srl $1, 3, $1 # .. E .. .. : number of quadwords to clear
				73	beq $4, $headalign # U .. .. .. : U L U L
				74
				75	/*
				76	* Head is not aligned. Clear the (8 - $4) bytes up to the next quadword
				77	* boundary and leave $6 on it (0mod8): the stb tail needs an exact address.
				78	*/
				79	EX( ldq_u $5, 0($6) ) # .. .. .. L : load dst word to mask back in
				80	beq $1, $onebyte # .. .. U .. : sub-word store?
				81	mskql $5, $6, $5 # .. U .. .. : take care of misaligned head
				82	subq $4, 8, $4 # E .. .. .. : L U U L : $4 = misalignment - 8
				83
				84	EX( stq_u $5, 0($6) ) # .. .. .. L : stq_u ignores the low 3 address bits
				85	subq $1, 1, $1 # .. .. E .. :
				86	subq $6, $4, $6 # .. E .. .. : dest += 8 - misalignment, now 0mod8
				87	addq $0, $4, $0 # E .. .. .. : U L U L : bytes left -= 8 - misalignment
				88
				89	.align 4
				90	/*
				91	* (The .align directive ought to be a moot point)
				92	* values upon initial entry to the loop
				93	* $1 is number of quadwords to clear (zero is a valid value)
				94	* $2 is number of trailing bytes (0..7) ($2 never used...)
				95	* $6 is known to be aligned 0mod8
				96	*/
				97	$headalign:
				98	subq $1, 16, $4 # .. .. .. E : If < 16, we can not use the huge loop
				99	and $6, 0x3f, $2 # .. .. E .. : Forward work for huge loop
				100	subq $2, 0x40, $3 # .. E .. .. : bias counter (huge loop), -64..-1
				101	blt $4, $trailquad # U .. .. .. : U L U L
				102
				103	/*
				104	* We know that we're going to do at least 16 quads, which means we are
				105	* going to be able to use the large block clear loop at least once.
				106	* Figure out how many quads we need to clear before we are 0mod64 aligned
				107	* so we can use the wh64 instruction.
				108	*/
				109
				110	nop # .. .. .. E
				111	nop # .. .. E ..
				112	nop # .. E .. ..
				113	beq $2, $bigalign # U .. .. .. : U L U L : Aligned 0mod64 ($3 is never 0)
				114
				115	$alignmod64:
				116	EX( stq_u $31, 0($6) ) # .. .. .. L
				117	addq $3, 8, $3 # .. .. E ..
				118	subq $0, 8, $0 # .. E .. ..
				119	nop # E .. .. .. : U L U L
				120
				121	nop # .. .. .. E
				122	subq $1, 1, $1 # .. .. E ..
				123	addq $6, 8, $6 # .. E .. ..
				124	blt $3, $alignmod64 # U .. .. .. : U L U L
				125
				126	$bigalign:
				127	/*
				128	* $0 is the number of bytes left
				129	* $1 is the number of quads left
				130	* $6 is aligned 0mod64
				131	* we know that we'll be taking a minimum of one trip through the loop
				132	* CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
				133	* We are _not_ going to update $0 after every single store. That
				134	* would be silly, because there will be cross-cluster dependencies
				135	* no matter how the code is scheduled. By doing it in slightly
				136	* staggered fashion, we can still do this loop in 5 fetches
				137	* The worst case will be doing two extra quads in some future execution,
				138	* in the event of an interrupted clear.
				139	* Assumes the wh64 needs to be for 2 trips through the loop in the future
				140	* The wh64 is issued for the starting destination address for trip +2
				141	* through the loop, and if there are less than two trips left, the target
				142	* address will be for the current trip.
				143	*/
				144	nop # E :
				145	nop # E :
				146	nop # E :
				147	bis $6,$6,$3 # E : U L U L : Initial wh64 address is dest
				148	/* This might actually help for the current trip... */
				149
				150	$do_wh64:
				151	wh64 ($3) # .. .. .. L1 : memory subsystem hint
				152	subq $1, 16, $4 # .. .. E .. : Forward calculation - repeat the loop?
				153	EX( stq_u $31, 0($6) ) # .. L .. ..
				154	subq $0, 8, $0 # E .. .. .. : U L U L
				155
				156	addq $6, 128, $3 # E : Target address of wh64
				157	EX( stq_u $31, 8($6) ) # L :
				158	EX( stq_u $31, 16($6) ) # L :
				159	subq $0, 16, $0 # E : U L L U
				160
				161	nop # E :
				162	EX( stq_u $31, 24($6) ) # L :
				163	EX( stq_u $31, 32($6) ) # L :
				164	subq $0, 168, $5 # E : U L L U : two trips through the loop left?
				165	/* 168 = 192 - 24, since we've already completed some stores */
				166
				167	subq $0, 16, $0 # E :
				168	EX( stq_u $31, 40($6) ) # L :
				169	EX( stq_u $31, 48($6) ) # L :
				170	cmovlt $5, $6, $3 # E : U L L U : Latency 2, extra mapping cycle
				171
				172	subq $1, 8, $1 # E :
				173	subq $0, 16, $0 # E :
				174	EX( stq_u $31, 56($6) ) # L :
				175	nop # E : U L U L
				176
				177	nop # E :
				178	subq $0, 8, $0 # E :
				179	addq $6, 64, $6 # E :
				180	bge $4, $do_wh64 # U : U L U L
				181
				182	$trailquad:
				183	# fewer than 16 quadwords left to store, plus any trailing bytes
				184	# $1 is the number of quadwords left to go.
				185	#
				186	nop # .. .. .. E
				187	nop # .. .. E ..
				188	nop # .. E .. ..
				189	beq $1, $trailbytes # U .. .. .. : U L U L : Only 0..7 bytes to go
				190
				191	$onequad:
				192	EX( stq_u $31, 0($6) ) # .. .. .. L
				193	subq $1, 1, $1 # .. .. E ..
				194	subq $0, 8, $0 # .. E .. ..
				195	nop # E .. .. .. : U L U L
				196
				197	nop # .. .. .. E
				198	nop # .. .. E ..
				199	addq $6, 8, $6 # .. E .. ..
				200	bgt $1, $onequad # U .. .. .. : U L U L
				201
				202	# We have an unknown number of bytes left to go.
				203	$trailbytes:
				204	nop # .. .. .. E
				205	nop # .. .. E ..
				206	nop # .. E .. ..
				207	beq $0, $zerolength # U .. .. .. : U L U L
				208
				209	# $0 contains the number of bytes left to zero (1..7)
				210	# so we will use $0 as the loop counter; $6 must be the exact byte address
				211	# We know for a fact that $0 > 0 due to previous context
				212	$onebyte:
				213	EX( stb $31, 0($6) ) # .. .. .. L
				214	subq $0, 1, $0 # .. .. E .. :
				215	addq $6, 1, $6 # .. E .. .. :
				216	bgt $0, $onebyte # U .. .. .. : U L U L
				217
				218	$zerolength:
				219	$exception: # Destination for exception recovery(?)
				220	nop # .. .. .. E :
				221	nop # .. .. E .. :
				222	nop # .. E .. .. :
				223	ret $31, ($28), 1 # L0 .. .. .. : L U L U
				224	.end __do_clear_user
				225