Blame - arch/alpha/lib/ev6-copy_page.S - kernel/msm

blob: b789db19275443d092494ea80a41be772dedb96a [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame^]	1	/*
				2	* arch/alpha/lib/ev6-copy_page.S
				3	*
				4	* Copy an entire page.
				5	*/
				6
				7	/* The following comparison of this routine vs the normal copy_page.S
				8	was written by an unnamed ev6 hardware designer and forwarded to me
				9	via Steven Hobbs <hobbs@steven.zko.dec.com>.
				10
				11	First Problem: STQ overflows.
				12	-----------------------------
				13
				14	It would be nice if EV6 handled every resource overflow efficiently,
				15	but for some it doesn't, including store queue overflows. Such an
				16	overflow causes a trap and a restart of the pipe.
				17
				18	To get around this we sometimes use (to borrow a term from a VSSAD
				19	researcher) "aeration". The idea is to slow the rate at which the
				20	processor receives valid instructions by inserting nops in the fetch
				21	path. In doing so, you can prevent the overflow and actually make
				22	the code run faster. You can, of course, take advantage of the fact
				23	that the processor can fetch at most 4 aligned instructions per cycle.
				24
				25	I inserted enough nops to force it to take 10 cycles to fetch the
				26	loop code. In theory, EV6 should be able to execute this loop in
				27	9 cycles but I was not able to get it to run that fast -- the initial
				28	conditions were such that I could not reach this optimum rate on
				29	(chaotic) EV6. I wrote the code such that everything would issue
				30	in order.
				31
				32	Second Problem: Dcache index matches.
				33	-------------------------------------
				34
				35	If you are going to use this routine on randomly aligned pages, there
				36	is a 25% chance that the pages will be at the same dcache indices.
				37	Without care, this results in many nasty memory traps.
				38
				39	The solution is to schedule the prefetches to avoid the memory
				40	conflicts. I schedule the wh64 prefetches farther ahead of the
				41	read prefetches to avoid this problem.
				42
				43	Third Problem: Needs more prefetching.
				44	--------------------------------------
				45
				46	In order to improve the code I added deeper prefetching to take the
				47	most advantage of EV6's bandwidth.
				48
				49	I also prefetched the read stream. Note that adding the read prefetch
				50	forced me to add another cycle to the inner-most kernel - up to 11
				51	from the original 8 cycles per iteration. We could improve performance
				52	further by unrolling the loop and doing multiple prefetches per cycle.
				53
				54	I think that the code below will be very robust and fast code for the
				55	purposes of copying aligned pages. It is slower when both source and
				56	destination pages are in the dcache, but it is my guess that this is
				57	less important than the dcache miss case. */
				58
				59
				60	.text
				61	.align 4
				62	.global copy_page
				63	.ent copy_page
					/*
					 * copy_page: copy 128 cache lines of 64 bytes each (8192 bytes in
					 * total: 118 lines in the main loop plus 10 in the cleanup loop).
					 *
					 * In:    $16 = address that is stored to (destination)
					 *        $17 = address that is loaded from (source)
					 *        NOTE(review): presumably these are the first and second
					 *        arguments under the Alpha calling convention - confirm
					 *        against the C prototype.
					 * Out:   nothing is placed in a result register by this routine.
					 * Clobb: $0-$7 (data temporaries; $1/$2 also hold write-hint
					 *        addresses before the main loop), $18 (loop counter),
					 *        $19 (running write-hint address). $16 and $17 are each
					 *        advanced by 64 per copied line, so both end 8192 bytes
					 *        past their entry values.
					 * Stack: not used.
					 *
					 * Instructions are laid out in blank-line-separated groups of
					 * four, padded with nop/unop. Per the comment at the top of the
					 * file this padding ("aeration") is deliberate, so do not reorder
					 * or remove instructions without re-measuring.
					 */
				64	copy_page:
				65	.prologue 0
				66
				67	/* Prefetch 5 read cachelines; write-hint 10 cache lines. */
				68	wh64 ($16)	/* write-hint destination line 0 */
				69	ldl $31,0($17)	/* loads into $31 serve as read prefetches: source line 0 */
				70	ldl $31,64($17)	/* source line 1 */
				71	lda $1,1*64($16)	/* $1 = address of destination line 1 */
				72
				73	wh64 ($1)
				74	ldl $31,128($17)	/* source line 2 */
				75	ldl $31,192($17)	/* source line 3 */
				76	lda $1,2*64($16)
				77
				78	wh64 ($1)
				79	ldl $31,256($17)	/* source line 4 (fifth and last pre-loop read prefetch) */
				80	lda $18,118	/* $18 = main loop trip count: 118 lines */
				81	lda $1,3*64($16)
				82
				83	wh64 ($1)
				84	nop
				85	lda $1,4*64($16)
				86	lda $2,5*64($16)
				87
				88	wh64 ($1)	/* destination lines 4 and 5 */
				89	wh64 ($2)
				90	lda $1,6*64($16)
				91	lda $2,7*64($16)
				92
				93	wh64 ($1)	/* destination lines 6 and 7 */
				94	wh64 ($2)
				95	lda $1,8*64($16)
				96	lda $2,9*64($16)
				97
				98	wh64 ($1)	/* destination lines 8 and 9 */
				99	wh64 ($2)
				100	lda $19,10*64($16)	/* $19 = next line to write-hint (destination line 10) */
				101	nop
				102
				103	/* Main prefetching/write-hinting loop. */
					/* Each pass copies one 64-byte line: eight quadword loads into
					   $0-$7, then eight quadword stores. The body is 11 groups of
					   four instructions; unop entries are padding only. */
				104	1: ldq $0,0($17)
				105	ldq $1,8($17)
				106	unop
				107	unop
				108
				109	unop
				110	unop
				111	ldq $2,16($17)
				112	ldq $3,24($17)
				113
				114	ldq $4,32($17)
				115	ldq $5,40($17)
				116	unop
				117	unop
				118
				119	unop
				120	unop
				121	ldq $6,48($17)
				122	ldq $7,56($17)
				123
				124	ldl $31,320($17)	/* read prefetch 5 lines (320 bytes) ahead of the line being copied */
				125	unop
				126	unop
				127	unop
				128
				129	/* This gives the extra cycle of aeration above the minimum. */
				130	unop
				131	unop
				132	unop
				133	unop
				134
				135	wh64 ($19)	/* write-hint 10 lines ahead of the line being stored */
				136	unop
				137	unop
				138	unop
				139
				140	stq $0,0($16)
				141	subq $18,1,$18	/* one fewer line left for the main loop */
				142	stq $1,8($16)
				143	unop
				144
				145	unop
				146	stq $2,16($16)
				147	addq $17,64,$17	/* advance source to the next line (all loads for this line are above) */
				148	stq $3,24($16)
				149
				150	stq $4,32($16)
				151	stq $5,40($16)
				152	addq $19,64,$19	/* advance the write-hint address by one line */
				153	unop
				154
				155	stq $6,48($16)
				156	stq $7,56($16)
				157	addq $16,64,$16	/* advance destination to the next line */
				158	bne $18, 1b	/* repeat until all 118 lines are done */
				159
				160	/* Prefetch the final 5 cache lines of the read stream. */
					/* $17 now points at source line 118; the loop already prefetched
					   through line 122, so offsets 320..576 cover lines 123..127. */
				161	lda $18,10	/* $18 = cleanup loop trip count: 10 remaining lines */
				162	ldl $31,320($17)
				163	ldl $31,384($17)
				164	ldl $31,448($17)
				165
				166	ldl $31,512($17)
				167	ldl $31,576($17)
				168	nop
				169	nop
				170
				171	/* Non-prefetching, non-write-hinting cleanup loop for the
				172	final 10 cache lines. */
					/* Every one of the 128 lines has already been prefetched and
					   write-hinted by this point (the last main-loop wh64 targeted
					   destination line 127), so this loop only loads and stores. */
				173	2: ldq $0,0($17)
				174	ldq $1,8($17)
				175	ldq $2,16($17)
				176	ldq $3,24($17)
				177
				178	ldq $4,32($17)
				179	ldq $5,40($17)
				180	ldq $6,48($17)
				181	ldq $7,56($17)
				182
				183	stq $0,0($16)
				184	subq $18,1,$18	/* one fewer line left for the cleanup loop */
				185	stq $1,8($16)
				186	addq $17,64,$17	/* advance source to the next line */
				187
				188	stq $2,16($16)
				189	stq $3,24($16)
				190	stq $4,32($16)
				191	stq $5,40($16)
				192
				193	stq $6,48($16)
				194	stq $7,56($16)
				195	addq $16,64,$16	/* advance destination to the next line */
				196	bne $18, 2b	/* repeat until the last 10 lines are done */
				197
				198	ret	/* the three no-ops below fill out this final group of four */
				199	nop
				200	unop
				201	nop
				202
				203	.end copy_page