Blame - arch/alpha/lib/ev6-memset.S - kernel/msm-4.9

blob: d8b94e1c7fcad001c32142f4e4cc51d3cc8f6a5b [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* arch/alpha/lib/ev6-memset.S
				3	*
				4	* This is an efficient (and relatively small) implementation of the C library
				5	* "memset()" function for the 21264 implementation of Alpha.
				6	*
				7	* 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
				8	*
				9	* Much of the information about 21264 scheduling/coding comes from:
				10	* Compiler Writer's Guide for the Alpha 21264
				11	* abbreviated as 'CWG' in other comments here
				12	* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
				13	* Scheduling notation:
				14	* E - either cluster
				15	* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
				16	* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
				17	* The algorithm for the leading and trailing quadwords remains the same,
				18	* however the loop has been unrolled to enable better memory throughput,
				19	* and the code has been replicated for each of the entry points: __memset
				20	* and __memsetw to permit better scheduling to eliminate the stalling
				21	* encountered during the mask replication.
				22	* A future enhancement might be to put in a byte store loop for really
				23	* small (say < 32 bytes) memset()s. Whether or not that change would be
				24	* a win in the kernel would depend upon the contextual usage.
				25	* WARNING: Maintaining this is going to be more work than maintaining a
				26	* single copy of the code, as fixes will need to be made in multiple places
				27	* (once per replicated entry point). The performance gain is worth it.
				28	*/
				29
				30	.set noat # do not let the assembler use the $at temporary register on its own
				31	.set noreorder # keep the instructions in the order written below
				32	.text # everything that follows goes into the code section
				33	.globl __memset # entry point: byte fill
				34	.globl __memsetw # entry point: 16-bit word fill
				35	.globl __constant_c_memset # entry point: fill with a pattern already held in $17
				36	.globl memset # alias of __memset, assigned at the end of the file
				37
				38	.ent __memset # memset(dest, c, n): fill n bytes at dest with the byte c
				39	.align 5 # align the entry point to 2^5 = 32 bytes
				40	__memset: # In: $16 = dest, $17 = c, $18 = n (bytes). Out: $0 = dest
				41	.frame $30,0,$26,0 # frame size 0; return address is in $26
				42	.prologue 0 # NOTE(review): presumably marks a prologue with no gp load - confirm
				43
				44	/*
				45	* Serious stalling happens. The only way to mitigate this is to
				46	* undertake a major re-write to interleave the constant materialization
				47	* with other parts of the fall-through code. This is important, even
				48	* though it makes maintenance tougher.
				49	* Do this later.
				50	*/
				51	and $17,255,$1 # E : 00000000000000ch (keep only the low byte of c)
				52	insbl $17,1,$2 # U : 000000000000ch00
				53	bis $16,$16,$0 # E : return value (dest)
				54	ble $18,end_b # U : zero length requested? (signed test, so a negative count also returns)
				55
				56	addq $18,$16,$6 # E : max address to write to (dest + n, one past the last byte)
				57	bis $1,$2,$17 # E : 000000000000chch
				58	insbl $1,2,$3 # U : 0000000000ch0000
				59	insbl $1,3,$4 # U : 00000000ch000000
				60
				61	or $3,$4,$3 # E : 00000000chch0000
				62	inswl $17,4,$5 # U : 0000chch00000000
				63	xor $16,$6,$1 # E : will complete write be within one quadword?
				64	inswl $17,6,$2 # U : chch000000000000
				65
				66	or $17,$3,$17 # E : 00000000chchchch
				67	or $2,$5,$2 # E : chchchch00000000
				68	bic $1,7,$1 # E : fit within a single quadword? (zero if dest and dest + n differ only in the low 3 bits)
				69	and $16,7,$3 # E : Target addr misalignment
				70
				71	or $17,$2,$17 # E : chchchchchchchch
				72	beq $1,within_quad_b # U : yes: do one read-modify-write of that quadword
				73	nop # E :
				74	beq $3,aligned_b # U : target is 0mod8
				75
				76	/*
				77	* Target address is misaligned, and won't fit within a quadword.
				78	*/
				79	ldq_u $4,0($16) # L : Fetch first partial
				80	bis $16,$16,$5 # E : Save the address
				81	insql $17,$16,$2 # U : Insert new bytes
				82	subq $3,8,$3 # E : Invert (for addressing uses): $3 = misalignment - 8
				83
				84	addq $18,$3,$18 # E : $18 is new count ($3 is negative)
				85	mskql $4,$16,$4 # U : clear relevant parts of the quad
				86	subq $16,$3,$16 # E : $16 is new aligned destination
				87	bis $2,$4,$1 # E : Final bytes
				88
				89	nop # filler (completes the group of four)
				90	stq_u $1,0($5) # L : Store result
				91	nop # filler
				92	nop # filler
				93
				94	.align 4 # align the label below to 2^4 = 16 bytes
				95	aligned_b:
				96	/*
				97	* We are now guaranteed to be quad aligned, with at least
				98	* one partial quad to write.
				99	*/
				100
				101	sra $18,3,$3 # U : Number of remaining quads to write
				102	and $18,7,$18 # E : Number of trailing bytes to write
				103	bis $16,$16,$5 # E : Save dest address
				104	beq $3,no_quad_b # U : tail stuff only
				105
				106	/*
				107	* it's worth the effort to unroll this and use wh64 if possible
				108	* Lifted a bunch of code from clear_user.S
				109	* At this point, entry values are:
				110	* $16 Current destination address
				111	* $5 A copy of $16
				112	* $6 The max quadword address to write to
				113	* $18 Number trailer bytes
				114	* $3 Number quads to write
				115	*/
				116
				117	and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
				118	subq $3, 16, $4 # E : Only try to unroll if > 128 bytes
				119	subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
				120	blt $4, loop_b # U : fewer than 16 quads: use the simple loop
				121
				122	/*
				123	* We know we've got at least 16 quads, minimum of one trip
				124	* through unrolled loop. Do a quad at a time to get us 0mod64
				125	* aligned.
				126	*/
				127
				128	nop # E :
				129	nop # E :
				130	nop # E :
				131	beq $1, $bigalign_b # U : NOTE(review): $1 is ($16 & 0x3f) - 0x40, i.e. -64..-1, so this looks never taken - confirm
				132
				133	$alignmod64_b:
				134	stq $17, 0($5) # L : store one quad of the fill pattern
				135	subq $3, 1, $3 # E : For consistency later
				136	addq $1, 8, $1 # E : Increment towards zero for alignment
				137	addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
				138
				139	nop # filler (completes the group of four)
				140	nop # filler
				141	addq $5, 8, $5 # E : Inc address
				142	blt $1, $alignmod64_b # U : repeat until the bias counter reaches zero
				143
				144	$bigalign_b:
				145	/*
				146	* $3 - number quads left to go
				147	* $5 - target address (aligned 0mod64)
				148	* $17 - mask of stuff to store
				149	* Scratch registers available: $7, $2, $4, $1
				150	* we know that we'll be taking a minimum of one trip through the loop below
				151	* CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
				152	* Assumes the wh64 needs to be for 2 trips through the loop in the future
				153	* The wh64 is issued for the starting destination address for trip +2
				154	* through the loop, and if there are less than two trips left, the target
				155	* address will be for the current trip.
				156	*/
				157
				158	$do_wh64_b:
				159	wh64 ($4) # L1 : memory subsystem write hint
				160	subq $3, 24, $2 # E : For determining future wh64 addresses (negative if fewer than 24 quads are left)
				161	stq $17, 0($5) # L : first of the eight quads stored per trip
				162	nop # E :
				163
				164	addq $5, 128, $4 # E : speculative target of next wh64
				165	stq $17, 8($5) # L :
				166	stq $17, 16($5) # L :
				167	addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
				168
				169	stq $17, 24($5) # L :
				170	stq $17, 32($5) # L :
				171	cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle (take the fallback when $2 is negative)
				172	nop # filler (completes the group of four)
				173
				174	stq $17, 40($5) # L :
				175	stq $17, 48($5) # L :
				176	subq $3, 16, $2 # E : Repeat the loop at least once more?
				177	nop # filler (completes the group of four)
				178
				179	stq $17, 56($5) # L : last of the eight quads stored per trip
				180	addq $5, 64, $5 # E : advance the destination by the 64 bytes just written
				181	subq $3, 8, $3 # E : eight quads fewer to go
				182	bge $2, $do_wh64_b # U : loop while at least 16 quads were left at the start of this trip
				183
				184	nop # filler (completes the group of four)
				185	nop # filler
				186	nop # filler
				187	beq $3, no_quad_b # U : Might have finished already
				188
				189	.align 4 # align the loop below to 2^4 = 16 bytes
				190	/*
				191	* Simple loop for trailing quadwords, or for small amounts
				192	* of data (where we can't use an unrolled loop and wh64)
				193	*/
				194	loop_b:
				195	stq $17,0($5) # L : store one quad of the fill pattern
				196	subq $3,1,$3 # E : Decrement number quads left
				197	addq $5,8,$5 # E : Inc address
				198	bne $3,loop_b # U : more?
				199
				200	no_quad_b:
				201	/*
				202	* Write 0..7 trailing bytes.
				203	*/
				204	nop # E :
				205	beq $18,end_b # U : All done?
				206	ldq $7,0($5) # L : fetch the final, partial quad
				207	mskqh $7,$6,$2 # U : Mask final quad (byte offset taken from the end address in $6)
				208
				209	insqh $17,$6,$4 # U : New bits
				210	bis $2,$4,$1 # E : Put it all together
				211	stq $1,0($5) # L : And back to memory
				212	ret $31,($26),1 # L0 : return through $26
				213
				214	within_quad_b: # whole write lies inside one quadword: read, merge, write back
				215	ldq_u $1,0($16) # L : fetch the quad that contains the target bytes
				216	insql $17,$16,$2 # U : New bits
				217	mskql $1,$16,$4 # U : Clear old
				218	bis $2,$4,$2 # E : New result
				219
				220	mskql $2,$6,$4 # U : merged value, masked using the end address in $6
				221	mskqh $1,$6,$2 # U : original quad, masked the other way using the end address in $6
				222	bis $2,$4,$1 # E : combine the two halves
				223	stq_u $1,0($16) # L : write the quad back
				224
				225	end_b:
				226	nop # filler (completes the group of four)
				227	nop # filler
				228	nop # filler
				229	ret $31,($26),1 # L0 : return through $26
				230	.end __memset
				231
				232	/*
				233	* __constant_c_memset: the original body of code, prior to replication
				234	* and rescheduling. It stores $17 as-is (no byte or word replication is done
				235	* here). Leave it here, as there may be calls to this entry point.
				236	*/
				237	.align 4 # align the entry point to 2^4 = 16 bytes
				238	.ent __constant_c_memset # In: $16 = dest, $17 = 64-bit store pattern, $18 = n (bytes). Out: $0 = dest
				239	__constant_c_memset:
				240	.frame $30,0,$26,0 # frame size 0; return address is in $26
				241	.prologue 0 # NOTE(review): presumably marks a prologue with no gp load - confirm
				242
				243	addq $18,$16,$6 # E : max address to write to (dest + n, one past the last byte)
				244	bis $16,$16,$0 # E : return value (dest)
				245	xor $16,$6,$1 # E : will complete write be within one quadword?
				246	ble $18,end # U : zero length requested? (signed test, so a negative count also returns)
				247
				248	bic $1,7,$1 # E : fit within a single quadword (zero if dest and dest + n differ only in the low 3 bits)
				249	beq $1,within_one_quad # U : yes: do one read-modify-write of that quadword
				250	and $16,7,$3 # E : Target addr misalignment
				251	beq $3,aligned # U : target is 0mod8
				252
				253	/*
				254	* Target address is misaligned, and won't fit within a quadword.
				255	*/
				256	ldq_u $4,0($16) # L : Fetch first partial
				257	bis $16,$16,$5 # E : Save the address
				258	insql $17,$16,$2 # U : Insert new bytes
				259	subq $3,8,$3 # E : Invert (for addressing uses): $3 = misalignment - 8
				260
				261	addq $18,$3,$18 # E : $18 is new count ($3 is negative)
				262	mskql $4,$16,$4 # U : clear relevant parts of the quad
				263	subq $16,$3,$16 # E : $16 is new aligned destination
				264	bis $2,$4,$1 # E : Final bytes
				265
				266	nop # filler (completes the group of four)
				267	stq_u $1,0($5) # L : Store result
				268	nop # filler
				269	nop # filler
				270
				271	.align 4 # align the label below to 2^4 = 16 bytes
				272	aligned:
				273	/*
				274	* We are now guaranteed to be quad aligned, with at least
				275	* one partial quad to write.
				276	*/
				277
				278	sra $18,3,$3 # U : Number of remaining quads to write
				279	and $18,7,$18 # E : Number of trailing bytes to write
				280	bis $16,$16,$5 # E : Save dest address
				281	beq $3,no_quad # U : tail stuff only
				282
				283	/*
				284	* it's worth the effort to unroll this and use wh64 if possible
				285	* Lifted a bunch of code from clear_user.S
				286	* At this point, entry values are:
				287	* $16 Current destination address
				288	* $5 A copy of $16
				289	* $6 The max quadword address to write to
				290	* $18 Number trailer bytes
				291	* $3 Number quads to write
				292	*/
				293
				294	and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
				295	subq $3, 16, $4 # E : Only try to unroll if > 128 bytes
				296	subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
				297	blt $4, loop # U : fewer than 16 quads: use the simple loop
				298
				299	/*
				300	* We know we've got at least 16 quads, minimum of one trip
				301	* through unrolled loop. Do a quad at a time to get us 0mod64
				302	* aligned.
				303	*/
				304
				305	nop # E :
				306	nop # E :
				307	nop # E :
				308	beq $1, $bigalign # U : NOTE(review): $1 is ($16 & 0x3f) - 0x40, i.e. -64..-1, so this looks never taken - confirm
				309
				310	$alignmod64:
				311	stq $17, 0($5) # L : store one quad of the pattern
				312	subq $3, 1, $3 # E : For consistency later
				313	addq $1, 8, $1 # E : Increment towards zero for alignment
				314	addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
				315
				316	nop # filler (completes the group of four)
				317	nop # filler
				318	addq $5, 8, $5 # E : Inc address
				319	blt $1, $alignmod64 # U : repeat until the bias counter reaches zero
				320
				321	$bigalign:
				322	/*
				323	* $3 - number quads left to go
				324	* $5 - target address (aligned 0mod64)
				325	* $17 - mask of stuff to store
				326	* Scratch registers available: $7, $2, $4, $1
				327	* we know that we'll be taking a minimum of one trip through the loop below
				328	* CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
				329	* Assumes the wh64 needs to be for 2 trips through the loop in the future
				330	* The wh64 is issued for the starting destination address for trip +2
				331	* through the loop, and if there are less than two trips left, the target
				332	* address will be for the current trip.
				333	*/
				334
				335	$do_wh64:
				336	wh64 ($4) # L1 : memory subsystem write hint
				337	subq $3, 24, $2 # E : For determining future wh64 addresses (negative if fewer than 24 quads are left)
				338	stq $17, 0($5) # L : first of the eight quads stored per trip
				339	nop # E :
				340
				341	addq $5, 128, $4 # E : speculative target of next wh64
				342	stq $17, 8($5) # L :
				343	stq $17, 16($5) # L :
				344	addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
				345
				346	stq $17, 24($5) # L :
				347	stq $17, 32($5) # L :
				348	cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle (take the fallback when $2 is negative)
				349	nop # filler (completes the group of four)
				350
				351	stq $17, 40($5) # L :
				352	stq $17, 48($5) # L :
				353	subq $3, 16, $2 # E : Repeat the loop at least once more?
				354	nop # filler (completes the group of four)
				355
				356	stq $17, 56($5) # L : last of the eight quads stored per trip
				357	addq $5, 64, $5 # E : advance the destination by the 64 bytes just written
				358	subq $3, 8, $3 # E : eight quads fewer to go
				359	bge $2, $do_wh64 # U : loop while at least 16 quads were left at the start of this trip
				360
				361	nop # filler (completes the group of four)
				362	nop # filler
				363	nop # filler
				364	beq $3, no_quad # U : Might have finished already
				365
				366	.align 4 # align the loop below to 2^4 = 16 bytes
				367	/*
				368	* Simple loop for trailing quadwords, or for small amounts
				369	* of data (where we can't use an unrolled loop and wh64)
				370	*/
				371	loop:
				372	stq $17,0($5) # L : store one quad of the pattern
				373	subq $3,1,$3 # E : Decrement number quads left
				374	addq $5,8,$5 # E : Inc address
				375	bne $3,loop # U : more?
				376
				377	no_quad:
				378	/*
				379	* Write 0..7 trailing bytes.
				380	*/
				381	nop # E :
				382	beq $18,end # U : All done?
				383	ldq $7,0($5) # L : fetch the final, partial quad
				384	mskqh $7,$6,$2 # U : Mask final quad (byte offset taken from the end address in $6)
				385
				386	insqh $17,$6,$4 # U : New bits
				387	bis $2,$4,$1 # E : Put it all together
				388	stq $1,0($5) # L : And back to memory
				389	ret $31,($26),1 # L0 : return through $26
				390
				391	within_one_quad: # whole write lies inside one quadword: read, merge, write back
				392	ldq_u $1,0($16) # L : fetch the quad that contains the target bytes
				393	insql $17,$16,$2 # U : New bits
				394	mskql $1,$16,$4 # U : Clear old
				395	bis $2,$4,$2 # E : New result
				396
				397	mskql $2,$6,$4 # U : merged value, masked using the end address in $6
				398	mskqh $1,$6,$2 # U : original quad, masked the other way using the end address in $6
				399	bis $2,$4,$1 # E : combine the two halves
				400	stq_u $1,0($16) # L : write the quad back
				401
				402	end:
				403	nop # filler (completes the group of four)
				404	nop # filler
				405	nop # filler
				406	ret $31,($26),1 # L0 : return through $26
				407	.end __constant_c_memset
				408
				409	/*
				410	* __memsetw: a replicant of the __constant_c_memset code, rescheduled
				411	* to mask stalls, preceded by replication of the low 16 bits of $17 into all four words of the store pattern. Note that the internal label names also had to change (_w suffix).
				412	*/
				413	.align 5 # align the entry point to 2^5 = 32 bytes
				414	.ent __memsetw # In: $16 = dest, $17 = 16-bit fill value, $18 = n (bytes). Out: $0 = dest
				415
				416	__memsetw:
				417	.frame $30,0,$26,0 # frame size 0; return address is in $26
				418	.prologue 0 # NOTE(review): presumably marks a prologue with no gp load - confirm
				419
				420	inswl $17,0,$5 # U : 000000000000c1c2
				421	inswl $17,2,$2 # U : 00000000c1c20000
				422	bis $16,$16,$0 # E : return value (dest)
				423	addq $18,$16,$6 # E : max address to write to (dest + n, one past the last byte)
				424
				425	ble $18, end_w # U : zero length requested? (signed test, so a negative count also returns)
				426	inswl $17,4,$3 # U : 0000c1c200000000
				427	inswl $17,6,$4 # U : c1c2000000000000
				428	xor $16,$6,$1 # E : will complete write be within one quadword?
				429
				430	or $2,$5,$2 # E : 00000000c1c2c1c2
				431	or $3,$4,$17 # E : c1c2c1c200000000
				432	bic $1,7,$1 # E : fit within a single quadword (zero if dest and dest + n differ only in the low 3 bits)
				433	and $16,7,$3 # E : Target addr misalignment
				434
				435	or $17,$2,$17 # E : c1c2c1c2c1c2c1c2
				436	beq $1,within_quad_w # U : yes: do one read-modify-write of that quadword
				437	nop # filler (completes the group of four)
				438	beq $3,aligned_w # U : target is 0mod8
				439
				440	/*
				441	* Target address is misaligned, and won't fit within a quadword.
				442	*/
				443	ldq_u $4,0($16) # L : Fetch first partial
				444	bis $16,$16,$5 # E : Save the address
				445	insql $17,$16,$2 # U : Insert new bytes
				446	subq $3,8,$3 # E : Invert (for addressing uses): $3 = misalignment - 8
				447
				448	addq $18,$3,$18 # E : $18 is new count ($3 is negative)
				449	mskql $4,$16,$4 # U : clear relevant parts of the quad
				450	subq $16,$3,$16 # E : $16 is new aligned destination
				451	bis $2,$4,$1 # E : Final bytes
				452
				453	nop # filler (completes the group of four)
				454	stq_u $1,0($5) # L : Store result
				455	nop # filler
				456	nop # filler
				457
				458	.align 4 # align the label below to 2^4 = 16 bytes
				459	aligned_w:
				460	/*
				461	* We are now guaranteed to be quad aligned, with at least
				462	* one partial quad to write.
				463	*/
				464
				465	sra $18,3,$3 # U : Number of remaining quads to write
				466	and $18,7,$18 # E : Number of trailing bytes to write
				467	bis $16,$16,$5 # E : Save dest address
				468	beq $3,no_quad_w # U : tail stuff only
				469
				470	/*
				471	* it's worth the effort to unroll this and use wh64 if possible
				472	* Lifted a bunch of code from clear_user.S
				473	* At this point, entry values are:
				474	* $16 Current destination address
				475	* $5 A copy of $16
				476	* $6 The max quadword address to write to
				477	* $18 Number trailer bytes
				478	* $3 Number quads to write
				479	*/
				480
				481	and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
				482	subq $3, 16, $4 # E : Only try to unroll if > 128 bytes
				483	subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
				484	blt $4, loop_w # U : fewer than 16 quads: use the simple loop
				485
				486	/*
				487	* We know we've got at least 16 quads, minimum of one trip
				488	* through unrolled loop. Do a quad at a time to get us 0mod64
				489	* aligned.
				490	*/
				491
				492	nop # E :
				493	nop # E :
				494	nop # E :
				495	beq $1, $bigalign_w # U : NOTE(review): $1 is ($16 & 0x3f) - 0x40, i.e. -64..-1, so this looks never taken - confirm
				496
				497	$alignmod64_w:
				498	stq $17, 0($5) # L : store one quad of the fill pattern
				499	subq $3, 1, $3 # E : For consistency later
				500	addq $1, 8, $1 # E : Increment towards zero for alignment
				501	addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
				502
				503	nop # filler (completes the group of four)
				504	nop # filler
				505	addq $5, 8, $5 # E : Inc address
				506	blt $1, $alignmod64_w # U : repeat until the bias counter reaches zero
				507
				508	$bigalign_w:
				509	/*
				510	* $3 - number quads left to go
				511	* $5 - target address (aligned 0mod64)
				512	* $17 - mask of stuff to store
				513	* Scratch registers available: $7, $2, $4, $1
				514	* we know that we'll be taking a minimum of one trip through the loop below
				515	* CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
				516	* Assumes the wh64 needs to be for 2 trips through the loop in the future
				517	* The wh64 is issued for the starting destination address for trip +2
				518	* through the loop, and if there are less than two trips left, the target
				519	* address will be for the current trip.
				520	*/
				521
				522	$do_wh64_w:
				523	wh64 ($4) # L1 : memory subsystem write hint
				524	subq $3, 24, $2 # E : For determining future wh64 addresses (negative if fewer than 24 quads are left)
				525	stq $17, 0($5) # L : first of the eight quads stored per trip
				526	nop # E :
				527
				528	addq $5, 128, $4 # E : speculative target of next wh64
				529	stq $17, 8($5) # L :
				530	stq $17, 16($5) # L :
				531	addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
				532
				533	stq $17, 24($5) # L :
				534	stq $17, 32($5) # L :
				535	cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle (take the fallback when $2 is negative)
				536	nop # filler (completes the group of four)
				537
				538	stq $17, 40($5) # L :
				539	stq $17, 48($5) # L :
				540	subq $3, 16, $2 # E : Repeat the loop at least once more?
				541	nop # filler (completes the group of four)
				542
				543	stq $17, 56($5) # L : last of the eight quads stored per trip
				544	addq $5, 64, $5 # E : advance the destination by the 64 bytes just written
				545	subq $3, 8, $3 # E : eight quads fewer to go
				546	bge $2, $do_wh64_w # U : loop while at least 16 quads were left at the start of this trip
				547
				548	nop # filler (completes the group of four)
				549	nop # filler
				550	nop # filler
				551	beq $3, no_quad_w # U : Might have finished already
				552
				553	.align 4 # align the loop below to 2^4 = 16 bytes
				554	/*
				555	* Simple loop for trailing quadwords, or for small amounts
				556	* of data (where we can't use an unrolled loop and wh64)
				557	*/
				558	loop_w:
				559	stq $17,0($5) # L : store one quad of the fill pattern
				560	subq $3,1,$3 # E : Decrement number quads left
				561	addq $5,8,$5 # E : Inc address
				562	bne $3,loop_w # U : more?
				563
				564	no_quad_w:
				565	/*
				566	* Write 0..7 trailing bytes.
				567	*/
				568	nop # E :
				569	beq $18,end_w # U : All done?
				570	ldq $7,0($5) # L : fetch the final, partial quad
				571	mskqh $7,$6,$2 # U : Mask final quad (byte offset taken from the end address in $6)
				572
				573	insqh $17,$6,$4 # U : New bits
				574	bis $2,$4,$1 # E : Put it all together
				575	stq $1,0($5) # L : And back to memory
				576	ret $31,($26),1 # L0 : return through $26
				577
				578	within_quad_w: # whole write lies inside one quadword: read, merge, write back
				579	ldq_u $1,0($16) # L : fetch the quad that contains the target bytes
				580	insql $17,$16,$2 # U : New bits
				581	mskql $1,$16,$4 # U : Clear old
				582	bis $2,$4,$2 # E : New result
				583
				584	mskql $2,$6,$4 # U : merged value, masked using the end address in $6
				585	mskqh $1,$6,$2 # U : original quad, masked the other way using the end address in $6
				586	bis $2,$4,$1 # E : combine the two halves
				587	stq_u $1,0($16) # L : write the quad back
				588
				589	end_w:
				590	nop # filler (completes the group of four)
				591	nop # filler
				592	nop # filler
				593	ret $31,($26),1 # L0 : return through $26
				594
				595	.end __memsetw
				596
				597	memset = __memset # memset is the same address as __memset (symbol assignment, no extra code)