Blame - arch/alpha/lib/ev6-memchr.S - kernel/msm-4.9

blob: a8e843dbcc23de767b8e9432966e12fa36454294 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* arch/alpha/lib/ev6-memchr.S
				3	*
				4	* 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
				5	*
				6	* Finds characters in a memory area. Optimized for the Alpha:
				7	*
				8	* - memory accessed as aligned quadwords only
				9	* - uses cmpbge to compare 8 bytes in parallel
				10	* - does binary search to find 0 byte in last
				11	* quadword (HAKMEM needed 12 instructions to
				12	* do this instead of the 9 instructions that
				13	* binary search needs).
				14	*
				15	* For correctness consider that:
				16	*
				17	* - only minimum number of quadwords may be accessed
				18	* - the third argument is an unsigned long
				19	*
				20	* Much of the information about 21264 scheduling/coding comes from:
				21	* Compiler Writer's Guide for the Alpha 21264
				22	* abbreviated as 'CWG' in other comments here
				23	* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
				24	* Scheduling notation:
				25	* E - either cluster
				26	* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
				27	* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
				28	* Try not to change the actual algorithm if possible for consistency.
				29	*/
				30
				31	.set noreorder
				32	.set noat
				33
				34	.align 4
				35	.globl memchr
				36	.ent memchr
				37	memchr:
				38	.frame $30,0,$26,0
				39	.prologue 0
				40
					/*
					 * Register usage, as established by the code below:
					 *   $16  address to start searching at (never written)
					 *   $17  byte to look for; reduced to its low 8 bits and then
					 *        replicated into all eight byte lanes
					 *   $18  length on entry; later reused as the address of the
					 *        last quadword to access and as the remaining byte count
					 *   $5   $16 + bounded length, i.e. one past the last byte
					 *   $0   base address of the quadword being tested; on return,
					 *        address of the first matching byte, or 0 if none
					 *   $1, $4  quadword data (current / prefetched)
					 *   $2   cmpbge result: bit i set when byte i matched
					 *   $3   all-ones constant until $found_it reuses it
					 *   $6, $7  scratch
					 * NOTE(review): $16/$17/$18 are presumably the first three
					 * argument registers of the Alpha calling convention - confirm.
					 */
				41	# Hack -- if someone passes in (size_t)-1, hoping to just
				42	# search til the end of the address space, we will overflow
				43	# below when we find the address of the last byte. Given
				44	# that we will never have a 56-bit address space, cropping
				45	# the length is the easiest way to avoid trouble.
				46	zap $18, 0x80, $5 # U : Bound length: $5 = $18 with its top byte cleared
				47	beq $18, $not_found # U : zero length -> return 0
				48	ldq_u $1, 0($16) # L : load first quadword (the aligned quad containing $16) Latency=3
				49	and $17, 0xff, $17 # E : L L U U : 00000000000000ch (keep only the low byte)
				50
				51	insbl $17, 1, $2 # U : 000000000000ch00
				52	cmpult $18, 9, $4 # E : $4 = 1 when length <= 8 (at most one quad of data)
				53	or $2, $17, $17 # E : 000000000000chch
				54	lda $3, -1($31) # E : U L L U : $3 = all ones
				55
				56	sll $17, 16, $2 # U : 00000000chch0000
				57	addq $16, $5, $5 # E : Max search address: $5 = one past the last byte to search
				58	or $2, $17, $17 # E : 00000000chchchch
				59	sll $17, 32, $2 # U : U L L U : chchchch00000000
				60
				61	or $2, $17, $17 # E : chchchchchchchch
				62	extql $1, $16, $7 # U : $7 is upper bits: bytes of the first quad from $16 up, moved to the low end
				63	beq $4, $first_quad # U : more than 8 bytes -> general path
				64	ldq_u $6, -1($5) # L : L U U L : eight or less bytes to search; load the quad holding the last byte Latency=3
				65
				66	extqh $6, $16, $6 # U : 2 cycle stall for $6; move its bytes toward the high end
				67	mov $16, $0 # E : $0 = base address used to form the result
				68	nop # E :
				69	or $7, $6, $1 # E : L U L U $1 = quadword starting at $16
				70
				71	# Deal with the case where at most 8 bytes remain to be searched
				72	# in $1. E.g.:
				73	# $18 = 6
				74	# $1 = ????c6c5c4c3c2c1
				75	$last_quad:
				76	negq $18, $6 # E : $6 = -$18, used as the shift amount for the mask
				77	xor $17, $1, $1 # E : matching bytes become zero
				78	srl $3, $6, $6 # U : $6 = mask of $18 bits set
				79	cmpbge $31, $1, $2 # E : L U L U : $2 bit i set when byte i of $1 is zero
				80
				81	nop
				82	nop
				83	and $2, $6, $2 # E : drop matches beyond the $18 valid bytes
				84	beq $2, $not_found # U : U L U L
				85
					/*
					 * Here $2 has one bit set per matching byte of the quadword
					 * whose base address is in $0. Return $0 plus the index of the
					 * lowest set bit.
					 */
				86	$found_it:
				87	#if defined(__alpha_fix__) && defined(__alpha_cix__)
				88	/*
				89	* Since we are guaranteed to have set one of the bits, we don't
				90	* have to worry about coming back with a 0x40 out of cttz...
				91	*/
				92	cttz $2, $3 # U0 : $3 = index of the lowest set bit
				93	addq $0, $3, $0 # E : All done
				94	nop # E :
				95	ret # L0 : L U L U
				96	#else
				97	/*
				98	* Slow and clunky. It can probably be improved.
				99	* An exercise left for others.
				100	*/
				101	negq $2, $3 # E :
				102	and $2, $3, $2 # E : isolate the lowest set bit of $2
				103	and $2, 0x0f, $1 # E : is that bit in positions 0-3?
				104	addq $0, 4, $3 # E :
				105
				106	cmoveq $1, $3, $0 # E : Latency 2, extra map cycle; if not, $0 += 4
				107	nop # E : keep with cmov
				108	and $2, 0x33, $1 # E : is the bit in positions 0, 1, 4 or 5?
				109	addq $0, 2, $3 # E : U L U L : 2 cycle stall on $0
				110
				111	cmoveq $1, $3, $0 # E : Latency 2, extra map cycle; if not, $0 += 2
				112	nop # E : keep with cmov
				113	and $2, 0x55, $1 # E : is the bit in an even position?
				114	addq $0, 1, $3 # E : U L U L : 2 cycle stall on $0
				115
				116	cmoveq $1, $3, $0 # E : Latency 2, extra map cycle; if not, $0 += 1
				117	nop
				118	nop
				119	ret # L0 : L U L U
				120	#endif
				121
				122	# Deal with the case where $18 > 8 bytes remain to be
				123	# searched. $16 may not be aligned.
				124	.align 4
				125	$first_quad:
				126	andnot $16, 0x7, $0 # E : $0 = aligned address of the first quad
				127	insqh $3, $16, $2 # U : $2 = 0000ffffffffffff ($16<0:2> ff)
				128	xor $1, $17, $1 # E : matching bytes become zero
				129	or $1, $2, $1 # E : U L U L $1 = ====ffffffffffff (bytes below $16 forced non-zero so they cannot match)
				130
				131	cmpbge $31, $1, $2 # E :
				132	bne $2, $found_it # U : match in the first quad
				133	# At least one byte left to process.
				134	ldq $1, 8($0) # L : load the second quad
				135	subq $5, 1, $18 # E : U L U L : $18 = address of the last byte
				136
				137	addq $0, 8, $0 # E : $0 = address of the quad now held in $1
				138	# Make $18 point to last quad to be accessed (the
				139	# last quad may or may not be partial).
				140	andnot $18, 0x7, $18 # E :
				141	cmpult $0, $18, $2 # E : are there quads after this one?
				142	beq $2, $final # U : U L U L : no, $1 already holds the last quad
				143
				144	# At least two quads remain to be accessed.
				145
				146	subq $18, $0, $4 # E : $4 <- byte distance from the current quad to the last one (8 * nr quads before the last)
				147	and $4, 8, $4 # E : odd number of quads?
				148	bne $4, $odd_quad_count # U : odd: enter the loop at its second half
				149	# At least three quads remain to be accessed
				150	mov $1, $4 # E : L U L U : move prefetched value to correct reg
				151
					/*
					 * The loop is unrolled twice: the first half tests $4 while
					 * loading the next quad into $1, the second half tests $1 while
					 * loading the next quad into $4. In both halves $0 is advanced
					 * only after the branch to $found_it, so $0 is the address of
					 * the quad that matched when that branch is taken.
					 */
				152	.align 4
				153	$unrolled_loop:
				154	ldq $1, 8($0) # L : prefetch $1
				155	xor $17, $4, $2 # E : test the quad in $4 (located at $0)
				156	cmpbge $31, $2, $2 # E :
				157	bne $2, $found_it # U : U L U L
				158
				159	addq $0, 8, $0 # E : advance to the quad now in $1
				160	nop # E :
				161	nop # E :
				162	nop # E :
				163
				164	$odd_quad_count:
				165	xor $17, $1, $2 # E : test the quad in $1 (located at $0)
				166	ldq $4, 8($0) # L : prefetch $4
				167	cmpbge $31, $2, $2 # E :
				168	addq $0, 8, $6 # E : $6 = address of the next quad
				169
				170	bne $2, $found_it # U :
				171	cmpult $6, $18, $6 # E : is the next quad still before the last one?
				172	addq $0, 8, $0 # E : advance to the quad now in $4
				173	nop # E :
				174
				175	bne $6, $unrolled_loop # U : yes, keep looping
				176	mov $4, $1 # E : move prefetched value into $1
				177	nop # E :
				178	nop # E :
				179
				180	$final: subq $5, $0, $18 # E : $18 <- number of bytes left to do
				181	nop # E :
				182	nop # E :
				183	bne $18, $last_quad # U : search them in $1 via the short path
				184
				185	$not_found:
				186	mov $31, $0 # E : return 0 (no match)
				187	nop # E :
				188	nop # E :
				189	ret # L0 :
				190
				191	.end memchr