Blame - arch/alpha/lib/ev6-divide.S - kernel/msm-4.9

blob: 2a82b9be93fa290fdb6e859019db24839a55af18 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* arch/alpha/lib/ev6-divide.S
				3	*
				4	* 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
				5	*
				6	* Alpha division..
				7	*/
				8
				9	/*
				10	* The alpha chip doesn't provide hardware division, so we have to do it
				11	* by hand. The compiler expects the functions
				12	*
				13	* __divqu: 64-bit unsigned long divide
				14	* __remqu: 64-bit unsigned long remainder
				15	* __divq/__remq: signed 64-bit
				16	* __divlu/__remlu: unsigned 32-bit
				17	* __divl/__reml: signed 32-bit
				18	*
				19	* These are not normal C functions: instead of the normal
				20	* calling sequence, these expect their arguments in registers
				21	* $24 and $25, and return the result in $27. Register $28 may
				22	* be clobbered (assembly temporary), anything else must be saved.
				23	*
				24	* In short: painful.
				25	*
				26	* This is a rather simple bit-at-a-time algorithm: it's very good
				27	* at dividing random 64-bit numbers, but the more usual case where
				28	* the divisor is small is handled better by the DEC algorithm
				29	* using lookup tables. This uses much less memory, though, and is
				30	* nicer on the cache.. Besides, I don't know the copyright status
				31	* of the DEC code.
				32	*/
				33
				34	/*
				35	* My temporaries:
				36	* $0 - current bit
				37	* $1 - shifted divisor
				38	* $2 - modulus/quotient; $3, $4 - scratch (tmp1, tmp2; $4 only in the DIV build)
				39	*
				40	* $23 - return address
				41	* $24 - dividend
				42	* $25 - divisor
				43	*
				44	* $27 - quotient/modulus
				45	* $28 - compare status
				46	*
				47	* Much of the information about 21264 scheduling/coding comes from:
				48	* Compiler Writer's Guide for the Alpha 21264
				49	* abbreviated as 'CWG' in other comments here
				50	* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
				51	* Scheduling notation:
				52	* E - either cluster
				53	* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
				54	* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
				55	* Try not to change the actual algorithm if possible for consistency.
				56	*/
				57
				58	#define halt .long 0 /* emits a zero word; not referenced in the code shown here */
				59
				60	/*
				61	* Select function type and registers: with DIV defined this file builds the __div* routines, otherwise the __rem* routines.
				62	*/
				63	#define mask $0 /* current quotient bit */
				64	#define divisor $1 /* shifted divisor */
				65	#define compare $28 /* compare status */
				66	#define tmp1 $3 /* scratch: trial remainder; negated result in the signed path */
				67	#define tmp2 $4 /* scratch: trial quotient, used by the DIV build only */
				68
				69	#ifdef DIV
				70	#define DIV_ONLY(x,y...) x,##y /* emit the wrapped instruction (re-joins its comma-separated operands) */
				71	#define MOD_ONLY(x,y...) /* emit nothing */
				72	#define func(x) __div##x
				73	#define modulus $2 /* remainder lives in a saved temporary */
				74	#define quotient $27 /* quotient is accumulated in the result register */
				75	#define GETSIGN(x) xor $24,$25,x /* x is negative iff the operand signs differ */
				76	#define STACK 48 /* five 8-byte save slots are used, offsets 0..32 */
				77	#else
				78	#define DIV_ONLY(x,y...) /* emit nothing */
				79	#define MOD_ONLY(x,y...) x,##y /* emit the wrapped instruction */
				80	#define func(x) __rem##x
				81	#define modulus $27 /* remainder is accumulated in the result register */
				82	#define quotient $2 /* quotient lives in a saved temporary */
				83	#define GETSIGN(x) bis $24,$24,x /* x is a copy of the dividend: the result takes its sign */
				84	#define STACK 32 /* four 8-byte save slots are used, offsets 0..24 */
				85	#endif
				86
				87	/*
				88	* For 32-bit operations, we need to extend to 64-bit (INTSIZE selects the 32-bit entry points)
				89	*/
				90	#ifdef INTSIZE
				91	#define ufunction func(lu) /* __divlu or __remlu */
				92	#define sfunction func(l) /* __divl or __reml */
				93	#define LONGIFY(x) zapnot x,15,x /* keep bytes 0-3 only, i.e. zero-extend the low 32 bits */
				94	#define SLONGIFY(x) addl x,0,x /* sign-extend the low 32 bits */
				95	#else
				96	#define ufunction func(qu) /* __divqu or __remqu */
				97	#define sfunction func(q) /* __divq or __remq */
				98	#define LONGIFY(x) /* 64-bit operands need no extension */
				99	#define SLONGIFY(x) /* 64-bit operands need no extension */
				100	#endif
				101
				102	.set noat # $28, the assembly temporary, is used explicitly below
				103	.align 4
				104	.globl ufunction
				105	.ent ufunction # unsigned entry: $27 = $24 / $25 in the DIV build, $24 % $25 otherwise
				106	ufunction:
				107	subq $30,STACK,$30 # E : allocate STACK bytes of frame
				108	.frame $30,STACK,$23
				109	.prologue 0
				110
				111	7: stq $1, 0($30) # L : save $1 -- the signed entry branches here once its own frame is allocated
				112	bis $25,$25,divisor # E : working copy of the divisor
				113	stq $2, 8($30) # L : L U L U -- save $2
				114
				115	bis $24,$24,modulus # E : running remainder starts as the dividend
				116	stq $0,16($30) # L : save $0
				117	bis $31,$31,quotient # E : result accumulator starts at zero
				118	LONGIFY(divisor) # E : U L L U -- 32-bit build: zero-extend
				119
				120	stq tmp1,24($30) # L : save $3
				121	LONGIFY(modulus) # E : 32-bit build: zero-extend
				122	bis $31,1,mask # E : current bit starts at 1
				123	DIV_ONLY(stq tmp2,32($30)) # L : L U U L -- DIV build: save $4
				124
				125	beq divisor, 9f /* div by zero: skip to the restore code, $27 keeps its initial value (0 for DIV, the dividend otherwise) */
				126	/*
				127	* In spite of the DIV_ONLY being either a non-instruction
				128	* or an actual stq, the addition of the .align directive
				129	* below ensures that label 1 is going to be nicely aligned
				130	*/
				131
				132	.align 4
				133	#ifdef INTSIZE
				134	/*
				135	* shift divisor left, using 3-bit shifts for
				136	* 32-bit divides as we can't overflow. Three-bit
				137	* shifts will result in looping three times less
				138	* here, but can result in two loops more later.
				139	* Thus using a large shift isn't worth it (and
				140	* s8add pairs better than a sll..)
				141	*/
				142	1: cmpult divisor,modulus,compare # E : unsigned test taken before this round of shifting
				143	s8addq divisor,$31,divisor # E : times 8
				144	s8addq mask,$31,mask # E : times 8, keeps the bit in step
				145	bne compare,1b # U : U L U L -- repeat while the pre-shift value was below the remainder
				146	#else
				147	1: cmpult divisor,modulus,compare # E : unsigned test taken before this round of doubling
				148	nop # E :
				149	nop # E :
				150	blt divisor, 2f # U : U L U L -- top bit already set, doubling again would overflow
				151
				152	addq divisor,divisor,divisor # E : times 2
				153	addq mask,mask,mask # E : times 2, keeps the bit in step
				154	unop # E :
				155	bne compare,1b # U : U L U L -- repeat while the pre-doubling value was below the remainder
				156	#endif
				157
				158	/* ok, start to go right again.. */
				159	2:
				160	/*
				161	* Keep things nicely bundled... use a nop instead of not
				162	* having an instruction for DIV_ONLY
				163	*/
				164	#ifdef DIV
				165	DIV_ONLY(addq quotient,mask,tmp2) # E : candidate result with the current bit added
				166	#else
				167	nop # E :
				168	#endif
				169	srl mask,1,mask # U : step to the next lower bit
				170	cmpule divisor,modulus,compare # E : does the shifted value fit into the remainder
				171	subq modulus,divisor,tmp1 # E : remainder to use if it does fit
				172
				173	#ifdef DIV
				174	DIV_ONLY(cmovne compare,tmp2,quotient) # E : Latency 2, extra map slot -- commit the bit if it fit
				175	nop # E : as part of the cmovne
				176	srl divisor,1,divisor # U : halve for the next round
				177	nop # E : L U L U
				178
				179	nop # E :
				180	cmovne compare,tmp1,modulus # E : Latency 2, extra map slot -- commit the subtraction if it fit
				181	nop # E : as part of the cmovne
				182	bne mask,2b # U : U L U L -- done once the bit has been shifted out
				183	#else
				184	srl divisor,1,divisor # U : halve for the next round
				185	cmovne compare,tmp1,modulus # E : Latency 2, extra map slot -- commit the subtraction if it fit
				186	nop # E : as part of the cmovne
				187	bne mask,2b # U : U L L U -- done once the bit has been shifted out
				188	#endif
				189
				190	9: ldq $1, 0($30) # L : restore the saved temporaries, also the divide-by-zero exit
				191	ldq $2, 8($30) # L :
				192	nop # E :
				193	nop # E : U U L L
				194
				195	ldq $0,16($30) # L :
				196	ldq tmp1,24($30) # L :
				197	nop # E :
				198	nop # E :
				199
				200	#ifdef DIV
				201	DIV_ONLY(ldq tmp2,32($30)) # L : DIV build: restore $4
				202	#else
				203	nop # E :
				204	#endif
				205	addq $30,STACK,$30 # E : release the frame
				206	ret $31,($23),1 # L0 : L U U L -- return through $23
				207	.end ufunction
				208
				209	/*
				210	* Uhh.. Ugly signed division. I'd rather not have it at all, but
				211	* it's needed in some circumstances. There are different ways to
				212	* handle this, really. This does:
				213	* -a / b = a / -b = -(a / b)
				214	* -a % b = -(a % b)
				215	* a % -b = a % b
				216	* which is probably not the best solution, but at least should
				217	* have the property that (x/y)*y + (x%y) = x.
				218	*/
				219	.align 4
				220	.globl sfunction
				221	.ent sfunction # signed entry: runs the unsigned routine on the magnitudes, then fixes the sign of $27
				222	sfunction:
				223	subq $30,STACK,$30 # E : allocate STACK bytes of frame
				224	.frame $30,STACK,$23
				225	.prologue 0
				226	bis $24,$25,$28 # E : OR of the operands, negative if either one is
				227	SLONGIFY($28) # E : 32-bit build: sign-extend from bit 31
				228	bge $28,7b # U : neither is negative, continue in the unsigned body past its frame allocation
				229
				230	stq $24,0($30) # L : keep the original dividend
				231	subq $31,$24,$28 # E : $28 = -$24
				232	stq $25,8($30) # L : keep the original divisor
				233	nop # E : U L U L
				234
				235	cmovlt $24,$28,$24 /* abs($24) */ # E : Latency 2, extra map slot
				236	nop # E : as part of the cmov
				237	stq $23,16($30) # L : keep the return address, the bsr below overwrites $23
				238	subq $31,$25,$28 # E : U L U L -- $28 = -$25
				239
				240	stq tmp1,24($30) # L : save $3, used below for the negated result
				241	cmovlt $25,$28,$25 /* abs($25) */ # E : Latency 2, extra map slot
				242	nop # E :
				243	bsr $23,ufunction # L0: L U L U -- unsigned operation on the magnitudes, result in $27
				244
				245	ldq $24,0($30) # L : reload the original dividend
				246	ldq $25,8($30) # L : reload the original divisor
				247	GETSIGN($28) # E : $28 becomes the sign selector for the result
				248	subq $31,$27,tmp1 # E : U U L L -- negated result
				249
				250	SLONGIFY($28) # E : 32-bit build: sign-extend the selector
				251	ldq $23,16($30) # L : reload the return address
				252	cmovlt $28,tmp1,$27 # E : Latency 2, extra map slot -- take the negated result when the selector is negative
				253	nop # E : U L L U : as part of the cmov
				254
				255	ldq tmp1,24($30) # L : restore $3
				256	nop # E : as part of the cmov
				257	addq $30,STACK,$30 # E : release the frame
				258	ret $31,($23),1 # L0 : L U U L -- return through $23
				259	.end sfunction