Blame - arch/blackfin/lib/muldi3.S - kernel/msm-4.9

blob: abf9b2a515b29d4849d97336234ef24459574bf1 [file] [log] [blame]

Robin Getz	96f1050	2009-09-24 14:11:24 +0000	[diff] [blame]	1	/*
				2	* Copyright 2008 Analog Devices Inc.
				3	*
Sonic Zhang	de45083	2012-05-17 14:45:27 +0800	[diff] [blame]	4	* Licensed under the Clear BSD license or the GPL-2 (or later)
Robin Getz	96f1050	2009-09-24 14:11:24 +0000	[diff] [blame]	5	*/
				6
Bernd Schmidt	71ae92f	2009-01-07 23:14:39 +0800	[diff] [blame]	7	.align 2 /* NOTE(review): issued before the section switch below - confirm it takes effect in the section that finally holds ___muldi3 */
				8	.global ___muldi3; /* make the entry point visible to other objects */
				9	.type ___muldi3, STT_FUNC; /* mark the symbol as a function */
				10
				11	#ifdef CONFIG_ARITHMETIC_OPS_L1
				12	.section .l1.text /* option set: emit the routine into the .l1.text section */
				13	#else
				14	.text /* option not set: emit the routine into the ordinary text section */
				15	#endif /* CONFIG_ARITHMETIC_OPS_L1 */
				16
				17	/*
				18	R1:R0 * R3:R2
				19	= R1.h:R1.l:R0.h:R0.l * R3.h:R3.l:R2.h:R2.l
				20	[X] = (R1.h * R3.h) * 2^96
				21	[X] + (R1.h * R3.l + R1.l * R3.h) * 2^80
				22	[X] + (R1.h * R2.h + R1.l * R3.l + R3.h * R0.h) * 2^64
				23	[T1] + (R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h) * 2^48
				24	[T2] + (R1.l * R2.l + R3.l * R0.l + R0.h * R2.h) * 2^32
				25	[T3] + (R0.l * R2.h + R2.l * R0.h) * 2^16
				26	[T4] + (R0.l * R2.l)
				27
				28	We can discard the first three lines marked "X" since we produce
				29	only a 64-bit result. So, we need ten 16-bit multiplies.
				30
				31	Individual mul-acc results:
				32	[E1] = R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h
				33	[E2] = R1.l * R2.l + R3.l * R0.l + R0.h * R2.h
				34	[E3] = R0.l * R2.h + R2.l * R0.h
				35	[E4] = R0.l * R2.l
				36
				37	We also need to add high parts from lower-level results to higher ones:
				38	E[n]c = E[n] + (E[n+1]c >> 16), where E4c := E4
				39
				40	One interesting property is that all parts of the result that depend
				41	on the sign of the multiplication are discarded. Those would be the
				42	multiplications involving R1.h and R3.h, but only the top 16 bits of
				43	the 32-bit result depend on the sign, and since R1.h and R3.h only
				44	occur in E1, the top half of these results is cut off.
				45	So, we can just use FU mode for all of the 16-bit multiplies, and
				46	ignore questions of when to use mixed mode. */
				47
				48	___muldi3: /* R1:R0 = low 64 bits of (R1:R0 * R3:R2); the R3 word is read from [SP + 12], not from the incoming R3 */
				49	/* [SP] technically is part of the caller's frame, but we can
				50	use it as scratch space: R4 is parked there and reloaded just before RTS.
					R3, A0 and A1 are overwritten; R2 is only read.
					NOTE(review): the "\|\|" separators look like an escaped rendering of the
					parallel-issue operator - confirm against the raw file. */
				51	A0 = R2.H * R1.L, A1 = R2.L * R1.H (FU) \|\| R3 = [SP + 12]; /* E1: first two cross products; same instruction loads the high word of the second operand into R3 */
				52	A0 += R3.H * R0.L, A1 += R3.L * R0.H (FU) \|\| [SP] = R4; /* E1: remaining two cross products; same instruction stashes R4 in [SP] */
				53	A0 += A1; /* E1 = sum of all four products */
				54	R4 = A0.w; /* R4 = E1; only R4.l is consumed at the end */
				55	A0 = R0.l * R3.l (FU); /* E2: R0.l * R3.l */
				56	A0 += R2.l * R1.l (FU); /* E2: + R2.l * R1.l (third term is added below) */
				57
				58	A1 = R2.L * R0.L (FU); /* E4 = R0.l * R2.l */
				59	R3 = A1.w; /* R3 = E4; R3.l becomes result bits 15:0 */
				60	A1 = A1 >> 16; /* E3c: start from the carry-in E4 >> 16 */
				61	A0 += R2.H * R0.H, A1 += R2.L * R0.H (FU); /* E2 += R2.h * R0.h (E2 now complete); E3c += R2.l * R0.h */
				62	A1 += R0.L * R2.H (FU); /* E3c += R0.l * R2.h (E3c now complete) */
				63	R0 = A1.w; /* R0 = E3c; R0.l becomes result bits 31:16 */
				64	A1 = A1 >> 16; /* E2c: carry-in is E3c >> 16, shifted in the accumulator rather than in R0 */
				65	A0 += A1; /* E2c = E2 + (E3c >> 16) */
				66	R1 = A0.w; /* R1 = E2c, the high result word before E1 is folded in */
				67
				68	/* low(result) = low(E3c):low(E4) */
				69	R0 = PACK (R0.l, R3.l);
				70	/* high(result) = E2c + (E1 << 16) */
				71	R1.h = R1.h + R4.l (NS) \|\| R4 = [SP]; /* only E1's low half lands in bits 63:48; same instruction restores the caller's R4 */
				72	RTS;
				73
				74	.size ___muldi3, .-___muldi3