Blame - arch/m68k/math-emu/multi_arith.h - kernel/msm

blob: 4ad0ca918e2e3e08426c5dd012adec1c29781b9e [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/* multi_arith.h: multi-precision integer arithmetic functions, needed
				2	to do extended-precision floating point.
				3
				4	(c) 1998 David Huggins-Daines.
				5
				6	Somewhat based on arch/alpha/math-emu/ieee-math.c, which is (c)
				7	David Mosberger-Tang.
				8
				9	You may copy, modify, and redistribute this file under the terms of
				10	the GNU General Public License, version 2, or any later version, at
				11	your convenience. */
				12
				13	/* Note:
				14
				15	These are not general multi-precision math routines. Rather, they
				16	implement the subset of integer arithmetic that we need in order to
				17	multiply, divide, and normalize 128-bit unsigned mantissae. */
				18
				19	#ifndef MULTI_ARITH_H
				20	#define MULTI_ARITH_H
				21
				22	#if 0 /* old code... */
				23
				24	/* Unsigned only, because we don't need signs to multiply and divide. */
				25	typedef unsigned int int128[4];
				26
				27	/* Word order */
				28	enum {
				29	MSW128,
				30	NMSW128,
				31	NLSW128,
				32	LSW128
				33	};
				34
				35	/* big-endian */
				36	#define LO_WORD(ll) (((unsigned int *) &ll)[1])
				37	#define HI_WORD(ll) (((unsigned int *) &ll)[0])
				38
				39	/* Convenience functions to stuff various integer values into int128s */
				40
				41	static inline void zero128(int128 a)
				42	{
				43	a[LSW128] = a[NLSW128] = a[NMSW128] = a[MSW128] = 0;
				44	}
				45
				46	/* Human-readable word order in the arguments */
				47	static inline void set128(unsigned int i3, unsigned int i2, unsigned int i1,
				48	unsigned int i0, int128 a)
				49	{
				50	a[LSW128] = i0;
				51	a[NLSW128] = i1;
				52	a[NMSW128] = i2;
				53	a[MSW128] = i3;
				54	}
				55
				56	/* Convenience functions (for testing as well) */
				57	static inline void int64_to_128(unsigned long long src, int128 dest)
				58	{
				59	dest[LSW128] = (unsigned int) src;
				60	dest[NLSW128] = src >> 32;
				61	dest[NMSW128] = dest[MSW128] = 0;
				62	}
				63
				64	static inline void int128_to_64(const int128 src, unsigned long long *dest)
				65	{
				66	*dest = src[LSW128] \| (long long) src[NLSW128] << 32;
				67	}
				68
				69	static inline void put_i128(const int128 a)
				70	{
				71	printk("%08x %08x %08x %08x\n", a[MSW128], a[NMSW128],
				72	a[NLSW128], a[LSW128]);
				73	}
				74
				75	/* Internal shifters:
				76
				77	Note that these are only good for 0 < count < 32.
				78	*/
				79
				80	static inline void _lsl128(unsigned int count, int128 a)
				81	{
				82	a[MSW128] = (a[MSW128] << count) \| (a[NMSW128] >> (32 - count));
				83	a[NMSW128] = (a[NMSW128] << count) \| (a[NLSW128] >> (32 - count));
				84	a[NLSW128] = (a[NLSW128] << count) \| (a[LSW128] >> (32 - count));
				85	a[LSW128] <<= count;
				86	}
				87
				88	static inline void _lsr128(unsigned int count, int128 a)
				89	{
				90	a[LSW128] = (a[LSW128] >> count) \| (a[NLSW128] << (32 - count));
				91	a[NLSW128] = (a[NLSW128] >> count) \| (a[NMSW128] << (32 - count));
				92	a[NMSW128] = (a[NMSW128] >> count) \| (a[MSW128] << (32 - count));
				93	a[MSW128] >>= count;
				94	}
				95
				96	/* Should be faster, one would hope */
				97
				98	static inline void lslone128(int128 a)
				99	{
				100	asm volatile ("lsl.l #1,%0\n"
				101	"roxl.l #1,%1\n"
				102	"roxl.l #1,%2\n"
				103	"roxl.l #1,%3\n"
				104	:
				105	"=d" (a[LSW128]),
				106	"=d"(a[NLSW128]),
				107	"=d"(a[NMSW128]),
				108	"=d"(a[MSW128])
				109	:
				110	"0"(a[LSW128]),
				111	"1"(a[NLSW128]),
				112	"2"(a[NMSW128]),
				113	"3"(a[MSW128]));
				114	}
				115
				116	static inline void lsrone128(int128 a)
				117	{
				118	asm volatile ("lsr.l #1,%0\n"
				119	"roxr.l #1,%1\n"
				120	"roxr.l #1,%2\n"
				121	"roxr.l #1,%3\n"
				122	:
				123	"=d" (a[MSW128]),
				124	"=d"(a[NMSW128]),
				125	"=d"(a[NLSW128]),
				126	"=d"(a[LSW128])
				127	:
				128	"0"(a[MSW128]),
				129	"1"(a[NMSW128]),
				130	"2"(a[NLSW128]),
				131	"3"(a[LSW128]));
				132	}
				133
				134	/* Generalized 128-bit shifters:
				135
				136	These bit-shift to a multiple of 32, then move whole longwords. */
				137
				138	static inline void lsl128(unsigned int count, int128 a)
				139	{
				140	int wordcount, i;
				141
				142	if (count % 32)
				143	_lsl128(count % 32, a);
				144
				145	if (0 == (wordcount = count / 32))
				146	return;
				147
				148	/* argh, gak, endian-sensitive */
				149	for (i = 0; i < 4 - wordcount; i++) {
				150	a[i] = a[i + wordcount];
				151	}
				152	for (i = 3; i >= 4 - wordcount; --i) {
				153	a[i] = 0;
				154	}
				155	}
				156
				157	static inline void lsr128(unsigned int count, int128 a)
				158	{
				159	int wordcount, i;
				160
				161	if (count % 32)
				162	_lsr128(count % 32, a);
				163
				164	if (0 == (wordcount = count / 32))
				165	return;
				166
				167	for (i = 3; i >= wordcount; --i) {
				168	a[i] = a[i - wordcount];
				169	}
				170	for (i = 0; i < wordcount; i++) {
				171	a[i] = 0;
				172	}
				173	}
				174
				175	static inline int orl128(int a, int128 b)
				176	{
				177	b[LSW128] \|= a;
				178	}
				179
				180	static inline int btsthi128(const int128 a)
				181	{
				182	return a[MSW128] & 0x80000000;
				183	}
				184
				185	/* test bits (numbered from 0 = LSB) up to and including "top" */
				186	static inline int bftestlo128(int top, const int128 a)
				187	{
				188	int r = 0;
				189
				190	if (top > 31)
				191	r \|= a[LSW128];
				192	if (top > 63)
				193	r \|= a[NLSW128];
				194	if (top > 95)
				195	r \|= a[NMSW128];
				196
				197	r \|= a[3 - (top / 32)] & ((1 << (top % 32 + 1)) - 1);
				198
				199	return (r != 0);
				200	}
				201
				202	/* Aargh. We need these because GCC is broken */
				203	/* FIXME: do them in assembly, for goodness' sake! */
				204	static inline void mask64(int pos, unsigned long long *mask)
				205	{
				206	*mask = 0;
				207
				208	if (pos < 32) {
				209	LO_WORD(*mask) = (1 << pos) - 1;
				210	return;
				211	}
				212	LO_WORD(*mask) = -1;
				213	HI_WORD(*mask) = (1 << (pos - 32)) - 1;
				214	}
				215
				216	static inline void bset64(int pos, unsigned long long *dest)
				217	{
				218	/* This conditional will be optimized away. Thanks, GCC! */
				219	if (pos < 32)
				220	asm volatile ("bset %1,%0":"=m"
				221	(LO_WORD(*dest)):"id"(pos));
				222	else
				223	asm volatile ("bset %1,%0":"=m"
				224	(HI_WORD(*dest)):"id"(pos - 32));
				225	}
				226
				227	static inline int btst64(int pos, unsigned long long dest)
				228	{
				229	if (pos < 32)
				230	return (0 != (LO_WORD(dest) & (1 << pos)));
				231	else
				232	return (0 != (HI_WORD(dest) & (1 << (pos - 32))));
				233	}
				234
				235	static inline void lsl64(int count, unsigned long long *dest)
				236	{
				237	if (count < 32) {
				238	HI_WORD(dest) = (HI_WORD(dest) << count)
				239	\| (LO_WORD(*dest) >> count);
				240	LO_WORD(*dest) <<= count;
				241	return;
				242	}
				243	count -= 32;
				244	HI_WORD(dest) = LO_WORD(dest) << count;
				245	LO_WORD(*dest) = 0;
				246	}
				247
				248	static inline void lsr64(int count, unsigned long long *dest)
				249	{
				250	if (count < 32) {
				251	LO_WORD(dest) = (LO_WORD(dest) >> count)
				252	\| (HI_WORD(*dest) << (32 - count));
				253	HI_WORD(*dest) >>= count;
				254	return;
				255	}
				256	count -= 32;
				257	LO_WORD(dest) = HI_WORD(dest) >> count;
				258	HI_WORD(*dest) = 0;
				259	}
				260	#endif
				261
				262	static inline void fp_denormalize(struct fp_ext *reg, unsigned int cnt)
				263	{
				264	reg->exp += cnt;
				265
				266	switch (cnt) {
				267	case 0 ... 8:
				268	reg->lowmant = reg->mant.m32[1] << (8 - cnt);
				269	reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) \|
				270	(reg->mant.m32[0] << (32 - cnt));
				271	reg->mant.m32[0] = reg->mant.m32[0] >> cnt;
				272	break;
				273	case 9 ... 32:
				274	reg->lowmant = reg->mant.m32[1] >> (cnt - 8);
				275	if (reg->mant.m32[1] << (40 - cnt))
				276	reg->lowmant \|= 1;
				277	reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) \|
				278	(reg->mant.m32[0] << (32 - cnt));
				279	reg->mant.m32[0] = reg->mant.m32[0] >> cnt;
				280	break;
				281	case 33 ... 39:
				282	asm volatile ("bfextu %1{%2,#8},%0" : "=d" (reg->lowmant)
				283	: "m" (reg->mant.m32[0]), "d" (64 - cnt));
				284	if (reg->mant.m32[1] << (40 - cnt))
				285	reg->lowmant \|= 1;
				286	reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32);
				287	reg->mant.m32[0] = 0;
				288	break;
				289	case 40 ... 71:
				290	reg->lowmant = reg->mant.m32[0] >> (cnt - 40);
				291	if ((reg->mant.m32[0] << (72 - cnt)) \|\| reg->mant.m32[1])
				292	reg->lowmant \|= 1;
				293	reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32);
				294	reg->mant.m32[0] = 0;
				295	break;
				296	default:
				297	reg->lowmant = reg->mant.m32[0] \|\| reg->mant.m32[1];
				298	reg->mant.m32[0] = 0;
				299	reg->mant.m32[1] = 0;
				300	break;
				301	}
				302	}
				303
				304	static inline int fp_overnormalize(struct fp_ext *reg)
				305	{
				306	int shift;
				307
				308	if (reg->mant.m32[0]) {
				309	asm ("bfffo %1{#0,#32},%0" : "=d" (shift) : "dm" (reg->mant.m32[0]));
				310	reg->mant.m32[0] = (reg->mant.m32[0] << shift) \| (reg->mant.m32[1] >> (32 - shift));
				311	reg->mant.m32[1] = (reg->mant.m32[1] << shift);
				312	} else {
				313	asm ("bfffo %1{#0,#32},%0" : "=d" (shift) : "dm" (reg->mant.m32[1]));
				314	reg->mant.m32[0] = (reg->mant.m32[1] << shift);
				315	reg->mant.m32[1] = 0;
				316	shift += 32;
				317	}
				318
				319	return shift;
				320	}
				321
				322	static inline int fp_addmant(struct fp_ext dest, struct fp_ext src)
				323	{
				324	int carry;
				325
				326	/* we assume here, gcc only insert move and a clr instr */
				327	asm volatile ("add.b %1,%0" : "=d,g" (dest->lowmant)
				328	: "g,d" (src->lowmant), "0,0" (dest->lowmant));
				329	asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[1])
				330	: "d" (src->mant.m32[1]), "0" (dest->mant.m32[1]));
				331	asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[0])
				332	: "d" (src->mant.m32[0]), "0" (dest->mant.m32[0]));
				333	asm volatile ("addx.l %0,%0" : "=d" (carry) : "0" (0));
				334
				335	return carry;
				336	}
				337
				338	static inline int fp_addcarry(struct fp_ext *reg)
				339	{
				340	if (++reg->exp == 0x7fff) {
				341	if (reg->mant.m64)
				342	fp_set_sr(FPSR_EXC_INEX2);
				343	reg->mant.m64 = 0;
				344	fp_set_sr(FPSR_EXC_OVFL);
				345	return 0;
				346	}
				347	reg->lowmant = (reg->mant.m32[1] << 7) \| (reg->lowmant ? 1 : 0);
				348	reg->mant.m32[1] = (reg->mant.m32[1] >> 1) \|
				349	(reg->mant.m32[0] << 31);
				350	reg->mant.m32[0] = (reg->mant.m32[0] >> 1) \| 0x80000000;
				351
				352	return 1;
				353	}
				354
				355	static inline void fp_submant(struct fp_ext dest, struct fp_ext src1,
				356	struct fp_ext *src2)
				357	{
				358	/* we assume here, gcc only insert move and a clr instr */
				359	asm volatile ("sub.b %1,%0" : "=d,g" (dest->lowmant)
				360	: "g,d" (src2->lowmant), "0,0" (src1->lowmant));
				361	asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[1])
				362	: "d" (src2->mant.m32[1]), "0" (src1->mant.m32[1]));
				363	asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[0])
				364	: "d" (src2->mant.m32[0]), "0" (src1->mant.m32[0]));
				365	}
				366
				367	#define fp_mul64(desth, destl, src1, src2) ({ \
				368	asm ("mulu.l %2,%1:%0" : "=d" (destl), "=d" (desth) \
Al Viro	7f26333	2006-01-12 01:06:19 -0800	[diff] [blame]	369	: "dm" (src1), "0" (src2)); \
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	370	})
				371	#define fp_div64(quot, rem, srch, srcl, div) \
				372	asm ("divu.l %2,%1:%0" : "=d" (quot), "=d" (rem) \
				373	: "dm" (div), "1" (srch), "0" (srcl))
				374	#define fp_add64(dest1, dest2, src1, src2) ({ \
				375	asm ("add.l %1,%0" : "=d,dm" (dest2) \
				376	: "dm,d" (src2), "0,0" (dest2)); \
				377	asm ("addx.l %1,%0" : "=d" (dest1) \
				378	: "d" (src1), "0" (dest1)); \
				379	})
				380	#define fp_addx96(dest, src) ({ \
				381	/* we assume here, gcc only insert move and a clr instr */ \
				382	asm volatile ("add.l %1,%0" : "=d,g" (dest->m32[2]) \
				383	: "g,d" (temp.m32[1]), "0,0" (dest->m32[2])); \
				384	asm volatile ("addx.l %1,%0" : "=d" (dest->m32[1]) \
				385	: "d" (temp.m32[0]), "0" (dest->m32[1])); \
				386	asm volatile ("addx.l %1,%0" : "=d" (dest->m32[0]) \
				387	: "d" (0), "0" (dest->m32[0])); \
				388	})
				389	#define fp_sub64(dest, src) ({ \
				390	asm ("sub.l %1,%0" : "=d,dm" (dest.m32[1]) \
				391	: "dm,d" (src.m32[1]), "0,0" (dest.m32[1])); \
				392	asm ("subx.l %1,%0" : "=d" (dest.m32[0]) \
				393	: "d" (src.m32[0]), "0" (dest.m32[0])); \
				394	})
				395	#define fp_sub96c(dest, srch, srcm, srcl) ({ \
				396	char carry; \
				397	asm ("sub.l %1,%0" : "=d,dm" (dest.m32[2]) \
				398	: "dm,d" (srcl), "0,0" (dest.m32[2])); \
				399	asm ("subx.l %1,%0" : "=d" (dest.m32[1]) \
				400	: "d" (srcm), "0" (dest.m32[1])); \
				401	asm ("subx.l %2,%1; scs %0" : "=d" (carry), "=d" (dest.m32[0]) \
				402	: "d" (srch), "1" (dest.m32[0])); \
				403	carry; \
				404	})
				405
				406	static inline void fp_multiplymant(union fp_mant128 dest, struct fp_ext src1,
				407	struct fp_ext *src2)
				408	{
				409	union fp_mant64 temp;
				410
				411	fp_mul64(dest->m32[0], dest->m32[1], src1->mant.m32[0], src2->mant.m32[0]);
				412	fp_mul64(dest->m32[2], dest->m32[3], src1->mant.m32[1], src2->mant.m32[1]);
				413
				414	fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[0], src2->mant.m32[1]);
				415	fp_addx96(dest, temp);
				416
				417	fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[1], src2->mant.m32[0]);
				418	fp_addx96(dest, temp);
				419	}
				420
				421	static inline void fp_dividemant(union fp_mant128 dest, struct fp_ext src,
				422	struct fp_ext *div)
				423	{
				424	union fp_mant128 tmp;
				425	union fp_mant64 tmp64;
				426	unsigned long *mantp = dest->m32;
				427	unsigned long fix, rem, first, dummy;
				428	int i;
				429
				430	/* the algorithm below requires dest to be smaller than div,
				431	but both have the high bit set */
				432	if (src->mant.m64 >= div->mant.m64) {
				433	fp_sub64(src->mant, div->mant);
				434	*mantp = 1;
				435	} else
				436	*mantp = 0;
				437	mantp++;
				438
				439	/* basic idea behind this algorithm: we can't divide two 64bit numbers
				440	(AB/CD) directly, but we can calculate AB/C0, but this means this
				441	quotient is off by C0/CD, so we have to multiply the first result
				442	to fix the result, after that we have nearly the correct result
				443	and only a few corrections are needed. */
				444
				445	/* C0/CD can be precalculated, but it's an 64bit division again, but
				446	we can make it a bit easier, by dividing first through C so we get
				447	10/1D and now only a single shift and the value fits into 32bit. */
				448	fix = 0x80000000;
				449	dummy = div->mant.m32[1] / div->mant.m32[0] + 1;
				450	dummy = (dummy >> 1) \| fix;
				451	fp_div64(fix, dummy, fix, 0, dummy);
				452	fix--;
				453
				454	for (i = 0; i < 3; i++, mantp++) {
				455	if (src->mant.m32[0] == div->mant.m32[0]) {
				456	fp_div64(first, rem, 0, src->mant.m32[1], div->mant.m32[0]);
				457
				458	fp_mul64(*mantp, dummy, first, fix);
				459	*mantp += fix;
				460	} else {
				461	fp_div64(first, rem, src->mant.m32[0], src->mant.m32[1], div->mant.m32[0]);
				462
				463	fp_mul64(*mantp, dummy, first, fix);
				464	}
				465
				466	fp_mul64(tmp.m32[0], tmp.m32[1], div->mant.m32[0], first - *mantp);
				467	fp_add64(tmp.m32[0], tmp.m32[1], 0, rem);
				468	tmp.m32[2] = 0;
				469
				470	fp_mul64(tmp64.m32[0], tmp64.m32[1], *mantp, div->mant.m32[1]);
				471	fp_sub96c(tmp, 0, tmp64.m32[0], tmp64.m32[1]);
				472
				473	src->mant.m32[0] = tmp.m32[1];
				474	src->mant.m32[1] = tmp.m32[2];
				475
				476	while (!fp_sub96c(tmp, 0, div->mant.m32[0], div->mant.m32[1])) {
				477	src->mant.m32[0] = tmp.m32[1];
				478	src->mant.m32[1] = tmp.m32[2];
				479	*mantp += 1;
				480	}
				481	}
				482	}
				483
				484	#if 0
				485	static inline unsigned int fp_fls128(union fp_mant128 *src)
				486	{
				487	unsigned long data;
				488	unsigned int res, off;
				489
				490	if ((data = src->m32[0]))
				491	off = 0;
				492	else if ((data = src->m32[1]))
				493	off = 32;
				494	else if ((data = src->m32[2]))
				495	off = 64;
				496	else if ((data = src->m32[3]))
				497	off = 96;
				498	else
				499	return 128;
				500
				501	asm ("bfffo %1{#0,#32},%0" : "=d" (res) : "dm" (data));
				502	return res + off;
				503	}
				504
				505	static inline void fp_shiftmant128(union fp_mant128 *src, int shift)
				506	{
				507	unsigned long sticky;
				508
				509	switch (shift) {
				510	case 0:
				511	return;
				512	case 1:
				513	asm volatile ("lsl.l #1,%0"
				514	: "=d" (src->m32[3]) : "0" (src->m32[3]));
				515	asm volatile ("roxl.l #1,%0"
				516	: "=d" (src->m32[2]) : "0" (src->m32[2]));
				517	asm volatile ("roxl.l #1,%0"
				518	: "=d" (src->m32[1]) : "0" (src->m32[1]));
				519	asm volatile ("roxl.l #1,%0"
				520	: "=d" (src->m32[0]) : "0" (src->m32[0]));
				521	return;
				522	case 2 ... 31:
				523	src->m32[0] = (src->m32[0] << shift) \| (src->m32[1] >> (32 - shift));
				524	src->m32[1] = (src->m32[1] << shift) \| (src->m32[2] >> (32 - shift));
				525	src->m32[2] = (src->m32[2] << shift) \| (src->m32[3] >> (32 - shift));
				526	src->m32[3] = (src->m32[3] << shift);
				527	return;
				528	case 32 ... 63:
				529	shift -= 32;
				530	src->m32[0] = (src->m32[1] << shift) \| (src->m32[2] >> (32 - shift));
				531	src->m32[1] = (src->m32[2] << shift) \| (src->m32[3] >> (32 - shift));
				532	src->m32[2] = (src->m32[3] << shift);
				533	src->m32[3] = 0;
				534	return;
				535	case 64 ... 95:
				536	shift -= 64;
				537	src->m32[0] = (src->m32[2] << shift) \| (src->m32[3] >> (32 - shift));
				538	src->m32[1] = (src->m32[3] << shift);
				539	src->m32[2] = src->m32[3] = 0;
				540	return;
				541	case 96 ... 127:
				542	shift -= 96;
				543	src->m32[0] = (src->m32[3] << shift);
				544	src->m32[1] = src->m32[2] = src->m32[3] = 0;
				545	return;
				546	case -31 ... -1:
				547	shift = -shift;
				548	sticky = 0;
				549	if (src->m32[3] << (32 - shift))
				550	sticky = 1;
				551	src->m32[3] = (src->m32[3] >> shift) \| (src->m32[2] << (32 - shift)) \| sticky;
				552	src->m32[2] = (src->m32[2] >> shift) \| (src->m32[1] << (32 - shift));
				553	src->m32[1] = (src->m32[1] >> shift) \| (src->m32[0] << (32 - shift));
				554	src->m32[0] = (src->m32[0] >> shift);
				555	return;
				556	case -63 ... -32:
				557	shift = -shift - 32;
				558	sticky = 0;
				559	if ((src->m32[2] << (32 - shift)) \|\| src->m32[3])
				560	sticky = 1;
				561	src->m32[3] = (src->m32[2] >> shift) \| (src->m32[1] << (32 - shift)) \| sticky;
				562	src->m32[2] = (src->m32[1] >> shift) \| (src->m32[0] << (32 - shift));
				563	src->m32[1] = (src->m32[0] >> shift);
				564	src->m32[0] = 0;
				565	return;
				566	case -95 ... -64:
				567	shift = -shift - 64;
				568	sticky = 0;
				569	if ((src->m32[1] << (32 - shift)) \|\| src->m32[2] \|\| src->m32[3])
				570	sticky = 1;
				571	src->m32[3] = (src->m32[1] >> shift) \| (src->m32[0] << (32 - shift)) \| sticky;
				572	src->m32[2] = (src->m32[0] >> shift);
				573	src->m32[1] = src->m32[0] = 0;
				574	return;
				575	case -127 ... -96:
				576	shift = -shift - 96;
				577	sticky = 0;
				578	if ((src->m32[0] << (32 - shift)) \|\| src->m32[1] \|\| src->m32[2] \|\| src->m32[3])
				579	sticky = 1;
				580	src->m32[3] = (src->m32[0] >> shift) \| sticky;
				581	src->m32[2] = src->m32[1] = src->m32[0] = 0;
				582	return;
				583	}
				584
				585	if (shift < 0 && (src->m32[0] \|\| src->m32[1] \|\| src->m32[2] \|\| src->m32[3]))
				586	src->m32[3] = 1;
				587	else
				588	src->m32[3] = 0;
				589	src->m32[2] = 0;
				590	src->m32[1] = 0;
				591	src->m32[0] = 0;
				592	}
				593	#endif
				594
				595	static inline void fp_putmant128(struct fp_ext dest, union fp_mant128 src,
				596	int shift)
				597	{
				598	unsigned long tmp;
				599
				600	switch (shift) {
				601	case 0:
				602	dest->mant.m64 = src->m64[0];
				603	dest->lowmant = src->m32[2] >> 24;
				604	if (src->m32[3] \|\| (src->m32[2] << 8))
				605	dest->lowmant \|= 1;
				606	break;
				607	case 1:
				608	asm volatile ("lsl.l #1,%0"
				609	: "=d" (tmp) : "0" (src->m32[2]));
				610	asm volatile ("roxl.l #1,%0"
				611	: "=d" (dest->mant.m32[1]) : "0" (src->m32[1]));
				612	asm volatile ("roxl.l #1,%0"
				613	: "=d" (dest->mant.m32[0]) : "0" (src->m32[0]));
				614	dest->lowmant = tmp >> 24;
				615	if (src->m32[3] \|\| (tmp << 8))
				616	dest->lowmant \|= 1;
				617	break;
				618	case 31:
				619	asm volatile ("lsr.l #1,%1; roxr.l #1,%0"
				620	: "=d" (dest->mant.m32[0])
				621	: "d" (src->m32[0]), "0" (src->m32[1]));
				622	asm volatile ("roxr.l #1,%0"
				623	: "=d" (dest->mant.m32[1]) : "0" (src->m32[2]));
				624	asm volatile ("roxr.l #1,%0"
				625	: "=d" (tmp) : "0" (src->m32[3]));
				626	dest->lowmant = tmp >> 24;
				627	if (src->m32[3] << 7)
				628	dest->lowmant \|= 1;
				629	break;
				630	case 32:
				631	dest->mant.m32[0] = src->m32[1];
				632	dest->mant.m32[1] = src->m32[2];
				633	dest->lowmant = src->m32[3] >> 24;
				634	if (src->m32[3] << 8)
				635	dest->lowmant \|= 1;
				636	break;
				637	}
				638	}
				639
				640	#if 0 /* old code... */
				641	static inline int fls(unsigned int a)
				642	{
				643	int r;
				644
				645	asm volatile ("bfffo %1{#0,#32},%0"
				646	: "=d" (r) : "md" (a));
				647	return r;
				648	}
				649
				650	/* fls = "find last set" (cf. ffs(3)) */
				651	static inline int fls128(const int128 a)
				652	{
				653	if (a[MSW128])
				654	return fls(a[MSW128]);
				655	if (a[NMSW128])
				656	return fls(a[NMSW128]) + 32;
				657	/* XXX: it probably never gets beyond this point in actual
				658	use, but that's indicative of a more general problem in the
				659	algorithm (i.e. as per the actual 68881 implementation, we
				660	really only need at most 67 bits of precision [plus
				661	overflow]) so I'm not going to fix it. */
				662	if (a[NLSW128])
				663	return fls(a[NLSW128]) + 64;
				664	if (a[LSW128])
				665	return fls(a[LSW128]) + 96;
				666	else
				667	return -1;
				668	}
				669
				670	static inline int zerop128(const int128 a)
				671	{
				672	return !(a[LSW128] \| a[NLSW128] \| a[NMSW128] \| a[MSW128]);
				673	}
				674
				675	static inline int nonzerop128(const int128 a)
				676	{
				677	return (a[LSW128] \| a[NLSW128] \| a[NMSW128] \| a[MSW128]);
				678	}
				679
				680	/* Addition and subtraction */
				681	/* Do these in "pure" assembly, because "extended" asm is unmanageable
				682	here */
				683	static inline void add128(const int128 a, int128 b)
				684	{
				685	/* rotating carry flags */
				686	unsigned int carry[2];
				687
				688	carry[0] = a[LSW128] > (0xffffffff - b[LSW128]);
				689	b[LSW128] += a[LSW128];
				690
				691	carry[1] = a[NLSW128] > (0xffffffff - b[NLSW128] - carry[0]);
				692	b[NLSW128] = a[NLSW128] + b[NLSW128] + carry[0];
				693
				694	carry[0] = a[NMSW128] > (0xffffffff - b[NMSW128] - carry[1]);
				695	b[NMSW128] = a[NMSW128] + b[NMSW128] + carry[1];
				696
				697	b[MSW128] = a[MSW128] + b[MSW128] + carry[0];
				698	}
				699
				700	/* Note: assembler semantics: "b -= a" */
				701	static inline void sub128(const int128 a, int128 b)
				702	{
				703	/* rotating borrow flags */
				704	unsigned int borrow[2];
				705
				706	borrow[0] = b[LSW128] < a[LSW128];
				707	b[LSW128] -= a[LSW128];
				708
				709	borrow[1] = b[NLSW128] < a[NLSW128] + borrow[0];
				710	b[NLSW128] = b[NLSW128] - a[NLSW128] - borrow[0];
				711
				712	borrow[0] = b[NMSW128] < a[NMSW128] + borrow[1];
				713	b[NMSW128] = b[NMSW128] - a[NMSW128] - borrow[1];
				714
				715	b[MSW128] = b[MSW128] - a[MSW128] - borrow[0];
				716	}
				717
				718	/* Poor man's 64-bit expanding multiply */
				719	static inline void mul64(unsigned long long a, unsigned long long b, int128 c)
				720	{
				721	unsigned long long acc;
				722	int128 acc128;
				723
				724	zero128(acc128);
				725	zero128(c);
				726
				727	/* first the low words */
				728	if (LO_WORD(a) && LO_WORD(b)) {
				729	acc = (long long) LO_WORD(a) * LO_WORD(b);
				730	c[NLSW128] = HI_WORD(acc);
				731	c[LSW128] = LO_WORD(acc);
				732	}
				733	/* Next the high words */
				734	if (HI_WORD(a) && HI_WORD(b)) {
				735	acc = (long long) HI_WORD(a) * HI_WORD(b);
				736	c[MSW128] = HI_WORD(acc);
				737	c[NMSW128] = LO_WORD(acc);
				738	}
				739	/* The middle words */
				740	if (LO_WORD(a) && HI_WORD(b)) {
				741	acc = (long long) LO_WORD(a) * HI_WORD(b);
				742	acc128[NMSW128] = HI_WORD(acc);
				743	acc128[NLSW128] = LO_WORD(acc);
				744	add128(acc128, c);
				745	}
				746	/* The first and last words */
				747	if (HI_WORD(a) && LO_WORD(b)) {
				748	acc = (long long) HI_WORD(a) * LO_WORD(b);
				749	acc128[NMSW128] = HI_WORD(acc);
				750	acc128[NLSW128] = LO_WORD(acc);
				751	add128(acc128, c);
				752	}
				753	}
				754
				755	/* Note: unsigned */
				756	static inline int cmp128(int128 a, int128 b)
				757	{
				758	if (a[MSW128] < b[MSW128])
				759	return -1;
				760	if (a[MSW128] > b[MSW128])
				761	return 1;
				762	if (a[NMSW128] < b[NMSW128])
				763	return -1;
				764	if (a[NMSW128] > b[NMSW128])
				765	return 1;
				766	if (a[NLSW128] < b[NLSW128])
				767	return -1;
				768	if (a[NLSW128] > b[NLSW128])
				769	return 1;
				770
				771	return (signed) a[LSW128] - b[LSW128];
				772	}
				773
				774	inline void div128(int128 a, int128 b, int128 c)
				775	{
				776	int128 mask;
				777
				778	/* Algorithm:
				779
				780	Shift the divisor until it's at least as big as the
				781	dividend, keeping track of the position to which we've
				782	shifted it, i.e. the power of 2 which we've multiplied it
				783	by.
				784
				785	Then, for this power of 2 (the mask), and every one smaller
				786	than it, subtract the mask from the dividend and add it to
				787	the quotient until the dividend is smaller than the raised
				788	divisor. At this point, divide the dividend and the mask
				789	by 2 (i.e. shift one place to the right). Lather, rinse,
				790	and repeat, until there are no more powers of 2 left. */
				791
				792	/* FIXME: needless to say, there's room for improvement here too. */
				793
				794	/* Shift up */
				795	/* XXX: since it just has to be "at least as big", we can
				796	probably eliminate this horribly wasteful loop. I will
				797	have to prove this first, though */
				798	set128(0, 0, 0, 1, mask);
				799	while (cmp128(b, a) < 0 && !btsthi128(b)) {
				800	lslone128(b);
				801	lslone128(mask);
				802	}
				803
				804	/* Shift down */
				805	zero128(c);
				806	do {
				807	if (cmp128(a, b) >= 0) {
				808	sub128(b, a);
				809	add128(mask, c);
				810	}
				811	lsrone128(mask);
				812	lsrone128(b);
				813	} while (nonzerop128(mask));
				814
				815	/* The remainder is in a... */
				816	}
				817	#endif
				818
				819	#endif /* MULTI_ARITH_H */