Linux-2.6.12-rc2

Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.

Let it rip!
diff --git a/arch/m68k/math-emu/Makefile b/arch/m68k/math-emu/Makefile
new file mode 100644
index 0000000..5399404
--- /dev/null
+++ b/arch/m68k/math-emu/Makefile
@@ -0,0 +1,11 @@
+#
+# Makefile for the linux kernel.
+#
+
+EXTRA_AFLAGS := -traditional
+
+#EXTRA_AFLAGS += -DFPU_EMU_DEBUG
+#EXTRA_CFLAGS += -DFPU_EMU_DEBUG
+
+obj-y		:= fp_entry.o fp_scan.o fp_util.o fp_move.o fp_movem.o \
+			fp_cond.o fp_arith.o fp_log.o fp_trig.o
diff --git a/arch/m68k/math-emu/fp_arith.c b/arch/m68k/math-emu/fp_arith.c
new file mode 100644
index 0000000..08f286d
--- /dev/null
+++ b/arch/m68k/math-emu/fp_arith.c
@@ -0,0 +1,701 @@
+/*
+
+   fp_arith.c: floating-point math routines for the Linux-m68k
+   floating point emulator.
+
+   Copyright (c) 1998-1999 David Huggins-Daines.
+
+   Somewhat based on the AlphaLinux floating point emulator, by David
+   Mosberger-Tang.
+
+   You may copy, modify, and redistribute this file under the terms of
+   the GNU General Public License, version 2, or any later version, at
+   your convenience.
+ */
+
+#include "fp_emu.h"
+#include "multi_arith.h"
+#include "fp_arith.h"
+
+const struct fp_ext fp_QNaN =
+{
+	.exp = 0x7fff,
+	.mant = { .m64 = ~0 }
+};
+
+const struct fp_ext fp_Inf =
+{
+	.exp = 0x7fff,
+};
+
+/* let's start with the easy ones */
+
+struct fp_ext *
+fp_fabs(struct fp_ext *dest, struct fp_ext *src)
+{
+	dprint(PINSTR, "fabs\n");
+
+	fp_monadic_check(dest, src);
+
+	dest->sign = 0;
+
+	return dest;
+}
+
+struct fp_ext *
+fp_fneg(struct fp_ext *dest, struct fp_ext *src)
+{
+	dprint(PINSTR, "fneg\n");
+
+	fp_monadic_check(dest, src);
+
+	dest->sign = !dest->sign;
+
+	return dest;
+}
+
+/* Now, the slightly harder ones */
+
+/* fp_fadd: Implements the kernel of the FADD, FSADD, FDADD, FSUB,
+   FDSUB, and FCMP instructions. */
+
+struct fp_ext *
+fp_fadd(struct fp_ext *dest, struct fp_ext *src)
+{
+	int diff;
+
+	dprint(PINSTR, "fadd\n");
+
+	fp_dyadic_check(dest, src);
+
+	if (IS_INF(dest)) {
+		/* infinity - infinity == NaN */
+		if (IS_INF(src) && (src->sign != dest->sign))
+			fp_set_nan(dest);
+		return dest;
+	}
+	if (IS_INF(src)) {
+		fp_copy_ext(dest, src);
+		return dest;
+	}
+
+	if (IS_ZERO(dest)) {
+		if (IS_ZERO(src)) {
+			if (src->sign != dest->sign) {
+				if (FPDATA->rnd == FPCR_ROUND_RM)
+					dest->sign = 1;
+				else
+					dest->sign = 0;
+			}
+		} else
+			fp_copy_ext(dest, src);
+		return dest;
+	}
+
+	dest->lowmant = src->lowmant = 0;
+
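+	/* align the radix points: the operand with the smaller exponent
+	   gets its mantissa shifted right by the exponent difference */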
+	if ((diff = dest->exp - src->exp) > 0)
+		fp_denormalize(src, diff);
+	else if ((diff = -diff) > 0)
+		fp_denormalize(dest, diff);
+
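+	/* equal signs: add the mantissas and propagate a possible carry
+	   into the exponent; opposite signs: subtract the smaller
+	   magnitude from the larger, taking the sign of the larger one */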
+	if (dest->sign == src->sign) {
+		if (fp_addmant(dest, src))
+			if (!fp_addcarry(dest))
+				return dest;
+	} else {
+		if (dest->mant.m64 < src->mant.m64) {
+			fp_submant(dest, src, dest);
+			dest->sign = !dest->sign;
+		} else
+			fp_submant(dest, dest, src);
+	}
+
+	return dest;
+}
+
+/* fp_fsub: Implements the kernel of the FSUB, FSSUB, and FDSUB
+   instructions.
+
+   Remember that the arguments are in assembler-syntax order! */
+
+struct fp_ext *
+fp_fsub(struct fp_ext *dest, struct fp_ext *src)
+{
+	dprint(PINSTR, "fsub ");
+
+	src->sign = !src->sign;
+	return fp_fadd(dest, src);
+}
+
+
+struct fp_ext *
+fp_fcmp(struct fp_ext *dest, struct fp_ext *src)
+{
+	dprint(PINSTR, "fcmp ");
+
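+	/* fcmp only needs the condition codes: do the subtract in a
+	   scratch register so the destination operand stays unchanged */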
+	FPDATA->temp[1] = *dest;
+	src->sign = !src->sign;
+	return fp_fadd(&FPDATA->temp[1], src);
+}
+
+struct fp_ext *
+fp_ftst(struct fp_ext *dest, struct fp_ext *src)
+{
+	dprint(PINSTR, "ftst\n");
+
+	(void)dest;
+
+	return src;
+}
+
+struct fp_ext *
+fp_fmul(struct fp_ext *dest, struct fp_ext *src)
+{
+	union fp_mant128 temp;
+	int exp;
+
+	dprint(PINSTR, "fmul\n");
+
+	fp_dyadic_check(dest, src);
+
+	/* calculate the correct sign now, as it's necessary for infinities */
+	dest->sign = src->sign ^ dest->sign;
+
+	/* Handle infinities */
+	if (IS_INF(dest)) {
+		if (IS_ZERO(src))
+			fp_set_nan(dest);
+		return dest;
+	}
+	if (IS_INF(src)) {
+		if (IS_ZERO(dest))
+			fp_set_nan(dest);
+		else
+			fp_copy_ext(dest, src);
+		return dest;
+	}
+
+	/* Of course, as we all know, zero * anything = zero.  You may
+	   not have known that it might be a positive or negative
+	   zero... */
+	if (IS_ZERO(dest) || IS_ZERO(src)) {
+		dest->exp = 0;
+		dest->mant.m64 = 0;
+		dest->lowmant = 0;
+
+		return dest;
+	}
+
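+	/* both exponents carry the 0x3fff bias, so their sum counts the
+	   bias twice; using -0x3ffe (bias minus one) assumes the mantissa
+	   product lands in [2,4), and exp is decremented below when it
+	   falls in [1,2) instead.  E.g. 2.0 * 3.0: 0x4000 + 0x4000 -
+	   0x3ffe = 0x4002, mantissa product 1.5 < 2, so exp ends up as
+	   0x4001, which encodes 6.0. */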
+	exp = dest->exp + src->exp - 0x3ffe;
+
+	/* shift up the mantissa for denormalized numbers,
+	   so that the highest bit is set; this makes the
+	   shift of the result below easier */
+	if ((long)dest->mant.m32[0] >= 0)
+		exp -= fp_overnormalize(dest);
+	if ((long)src->mant.m32[0] >= 0)
+		exp -= fp_overnormalize(src);
+
+	/* now, do a 64-bit multiply with expansion */
+	fp_multiplymant(&temp, dest, src);
+
+	/* normalize it back to 64 bits and stuff it back into the
+	   destination struct */
+	if ((long)temp.m32[0] > 0) {
+		exp--;
+		fp_putmant128(dest, &temp, 1);
+	} else
+		fp_putmant128(dest, &temp, 0);
+
+	if (exp >= 0x7fff) {
+		fp_set_ovrflw(dest);
+		return dest;
+	}
+	dest->exp = exp;
+	if (exp < 0) {
+		fp_set_sr(FPSR_EXC_UNFL);
+		fp_denormalize(dest, -exp);
+	}
+
+	return dest;
+}
+
+/* fp_fdiv: Implements the "kernel" of the FDIV, FSDIV, FDDIV and
+   FSGLDIV instructions.
+
+   Note that the order of the operands is counter-intuitive: instead
+   of src / dest, the result is actually dest / src. */
+
+struct fp_ext *
+fp_fdiv(struct fp_ext *dest, struct fp_ext *src)
+{
+	union fp_mant128 temp;
+	int exp;
+
+	dprint(PINSTR, "fdiv\n");
+
+	fp_dyadic_check(dest, src);
+
+	/* calculate the correct sign now, as it's necessary for infinities */
+	dest->sign = src->sign ^ dest->sign;
+
+	/* Handle infinities */
+	if (IS_INF(dest)) {
+		/* infinity / infinity = NaN (quiet, as always) */
+		if (IS_INF(src))
+			fp_set_nan(dest);
+		/* infinity / anything else = infinity (with appropriate sign) */
+		return dest;
+	}
+	if (IS_INF(src)) {
+		/* anything / infinity = zero (with appropriate sign) */
+		dest->exp = 0;
+		dest->mant.m64 = 0;
+		dest->lowmant = 0;
+
+		return dest;
+	}
+
+	/* zeroes */
+	if (IS_ZERO(dest)) {
+		/* zero / zero = NaN */
+		if (IS_ZERO(src))
+			fp_set_nan(dest);
+		/* zero / anything else = zero */
+		return dest;
+	}
+	if (IS_ZERO(src)) {
+		/* anything / zero = infinity (with appropriate sign) */
+		fp_set_sr(FPSR_EXC_DZ);
+		dest->exp = 0x7fff;
+		dest->mant.m64 = 0;
+
+		return dest;
+	}
+
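+	/* subtracting the biased exponents cancels the bias, so it is
+	   added back here; exp is decremented after the divide below
+	   when the mantissa quotient comes out below 1.0 */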
+	exp = dest->exp - src->exp + 0x3fff;
+
+	/* shift up the mantissa for denormalized numbers,
+	   so that the highest bit is set; this makes lots
+	   of things below easier */
+	if ((long)dest->mant.m32[0] >= 0)
+		exp -= fp_overnormalize(dest);
+	if ((long)src->mant.m32[0] >= 0)
+		exp -= fp_overnormalize(src);
+
+	/* now, do the 64-bit divide */
+	fp_dividemant(&temp, dest, src);
+
+	/* normalize it back to 64 bits and stuff it back into the
+	   destination struct */
+	if (!temp.m32[0]) {
+		exp--;
+		fp_putmant128(dest, &temp, 32);
+	} else
+		fp_putmant128(dest, &temp, 31);
+
+	if (exp >= 0x7fff) {
+		fp_set_ovrflw(dest);
+		return dest;
+	}
+	dest->exp = exp;
+	if (exp < 0) {
+		fp_set_sr(FPSR_EXC_UNFL);
+		fp_denormalize(dest, -exp);
+	}
+
+	return dest;
+}
+
+struct fp_ext *
+fp_fsglmul(struct fp_ext *dest, struct fp_ext *src)
+{
+	int exp;
+
+	dprint(PINSTR, "fsglmul\n");
+
+	fp_dyadic_check(dest, src);
+
+	/* calculate the correct sign now, as it's necessary for infinities */
+	dest->sign = src->sign ^ dest->sign;
+
+	/* Handle infinities */
+	if (IS_INF(dest)) {
+		if (IS_ZERO(src))
+			fp_set_nan(dest);
+		return dest;
+	}
+	if (IS_INF(src)) {
+		if (IS_ZERO(dest))
+			fp_set_nan(dest);
+		else
+			fp_copy_ext(dest, src);
+		return dest;
+	}
+
+	/* Of course, as we all know, zero * anything = zero.  You may
+	   not have known that it might be a positive or negative
+	   zero... */
+	if (IS_ZERO(dest) || IS_ZERO(src)) {
+		dest->exp = 0;
+		dest->mant.m64 = 0;
+		dest->lowmant = 0;
+
+		return dest;
+	}
+
+	exp = dest->exp + src->exp - 0x3ffe;
+
+	/* do a 32-bit multiply */
+	fp_mul64(dest->mant.m32[0], dest->mant.m32[1],
+		 dest->mant.m32[0] & 0xffffff00,
+		 src->mant.m32[0] & 0xffffff00);
+
+	if (exp >= 0x7fff) {
+		fp_set_ovrflw(dest);
+		return dest;
+	}
+	dest->exp = exp;
+	if (exp < 0) {
+		fp_set_sr(FPSR_EXC_UNFL);
+		fp_denormalize(dest, -exp);
+	}
+
+	return dest;
+}
+
+struct fp_ext *
+fp_fsgldiv(struct fp_ext *dest, struct fp_ext *src)
+{
+	int exp;
+	unsigned long quot, rem;
+
+	dprint(PINSTR, "fsgldiv\n");
+
+	fp_dyadic_check(dest, src);
+
+	/* calculate the correct sign now, as it's necessary for infinities */
+	dest->sign = src->sign ^ dest->sign;
+
+	/* Handle infinities */
+	if (IS_INF(dest)) {
+		/* infinity / infinity = NaN (quiet, as always) */
+		if (IS_INF(src))
+			fp_set_nan(dest);
+		/* infinity / anything else = infinity (with appropriate sign) */
+		return dest;
+	}
+	if (IS_INF(src)) {
+		/* anything / infinity = zero (with appropriate sign) */
+		dest->exp = 0;
+		dest->mant.m64 = 0;
+		dest->lowmant = 0;
+
+		return dest;
+	}
+
+	/* zeroes */
+	if (IS_ZERO(dest)) {
+		/* zero / zero = NaN */
+		if (IS_ZERO(src))
+			fp_set_nan(dest);
+		/* zero / anything else = zero */
+		return dest;
+	}
+	if (IS_ZERO(src)) {
+		/* anything / zero = infinity (with appropriate sign) */
+		fp_set_sr(FPSR_EXC_DZ);
+		dest->exp = 0x7fff;
+		dest->mant.m64 = 0;
+
+		return dest;
+	}
+
+	exp = dest->exp - src->exp + 0x3fff;
+
+	dest->mant.m32[0] &= 0xffffff00;
+	src->mant.m32[0] &= 0xffffff00;
+
+	/* do the 32-bit divide */
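+	/* if the dividend mantissa is not smaller than the divisor's,
+	   the raw 32-bit quotient would overflow; subtract the divisor
+	   first and restore the implicit leading bit in the result */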
+	if (dest->mant.m32[0] >= src->mant.m32[0]) {
+		fp_sub64(dest->mant, src->mant);
+		fp_div64(quot, rem, dest->mant.m32[0], 0, src->mant.m32[0]);
+		dest->mant.m32[0] = 0x80000000 | (quot >> 1);
+		dest->mant.m32[1] = (quot & 1) | rem;	/* only for rounding */
+	} else {
+		fp_div64(quot, rem, dest->mant.m32[0], 0, src->mant.m32[0]);
+		dest->mant.m32[0] = quot;
+		dest->mant.m32[1] = rem;		/* only for rounding */
+		exp--;
+	}
+
+	if (exp >= 0x7fff) {
+		fp_set_ovrflw(dest);
+		return dest;
+	}
+	dest->exp = exp;
+	if (exp < 0) {
+		fp_set_sr(FPSR_EXC_UNFL);
+		fp_denormalize(dest, -exp);
+	}
+
+	return dest;
+}
+
+/* fp_roundint: Internal rounding function for use by several of these
+   emulated instructions.
+
+   This one rounds off the fractional part using the rounding mode
+   specified. */
+
+static void fp_roundint(struct fp_ext *dest, int mode)
+{
+	union fp_mant64 oldmant;
+	unsigned long mask;
+
+	if (!fp_normalize_ext(dest))
+		return;
+
+	/* infinities and zeroes */
+	if (IS_INF(dest) || IS_ZERO(dest))
+		return;
+
+	/* first truncate the lower bits */
+	oldmant = dest->mant;
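+	/* exponent ranges: below 0x3fff the value is purely fractional,
+	   for 0x3fff-0x401e the integer part fits in the upper mantissa
+	   word, for 0x401f-0x403e it reaches into the lower word, and
+	   anything larger has no fractional bits left to round away */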
+	switch (dest->exp) {
+	case 0 ... 0x3ffe:
+		dest->mant.m64 = 0;
+		break;
+	case 0x3fff ... 0x401e:
+		dest->mant.m32[0] &= 0xffffffffU << (0x401e - dest->exp);
+		dest->mant.m32[1] = 0;
+		if (oldmant.m64 == dest->mant.m64)
+			return;
+		break;
+	case 0x401f ... 0x403e:
+		dest->mant.m32[1] &= 0xffffffffU << (0x403e - dest->exp);
+		if (oldmant.m32[1] == dest->mant.m32[1])
+			return;
+		break;
+	default:
+		return;
+	}
+	fp_set_sr(FPSR_EXC_INEX2);
+
+	/* We might want to normalize upwards here... however, since
+	   we know that this is only called on the output of fp_fdiv,
+	   or with the input to fp_fint or fp_fintrz, and the inputs
+	   to all these functions are either normal or denormalized
+	   (no subnormals allowed!), there's really no need.
+
+	   In the case of fp_fdiv, observe that 0x80000000 / 0xffff is
+	   0x8000 remainder 0x8000, so the top bit of the quotient is
+	   still set, and the same holds for 128-bit / 64-bit. (i.e. the
+	   smallest possible normal dividend and the largest possible normal
+	   divisor will still produce a normal quotient, therefore, (normal
+	   << 64) / normal is normal in all cases) */
+
+	switch (mode) {
+	case FPCR_ROUND_RN:
+		switch (dest->exp) {
+		case 0 ... 0x3ffd:
+			return;
+		case 0x3ffe:
+			/* As noted above, the input is always normal, so the
+			   guard bit (bit 63) is always set.  Therefore, the
+			   only case in which we will NOT round to 1.0 is when
+			   the input is exactly 0.5. */
+			if (oldmant.m64 == (1ULL << 63))
+				return;
+			break;
+		case 0x3fff ... 0x401d:
+			mask = 1 << (0x401d - dest->exp);
+			if (!(oldmant.m32[0] & mask))
+				return;
+			if (oldmant.m32[0] & (mask << 1))
+				break;
+			if (!(oldmant.m32[0] << (dest->exp - 0x3ffd)) &&
+					!oldmant.m32[1])
+				return;
+			break;
+		case 0x401e:
+			if (!(oldmant.m32[1] & 0x80000000))
+				return;
+			if (oldmant.m32[0] & 1)
+				break;
+			if (!(oldmant.m32[1] << 1))
+				return;
+			break;
+		case 0x401f ... 0x403d:
+			mask = 1 << (0x403d - dest->exp);
+			if (!(oldmant.m32[1] & mask))
+				return;
+			if (oldmant.m32[1] & (mask << 1))
+				break;
+			if (!(oldmant.m32[1] << (dest->exp - 0x401d)))
+				return;
+			break;
+		default:
+			return;
+		}
+		break;
+	case FPCR_ROUND_RZ:
+		return;
+	default:
+		if (dest->sign ^ (mode - FPCR_ROUND_RM))
+			break;
+		return;
+	}
+
+	switch (dest->exp) {
+	case 0 ... 0x3ffe:
+		dest->exp = 0x3fff;
+		dest->mant.m64 = 1ULL << 63;
+		break;
+	case 0x3fff ... 0x401e:
+		mask = 1 << (0x401e - dest->exp);
+		if (dest->mant.m32[0] += mask)
+			break;
+		dest->mant.m32[0] = 0x80000000;
+		dest->exp++;
+		break;
+	case 0x401f ... 0x403e:
+		mask = 1 << (0x403e - dest->exp);
+		if (dest->mant.m32[1] += mask)
+			break;
+		if (dest->mant.m32[0] += 1)
+			break;
+		dest->mant.m32[0] = 0x80000000;
+		dest->exp++;
+		break;
+	}
+}
+
+/* modrem_kernel: Implementation of the FREM and FMOD instructions
+   (which are exactly the same, except for the rounding used on the
+   intermediate value) */
+
+static struct fp_ext *
+modrem_kernel(struct fp_ext *dest, struct fp_ext *src, int mode)
+{
+	struct fp_ext tmp;
+
+	fp_dyadic_check(dest, src);
+
+	/* Infinities and zeros */
+	if (IS_INF(dest) || IS_ZERO(src)) {
+		fp_set_nan(dest);
+		return dest;
+	}
+	if (IS_ZERO(dest) || IS_INF(src))
+		return dest;
+
+	/* FIXME: there is almost certainly a smarter way to do this */
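+	/* compute n = round(dest / src) with the requested rounding mode,
+	   then the result is dest - n * src */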
+	fp_copy_ext(&tmp, dest);
+	fp_fdiv(&tmp, src);		/* NOTE: src might be modified */
+	fp_roundint(&tmp, mode);
+	fp_fmul(&tmp, src);
+	fp_fsub(dest, &tmp);
+
+	/* set the quotient byte */
+	fp_set_quotient((dest->mant.m64 & 0x7f) | (dest->sign << 7));
+	return dest;
+}
+
+/* fp_fmod: Implements the kernel of the FMOD instruction.
+
+   Again, the argument order is backwards.  The result, as defined in
+   the Motorola manuals, is:
+
+   fmod(src,dest) = (dest - (src * floor(dest / src))) */
+
+struct fp_ext *
+fp_fmod(struct fp_ext *dest, struct fp_ext *src)
+{
+	dprint(PINSTR, "fmod\n");
+	return modrem_kernel(dest, src, FPCR_ROUND_RZ);
+}
+
+/* fp_frem: Implements the kernel of the FREM instruction.
+
+   frem(src,dest) = (dest - (src * round(dest / src)))
+ */
+
+struct fp_ext *
+fp_frem(struct fp_ext *dest, struct fp_ext *src)
+{
+	dprint(PINSTR, "frem\n");
+	return modrem_kernel(dest, src, FPCR_ROUND_RN);
+}
+
+struct fp_ext *
+fp_fint(struct fp_ext *dest, struct fp_ext *src)
+{
+	dprint(PINSTR, "fint\n");
+
+	fp_copy_ext(dest, src);
+
+	fp_roundint(dest, FPDATA->rnd);
+
+	return dest;
+}
+
+struct fp_ext *
+fp_fintrz(struct fp_ext *dest, struct fp_ext *src)
+{
+	dprint(PINSTR, "fintrz\n");
+
+	fp_copy_ext(dest, src);
+
+	fp_roundint(dest, FPCR_ROUND_RZ);
+
+	return dest;
+}
+
+struct fp_ext *
+fp_fscale(struct fp_ext *dest, struct fp_ext *src)
+{
+	int scale, oldround;
+
+	dprint(PINSTR, "fscale\n");
+
+	fp_dyadic_check(dest, src);
+
+	/* Infinities */
+	if (IS_INF(src)) {
+		fp_set_nan(dest);
+		return dest;
+	}
+	if (IS_INF(dest))
+		return dest;
+
+	/* zeroes */
+	if (IS_ZERO(src) || IS_ZERO(dest))
+		return dest;
+
+	/* Source exponent out of range */
+	if (src->exp >= 0x400c) {
+		fp_set_ovrflw(dest);
+		return dest;
+	}
+
+	/* src must be rounded with round to zero. */
+	oldround = FPDATA->rnd;
+	FPDATA->rnd = FPCR_ROUND_RZ;
+	scale = fp_conv_ext2long(src);
+	FPDATA->rnd = oldround;
+
+	/* new exponent */
+	scale += dest->exp;
+
+	if (scale >= 0x7fff) {
+		fp_set_ovrflw(dest);
+	} else if (scale <= 0) {
+		fp_set_sr(FPSR_EXC_UNFL);
+		fp_denormalize(dest, -scale);
+	} else
+		dest->exp = scale;
+
+	return dest;
+}
+
diff --git a/arch/m68k/math-emu/fp_arith.h b/arch/m68k/math-emu/fp_arith.h
new file mode 100644
index 0000000..2cc3f84
--- /dev/null
+++ b/arch/m68k/math-emu/fp_arith.h
@@ -0,0 +1,52 @@
+/*
+
+   fp_arith.h: floating-point math routines for the Linux-m68k
+   floating point emulator.
+
+   Copyright (c) 1998 David Huggins-Daines.
+
+   Somewhat based on the AlphaLinux floating point emulator, by David
+   Mosberger-Tang.
+
+   You may copy, modify, and redistribute this file under the terms of
+   the GNU General Public License, version 2, or any later version, at
+   your convenience.
+
+ */
+
+#ifndef FP_ARITH_H
+#define FP_ARITH_H
+
+/* easy ones */
+struct fp_ext *
+fp_fabs(struct fp_ext *dest, struct fp_ext *src);
+struct fp_ext *
+fp_fneg(struct fp_ext *dest, struct fp_ext *src);
+
+/* straightforward arithmetic */
+struct fp_ext *
+fp_fadd(struct fp_ext *dest, struct fp_ext *src);
+struct fp_ext *
+fp_fsub(struct fp_ext *dest, struct fp_ext *src);
+struct fp_ext *
+fp_fcmp(struct fp_ext *dest, struct fp_ext *src);
+struct fp_ext *
+fp_ftst(struct fp_ext *dest, struct fp_ext *src);
+struct fp_ext *
+fp_fmul(struct fp_ext *dest, struct fp_ext *src);
+struct fp_ext *
+fp_fdiv(struct fp_ext *dest, struct fp_ext *src);
+
+/* ones that do rounding and integer conversions */
+struct fp_ext *
+fp_fmod(struct fp_ext *dest, struct fp_ext *src);
+struct fp_ext *
+fp_frem(struct fp_ext *dest, struct fp_ext *src);
+struct fp_ext *
+fp_fint(struct fp_ext *dest, struct fp_ext *src);
+struct fp_ext *
+fp_fintrz(struct fp_ext *dest, struct fp_ext *src);
+struct fp_ext *
+fp_fscale(struct fp_ext *dest, struct fp_ext *src);
+
+#endif	/* FP_ARITH_H */
diff --git a/arch/m68k/math-emu/fp_cond.S b/arch/m68k/math-emu/fp_cond.S
new file mode 100644
index 0000000..ddae8b1
--- /dev/null
+++ b/arch/m68k/math-emu/fp_cond.S
@@ -0,0 +1,334 @@
+/*
+ * fp_cond.S
+ *
+ * Copyright Roman Zippel, 1997.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * ALTERNATIVELY, this product may be distributed under the terms of
+ * the GNU General Public License, in which case the provisions of the GPL are
+ * required INSTEAD OF the above restrictions.  (This clause is
+ * necessary due to a potential bad interaction between the GPL and
+ * the restrictions contained in a BSD-style copyright.)
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fp_emu.h"
+#include "fp_decode.h"
+
+	.globl	fp_fscc, fp_fbccw, fp_fbccl
+
+#ifdef FPU_EMU_DEBUG
+fp_fnop:
+	printf	PDECODE,"fnop\n"
+	jra	fp_end
+#else
+#define fp_fnop fp_end
+#endif
+
+fp_fbccw:
+	tst.w	%d2
+	jeq	fp_fnop
+	printf	PDECODE,"fbccw "
+	fp_get_pc %a0
+	lea	(-2,%a0,%d2.w),%a0
+	jra	1f
+
+fp_fbccl:
+	printf	PDECODE,"fbccl "
+	fp_get_pc %a0
+	move.l	%d2,%d0
+	swap	%d0
+	fp_get_instr_word %d0,fp_err_ua1
+	lea	(-2,%a0,%d0.l),%a0
+1:	printf	PDECODE,"%x",1,%a0
+	move.l	%d2,%d0
+	swap	%d0
+	jsr	fp_compute_cond
+	tst.l	%d0
+	jeq	1f
+	fp_put_pc %a0,1
+1:	printf	PDECODE,"\n"
+	jra	fp_end
+
+fp_fdbcc:
+	printf	PDECODE,"fdbcc "
+	fp_get_pc %a1				| calculate new pc
+	fp_get_instr_word %d0,fp_err_ua1
+	add.w	%d0,%a1
+	fp_decode_addr_reg
+	printf	PDECODE,"d%d,%x\n",2,%d0,%a1
+	swap	%d1				| test condition in %d1
+	tst.w	%d1
+	jne	2f
+	move.l	%d0,%d1
+	jsr	fp_get_data_reg
+	subq.w	#1,%d0
+	jcs	1f
+	fp_put_pc %a1,1
+1:	jsr	fp_put_data_reg
+2:	jra	fp_end
+
+| set flags for decode macros for fs<cc>
+do_fscc=1
+do_no_pc_mode=1
+
+fp_fscc:
+	printf	PDECODE,"fscc "
+	move.l	%d2,%d0
+	jsr	fp_compute_cond
+	move.w	%d0,%d1
+	swap	%d1
+
+	| decode addressing mode
+	fp_decode_addr_mode
+
+	.long	fp_data, fp_fdbcc
+	.long	fp_indirect, fp_postinc
+	.long	fp_predecr, fp_disp16
+	.long	fp_extmode0, fp_extmode1
+
+	| addressing mode: data register direct
+fp_data:
+	fp_mode_data_direct
+	move.w	%d0,%d1			| save register nr
+	jsr	fp_get_data_reg
+	swap	%d1
+	move.b	%d1,%d0
+	swap	%d1
+	jsr	fp_put_data_reg
+	printf	PDECODE,"\n"
+	jra	fp_end
+
+fp_indirect:
+	fp_mode_addr_indirect
+	jra	fp_do_scc
+
+fp_postinc:
+	fp_mode_addr_indirect_postinc
+	jra	fp_do_scc
+
+fp_predecr:
+	fp_mode_addr_indirect_predec
+	jra	fp_do_scc
+
+fp_disp16:
+	fp_mode_addr_indirect_disp16
+	jra	fp_do_scc
+
+fp_extmode0:
+	fp_mode_addr_indirect_extmode0
+	jra	fp_do_scc
+
+fp_extmode1:
+	bfextu	%d2{#13,#3},%d0
+	jmp	([0f:w,%pc,%d0*4])
+
+	.align	4
+0:
+	.long	fp_absolute_short, fp_absolute_long
+	.long	fp_ill, fp_ill		| NOTE: jump here to ftrap.x
+	.long	fp_ill, fp_ill
+	.long	fp_ill, fp_ill
+
+fp_absolute_short:
+	fp_mode_abs_short
+	jra	fp_do_scc
+
+fp_absolute_long:
+	fp_mode_abs_long
+|	jra	fp_do_scc
+
+fp_do_scc:
+	swap	%d1
+	putuser.b %d1,(%a0),fp_err_ua1,%a0
+	printf	PDECODE,"\n"
+	jra	fp_end
+
+
+#define tst_NAN	btst #24,%d1
+#define tst_Z	btst #26,%d1
+#define tst_N	btst #27,%d1
+
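+| compute the condition from the FPSR condition codes; predicates with
+| bit 4 set (the IEEE nonaware ones) additionally raise BSUN and the
+| accrued IOP bit when the NAN condition code is set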
+fp_compute_cond:
+	move.l	(FPD_FPSR,FPDATA),%d1
+	btst	#4,%d0
+	jeq	1f
+	tst_NAN
+	jeq	1f
+	bset	#15,%d1
+	bset	#7,%d1
+	move.l	%d1,(FPD_FPSR,FPDATA)
+1:	and.w	#0xf,%d0
+	jmp	([0f:w,%pc,%d0.w*4])
+
+	.align	4
+0:
+	.long	fp_f  , fp_eq , fp_ogt, fp_oge
+	.long	fp_olt, fp_ole, fp_ogl, fp_or
+	.long	fp_un , fp_ueq, fp_ugt, fp_uge
+	.long	fp_ult, fp_ule, fp_ne , fp_t
+
+fp_f:
+	moveq	#0,%d0
+	rts
+
+fp_eq:
+	moveq	#0,%d0
+	tst_Z
+	jeq	1f
+	moveq	#-1,%d0
+1:	rts
+
+fp_ogt:
+	moveq	#0,%d0
+	tst_NAN
+	jne	1f
+	tst_Z
+	jne	1f
+	tst_N
+	jne	1f
+	moveq	#-1,%d0
+1:	rts
+
+fp_oge:
+	moveq	#-1,%d0
+	tst_Z
+	jne	2f
+	tst_NAN
+	jne	1f
+	tst_N
+	jeq	2f
+1:	moveq	#0,%d0
+2:	rts
+
+fp_olt:
+	moveq	#0,%d0
+	tst_NAN
+	jne	1f
+	tst_Z
+	jne	1f
+	tst_N
+	jeq	1f
+	moveq	#-1,%d0
+1:	rts
+
+fp_ole:
+	moveq	#-1,%d0
+	tst_Z
+	jne	2f
+	tst_NAN
+	jne	1f
+	tst_N
+	jne	2f
+1:	moveq	#0,%d0
+2:	rts
+
+fp_ogl:
+	moveq	#0,%d0
+	tst_NAN
+	jne	1f
+	tst_Z
+	jne	1f
+	moveq	#-1,%d0
+1:	rts
+
+fp_or:
+	moveq	#0,%d0
+	tst_NAN
+	jne	1f
+	moveq	#-1,%d0
+1:	rts
+
+fp_un:
+	moveq	#0,%d0
+	tst_NAN
+	jeq	1f
+	moveq	#-1,%d0
+	rts
+
+fp_ueq:
+	moveq	#-1,%d0
+	tst_NAN
+	jne	1f
+	tst_Z
+	jne	1f
+	moveq	#0,%d0
+1:	rts
+
+fp_ugt:
+	moveq	#-1,%d0
+	tst_NAN
+	jne	2f
+	tst_N
+	jne	1f
+	tst_Z
+	jeq	2f
+1:	moveq	#0,%d0
+2:	rts
+
+fp_uge:
+	moveq	#-1,%d0
+	tst_NAN
+	jne	1f
+	tst_Z
+	jne	1f
+	tst_N
+	jeq	1f
+	moveq	#0,%d0
+1:	rts
+
+fp_ult:
+	moveq	#-1,%d0
+	tst_NAN
+	jne	2f
+	tst_Z
+	jne	1f
+	tst_N
+	jne	2f
+1:	moveq	#0,%d0
+2:	rts
+
+fp_ule:
+	moveq	#-1,%d0
+	tst_NAN
+	jne	1f
+	tst_Z
+	jne	1f
+	tst_N
+	jne	1f
+	moveq	#0,%d0
+1:	rts
+
+fp_ne:
+	moveq	#0,%d0
+	tst_Z
+	jne	1f
+	moveq	#-1,%d0
+1:	rts
+
+fp_t:
+	moveq	#-1,%d0
+	rts
diff --git a/arch/m68k/math-emu/fp_decode.h b/arch/m68k/math-emu/fp_decode.h
new file mode 100644
index 0000000..759679d
--- /dev/null
+++ b/arch/m68k/math-emu/fp_decode.h
@@ -0,0 +1,417 @@
+/*
+ * fp_decode.h
+ *
+ * Copyright Roman Zippel, 1997.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * ALTERNATIVELY, this product may be distributed under the terms of
+ * the GNU General Public License, in which case the provisions of the GPL are
+ * required INSTEAD OF the above restrictions.  (This clause is
+ * necessary due to a potential bad interaction between the GPL and
+ * the restrictions contained in a BSD-style copyright.)
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _FP_DECODE_H
+#define _FP_DECODE_H
+
+/* These macros do the dirty work of the instr decoding, several variables
+ * can be defined in the source file to modify the work of these macros,
+ * currently the following variables are used:
+ * ...
+ * The register usage:
+ * d0 - will contain source operand for data direct mode,
+ *	otherwise scratch register
+ * d1 - upper 16bit are reserved for caller
+ *	lower 16bit may contain further arguments,
+ *	is destroyed during decoding
+ * d2 - contains first two instruction words,
+ *	first word will be used for extension word
+ * a0 - will point to source/dest operand for any indirect mode
+ *	otherwise scratch register
+ * a1 - scratch register
+ * a2 - base addr to the task structure
+ *
+ * the current implementation doesn't check for every disallowed
+ * addressing mode (e.g. pc relative modes as destination); as long
+ * as accepting such a mode only means allowing an addressing mode
+ * that should not appear in a program, and doesn't crash the
+ * emulation, I think it's not a problem to allow these modes.
+ */
+
+do_fmovem=0
+do_fmovem_cr=0
+do_no_pc_mode=0
+do_fscc=0
+
+| first decoding of the instr type
+| this separates the conditional instr
+.macro	fp_decode_cond_instr_type
+	bfextu	%d2{#8,#2},%d0
+	jmp	([0f:w,%pc,%d0*4])
+
+	.align	4
+0:
+|	.long	"f<op>","fscc/fdbcc"
+|	.long	"fbccw","fbccl"
+.endm
+
+| second decoding of the instr type
+| this separates most move instr
+.macro	fp_decode_move_instr_type
+	bfextu	%d2{#16,#3},%d0
+	jmp	([0f:w,%pc,%d0*4])
+
+	.align	4
+0:
+|	.long	"f<op> fpx,fpx","invalid instr"
+|	.long	"f<op> <ea>,fpx","fmove fpx,<ea>"
+|	.long	"fmovem <ea>,fpcr","fmovem <ea>,fpx"
+|	.long	"fmovem fpcr,<ea>","fmovem fpx,<ea>"
+.endm
+
+| extract the source specifier, specifies
+| either source fp register or data format
+.macro	fp_decode_sourcespec
+	bfextu	%d2{#19,#3},%d0
+.endm
+
+| decode destination format for fmove reg,ea
+.macro	fp_decode_dest_format
+	bfextu	%d2{#19,#3},%d0
+.endm
+
+| decode source register for fmove reg,ea
+.macro	fp_decode_src_reg
+	bfextu	%d2{#22,#3},%d0
+.endm
+
+| extract the addressing mode
+| it depends on the instr which of the modes is valid
+.macro	fp_decode_addr_mode
+	bfextu	%d2{#10,#3},%d0
+	jmp	([0f:w,%pc,%d0*4])
+
+	.align	4
+0:
+|	.long	"data register direct","addr register direct"
+|	.long	"addr register indirect"
+|	.long	"addr register indirect postincrement"
+|	.long	"addr register indirect predecrement"
+|	.long	"addr register + index16"
+|	.long	"extension mode1","extension mode2"
+.endm
+
+| extract the register for the addressing mode
+.macro	fp_decode_addr_reg
+	bfextu	%d2{#13,#3},%d0
+.endm
+
+| decode the 8bit displacement from the brief extension word
+.macro	fp_decode_disp8
+	move.b	%d2,%d0
+	ext.w	%d0
+.endm
+
+| decode the index of the brief/full extension word
+.macro	fp_decode_index
+	bfextu	%d2{#17,#3},%d0		| get the register nr
+	btst	#15,%d2			| test for data/addr register
+	jne	1\@f
+	printf	PDECODE,"d%d",1,%d0
+	jsr	fp_get_data_reg
+	jra	2\@f
+1\@:	printf	PDECODE,"a%d",1,%d0
+	jsr	fp_get_addr_reg
+	move.l	%a0,%d0
+2\@:
+debug	lea	"'l'.w,%a0"
+	btst	#11,%d2			| 16/32 bit size?
+	jne	3\@f
+debug	lea	"'w'.w,%a0"
+	ext.l	%d0
+3\@:	printf	PDECODE,":%c",1,%a0
+	move.w	%d2,%d1			| scale factor
+	rol.w	#7,%d1
+	and.w	#3,%d1
+debug	move.l	"%d1,-(%sp)"
+debug	ext.l	"%d1"
+	printf	PDECODE,":%d",1,%d1
+debug	move.l	"(%sp)+,%d1"
+	lsl.l	%d1,%d0
+.endm
+
+| decode the base displacement size
+.macro	fp_decode_basedisp
+	bfextu	%d2{#26,#2},%d0
+	jmp	([0f:w,%pc,%d0*4])
+
+	.align	4
+0:
+|	.long	"reserved","null displacement"
+|	.long	"word displacement","long displacement"
+.endm
+
+.macro	fp_decode_outerdisp
+	bfextu	%d2{#30,#2},%d0
+	jmp	([0f:w,%pc,%d0*4])
+
+	.align	4
+0:
+|	.long	"no memory indirect action/reserved","null outer displacement"
+|	.long	"word outer displacement","long outer displacement"
+.endm
+
+| get the extension word and test for brief or full extension type
+.macro	fp_get_test_extword label
+	fp_get_instr_word %d2,fp_err_ua1
+	btst	#8,%d2
+	jne	\label
+.endm
+
+
+| test if %pc is the base register for the indirect addr mode
+.macro	fp_test_basereg_d16	label
+	btst	#20,%d2
+	jeq	\label
+.endm
+
+| test if %pc is the base register for one of the extended modes
+.macro	fp_test_basereg_ext	label
+	btst	#19,%d2
+	jeq	\label
+.endm
+
+.macro	fp_test_suppr_index label
+	btst	#6,%d2
+	jne	\label
+.endm
+
+
+| addressing mode: data register direct
+.macro	fp_mode_data_direct
+	fp_decode_addr_reg
+	printf	PDECODE,"d%d",1,%d0
+.endm
+
+| addressing mode: address register indirect
+.macro	fp_mode_addr_indirect
+	fp_decode_addr_reg
+	printf	PDECODE,"(a%d)",1,%d0
+	jsr	fp_get_addr_reg
+.endm
+
+| adjust stack for byte moves from/to stack
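+| byte accesses through (sp)+ or -(sp) still adjust the stack pointer
+| by two, so the size index used for the address update is changed
+| from byte (6) to word (4) here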
+.macro	fp_test_sp_byte_move
+	.if	!do_fmovem
+	.if	do_fscc
+	move.w	#6,%d1
+	.endif
+	cmp.w	#7,%d0
+	jne	1\@f
+	.if	!do_fscc
+	cmp.w	#6,%d1
+	jne	1\@f
+	.endif
+	move.w	#4,%d1
+1\@:
+	.endif
+.endm
+
+| addressing mode: address register indirect with postincrement
+.macro	fp_mode_addr_indirect_postinc
+	fp_decode_addr_reg
+	printf	PDECODE,"(a%d)+",1,%d0
+	fp_test_sp_byte_move
+	jsr	fp_get_addr_reg
+	move.l	%a0,%a1			| save addr
+	.if	do_fmovem
+	lea	(%a0,%d1.w*4),%a0
+	.if	!do_fmovem_cr
+	lea	(%a0,%d1.w*8),%a0
+	.endif
+	.else
+	add.w	(fp_datasize,%d1.w*2),%a0
+	.endif
+	jsr	fp_put_addr_reg
+	move.l	%a1,%a0
+.endm
+
+| addressing mode: address register indirect with predecrement
+.macro	fp_mode_addr_indirect_predec
+	fp_decode_addr_reg
+	printf	PDECODE,"-(a%d)",1,%d0
+	fp_test_sp_byte_move
+	jsr	fp_get_addr_reg
+	.if	do_fmovem
+	.if	!do_fmovem_cr
+	lea	(-12,%a0),%a1		| setup to addr of 1st reg to move
+	neg.w	%d1
+	lea	(%a0,%d1.w*4),%a0
+	add.w	%d1,%d1
+	lea	(%a0,%d1.w*4),%a0
+	jsr	fp_put_addr_reg
+	move.l	%a1,%a0
+	.else
+	neg.w	%d1
+	lea	(%a0,%d1.w*4),%a0
+	jsr	fp_put_addr_reg
+	.endif
+	.else
+	sub.w	(fp_datasize,%d1.w*2),%a0
+	jsr	fp_put_addr_reg
+	.endif
+.endm
+
+| addressing mode: address register/program counter indirect
+|		   with 16bit displacement
+.macro	fp_mode_addr_indirect_disp16
+	.if	!do_no_pc_mode
+	fp_test_basereg_d16 1f
+	printf	PDECODE,"pc"
+	fp_get_pc %a0
+	jra	2f
+	.endif
+1:	fp_decode_addr_reg
+	printf	PDECODE,"a%d",1,%d0
+	jsr	fp_get_addr_reg
+2:	fp_get_instr_word %a1,fp_err_ua1
+	printf	PDECODE,"@(%x)",1,%a1
+	add.l	%a1,%a0
+.endm
+
+| perform preindex (if I/IS == 0xx and xx != 00)
+.macro	fp_do_preindex
+	moveq	#3,%d0
+	and.w	%d2,%d0
+	jeq	1f
+	btst	#2,%d2
+	jne	1f
+	printf	PDECODE,")@("
+	getuser.l (%a1),%a1,fp_err_ua1,%a1
+debug	jra	"2f"
+1:	printf	PDECODE,","
+2:
+.endm
+
+| perform postindex (if I/IS == 1xx)
+.macro	fp_do_postindex
+	btst	#2,%d2
+	jeq	1f
+	printf	PDECODE,")@("
+	getuser.l (%a1),%a1,fp_err_ua1,%a1
+debug	jra	"2f"
+1:	printf	PDECODE,","
+2:
+.endm
+
+| all other indirect addressing modes will finally end up here
+.macro	fp_mode_addr_indirect_extmode0
+	.if	!do_no_pc_mode
+	fp_test_basereg_ext 1f
+	printf	PDECODE,"pc"
+	fp_get_pc %a0
+	jra	2f
+	.endif
+1:	fp_decode_addr_reg
+	printf	PDECODE,"a%d",1,%d0
+	jsr	fp_get_addr_reg
+2:	move.l	%a0,%a1
+	swap	%d2
+	fp_get_test_extword 3f
+	| addressing mode: address register/program counter indirect
+	|		   with index and 8bit displacement
+	fp_decode_disp8
+debug	ext.l	"%d0"
+	printf	PDECODE,"@(%x,",1,%d0
+	add.w	%d0,%a1
+	fp_decode_index
+	add.l	%d0,%a1
+	printf	PDECODE,")"
+	jra	9f
+3:	| addressing mode: address register/program counter memory indirect
+	|		   with base and/or outer displacement
+	btst	#7,%d2			| base register suppressed?
+	jeq	1f
+	printf	PDECODE,"!"
+	sub.l	%a1,%a1
+1:	printf	PDECODE,"@("
+	fp_decode_basedisp
+
+	.long	fp_ill,1f
+	.long	2f,3f
+
+#ifdef FPU_EMU_DEBUG
+1:	printf	PDECODE,"0"		| null base displacement
+	jra	1f
+#endif
+2:	fp_get_instr_word %a0,fp_err_ua1 | 16bit base displacement
+	printf	PDECODE,"%x:w",1,%a0
+	jra	4f
+3:	fp_get_instr_long %a0,fp_err_ua1 | 32bit base displacement
+	printf	PDECODE,"%x:l",1,%a0
+4:	add.l	%a0,%a1
+1:
+	fp_do_postindex
+	fp_test_suppr_index 1f
+	fp_decode_index
+	add.l	%d0,%a1
+1:	fp_do_preindex
+
+	fp_decode_outerdisp
+
+	.long	5f,1f
+	.long	2f,3f
+
+#ifdef FPU_EMU_DEBUG
+1:	printf	PDECODE,"0"		| null outer displacement
+	jra	1f
+#endif
+2:	fp_get_instr_word %a0,fp_err_ua1 | 16bit outer displacement
+	printf	PDECODE,"%x:w",1,%a0
+	jra	4f
+3:	fp_get_instr_long %a0,fp_err_ua1 | 32bit outer displacement
+	printf	PDECODE,"%x:l",1,%a0
+4:	add.l	%a0,%a1
+1:
+5:	printf	PDECODE,")"
+9:	move.l	%a1,%a0
+	swap	%d2
+.endm
+
+| get the absolute short address from user space
+.macro	fp_mode_abs_short
+	fp_get_instr_word %a0,fp_err_ua1
+	printf	PDECODE,"%x.w",1,%a0
+.endm
+
+| get the absolute long address from user space
+.macro	fp_mode_abs_long
+	fp_get_instr_long %a0,fp_err_ua1
+	printf	PDECODE,"%x.l",1,%a0
+.endm
+
+#endif /* _FP_DECODE_H */
diff --git a/arch/m68k/math-emu/fp_emu.h b/arch/m68k/math-emu/fp_emu.h
new file mode 100644
index 0000000..1d6edc9
--- /dev/null
+++ b/arch/m68k/math-emu/fp_emu.h
@@ -0,0 +1,146 @@
+/*
+ * fp_emu.h
+ *
+ * Copyright Roman Zippel, 1997.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * ALTERNATIVELY, this product may be distributed under the terms of
+ * the GNU General Public License, in which case the provisions of the GPL are
+ * required INSTEAD OF the above restrictions.  (This clause is
+ * necessary due to a potential bad interaction between the GPL and
+ * the restrictions contained in a BSD-style copyright.)
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _FP_EMU_H
+#define _FP_EMU_H
+
+#ifdef __ASSEMBLY__
+#include <asm/offsets.h>
+#endif
+#include <asm/math-emu.h>
+
+#ifndef __ASSEMBLY__
+
+#define IS_INF(a) ((a)->exp == 0x7fff)
+#define IS_ZERO(a) ((a)->mant.m64 == 0)
+
+
+#define fp_set_sr(bit) ({					\
+	FPDATA->fpsr |= 1 << (bit);				\
+})
+
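+/* the quotient byte (written by FMOD/FREM) lives in bits 16-23 of the FPSR */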
+#define fp_set_quotient(quotient) ({				\
+	FPDATA->fpsr &= 0xff00ffff;				\
+	FPDATA->fpsr |= ((quotient) & 0xff) << 16;		\
+})
+
+/* linkage for several useful functions */
+
+/* Normalize the extended struct, return 0 for a NaN */
+#define fp_normalize_ext(fpreg) ({				\
+	register struct fp_ext *reg asm ("a0") = fpreg;		\
+	register int res asm ("d0");				\
+								\
+	asm volatile ("jsr fp_conv_ext2ext"			\
+			: "=d" (res) : "a" (reg)		\
+			: "a1", "d1", "d2", "memory");		\
+	res;							\
+})
+
+#define fp_copy_ext(dest, src) ({				\
+	*dest = *src;						\
+})
+
+#define fp_monadic_check(dest, src) ({				\
+	fp_copy_ext(dest, src);					\
+	if (!fp_normalize_ext(dest))				\
+		return dest;					\
+})
+
+#define fp_dyadic_check(dest, src) ({				\
+	if (!fp_normalize_ext(dest))				\
+		return dest;					\
+	if (!fp_normalize_ext(src)) {				\
+		fp_copy_ext(dest, src);				\
+		return dest;					\
+	}							\
+})
+
+extern const struct fp_ext fp_QNaN;
+extern const struct fp_ext fp_Inf;
+
+#define fp_set_nan(dest) ({					\
+	fp_set_sr(FPSR_EXC_OPERR);				\
+	*dest = fp_QNaN;					\
+})
+
+/* TODO check rounding mode? */
+#define fp_set_ovrflw(dest) ({					\
+	fp_set_sr(FPSR_EXC_OVFL);				\
+	dest->exp = 0x7fff;					\
+	dest->mant.m64 = 0;					\
+})
+
+#define fp_conv_ext2long(src) ({				\
+	register struct fp_ext *__src asm ("a0") = src;		\
+	register int __res asm ("d0");				\
+								\
+	asm volatile ("jsr fp_conv_ext2long"			\
+			: "=d" (__res) : "a" (__src)		\
+			: "a1", "d1", "d2", "memory");		\
+	__res;							\
+})
+
+#define fp_conv_long2ext(dest, src) ({				\
+	register struct fp_ext *__dest asm ("a0") = dest;	\
+	register int __src asm ("d0") = src;			\
+								\
+	asm volatile ("jsr fp_conv_ext2long"			\
+			: : "d" (__src), "a" (__dest)		\
+			: "a1", "d1", "d2", "memory");		\
+})
+
+#else /* __ASSEMBLY__ */
+
+/*
+ * set, reset or clear a bit in the fp status register
+ */
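+/* the byte offset 3 - bit/8 selects the byte holding the bit within the
+   big-endian FPSR longword, bit & 7 the bit inside that byte */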
+.macro	fp_set_sr	bit
+	bset	#(\bit&7),(FPD_FPSR+3-(\bit/8),FPDATA)
+.endm
+
+.macro	fp_clr_sr	bit
+	bclr	#(\bit&7),(FPD_FPSR+3-(\bit/8),FPDATA)
+.endm
+
+.macro	fp_tst_sr	bit
+	btst	#(\bit&7),(FPD_FPSR+3-(\bit/8),FPDATA)
+.endm
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _FP_EMU_H */
diff --git a/arch/m68k/math-emu/fp_entry.S b/arch/m68k/math-emu/fp_entry.S
new file mode 100644
index 0000000..5ec2d91
--- /dev/null
+++ b/arch/m68k/math-emu/fp_entry.S
@@ -0,0 +1,325 @@
+/*
+ * fp_entry.S
+ *
+ * Copyright Roman Zippel, 1997.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * ALTERNATIVELY, this product may be distributed under the terms of
+ * the GNU General Public License, in which case the provisions of the GPL are
+ * required INSTEAD OF the above restrictions.  (This clause is
+ * necessary due to a potential bad interaction between the GPL and
+ * the restrictions contained in a BSD-style copyright.)
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/config.h>
+#include <linux/linkage.h>
+#include <asm/entry.h>
+
+#include "fp_emu.h"
+
+	.globl	fpu_emu
+	.globl	fp_debugprint
+	.globl	fp_err_ua1,fp_err_ua2
+
+	.text
+fpu_emu:
+	SAVE_ALL_INT
+	GET_CURRENT(%d0)
+
+#if defined(CPU_M68020_OR_M68030) && defined(CPU_M68040_OR_M68060)
+	tst.l	m68k_is040or060
+	jeq	1f
+#endif
+#if defined(CPU_M68040_OR_M68060)
+	move.l	(FPS_PC2,%sp),(FPS_PC,%sp)
+#endif
+1:
+	| emulate the instruction
+	jsr	fp_scan
+
+#if defined(CONFIG_M68060)
+#if !defined(CPU_M68060_ONLY)
+	btst	#3,m68k_cputype+3
+	jeq	1f
+#endif
+	btst	#7,(FPS_SR,%sp)
+	jne	fp_sendtrace060
+#endif
+1:
+	| emulation successful?
+	tst.l	%d0
+	jeq	ret_from_exception
+
+	| send some signal to program here
+
+	jra	ret_from_exception
+
+	| we jump here after an access error while trying to access
+	| user space, we correct stackpointer and send a SIGSEGV to
+	| the user process
+fp_err_ua2:
+	addq.l	#4,%sp
+fp_err_ua1:
+	addq.l	#4,%sp
+	move.l	%a0,-(%sp)
+	pea	SEGV_MAPERR
+	pea	SIGSEGV
+	jsr	fpemu_signal
+	add.w	#12,%sp
+	jra	ret_from_exception
+
+#if defined(CONFIG_M68060)
+	| send a trace signal if we are debugged
+	| it does not really belong here, but...
+fp_sendtrace060:
+	move.l	(FPS_PC,%sp),-(%sp)
+	pea	TRAP_TRACE
+	pea	SIGTRAP
+	jsr	fpemu_signal
+	add.w	#12,%sp
+	jra	ret_from_exception
+#endif
+
+	.globl	fp_get_data_reg, fp_put_data_reg
+	.globl	fp_get_addr_reg, fp_put_addr_reg
+
+	| Entry points to get/put a register. Some registers can be
+	| accessed directly, others live on the stack. As we read/write
+	| the stack directly here, these functions may only be called
+	| from within instruction decoding, otherwise the stack pointer
+	| is incorrect and the stack gets corrupted.
+fp_get_data_reg:
+	jmp	([0f:w,%pc,%d0.w*4])
+
+	.align	4
+0:
+	.long	fp_get_d0, fp_get_d1
+	.long	fp_get_d2, fp_get_d3
+	.long	fp_get_d4, fp_get_d5
+	.long	fp_get_d6, fp_get_d7
+
+fp_get_d0:
+	move.l	(PT_D0+8,%sp),%d0
+	printf	PREGISTER,"{d0->%08x}",1,%d0
+	rts
+
+fp_get_d1:
+	move.l	(PT_D1+8,%sp),%d0
+	printf	PREGISTER,"{d1->%08x}",1,%d0
+	rts
+
+fp_get_d2:
+	move.l	(PT_D2+8,%sp),%d0
+	printf	PREGISTER,"{d2->%08x}",1,%d0
+	rts
+
+fp_get_d3:
+	move.l	%d3,%d0
+	printf	PREGISTER,"{d3->%08x}",1,%d0
+	rts
+
+fp_get_d4:
+	move.l	%d4,%d0
+	printf	PREGISTER,"{d4->%08x}",1,%d0
+	rts
+
+fp_get_d5:
+	move.l	%d5,%d0
+	printf	PREGISTER,"{d5->%08x}",1,%d0
+	rts
+
+fp_get_d6:
+	move.l	%d6,%d0
+	printf	PREGISTER,"{d6->%08x}",1,%d0
+	rts
+
+fp_get_d7:
+	move.l	%d7,%d0
+	printf	PREGISTER,"{d7->%08x}",1,%d0
+	rts
+
+fp_put_data_reg:
+	jmp	([0f:w,%pc,%d1.w*4])
+
+	.align	4
+0:
+	.long	fp_put_d0, fp_put_d1
+	.long	fp_put_d2, fp_put_d3
+	.long	fp_put_d4, fp_put_d5
+	.long	fp_put_d6, fp_put_d7
+
+fp_put_d0:
+	printf	PREGISTER,"{d0<-%08x}",1,%d0
+	move.l	%d0,(PT_D0+8,%sp)
+	rts
+
+fp_put_d1:
+	printf	PREGISTER,"{d1<-%08x}",1,%d0
+	move.l	%d0,(PT_D1+8,%sp)
+	rts
+
+fp_put_d2:
+	printf	PREGISTER,"{d2<-%08x}",1,%d0
+	move.l	%d0,(PT_D2+8,%sp)
+	rts
+
+fp_put_d3:
+	printf	PREGISTER,"{d3<-%08x}",1,%d0
+|	move.l	%d0,%d3
+	move.l	%d0,(PT_D3+8,%sp)
+	rts
+
+fp_put_d4:
+	printf	PREGISTER,"{d4<-%08x}",1,%d0
+|	move.l	%d0,%d4
+	move.l	%d0,(PT_D4+8,%sp)
+	rts
+
+fp_put_d5:
+	printf	PREGISTER,"{d5<-%08x}",1,%d0
+|	move.l	%d0,%d5
+	move.l	%d0,(PT_D5+8,%sp)
+	rts
+
+fp_put_d6:
+	printf	PREGISTER,"{d6<-%08x}",1,%d0
+	move.l	%d0,%d6
+	rts
+
+fp_put_d7:
+	printf	PREGISTER,"{d7<-%08x}",1,%d0
+	move.l	%d0,%d7
+	rts
+
+fp_get_addr_reg:
+	jmp	([0f:w,%pc,%d0.w*4])
+
+	.align	4
+0:
+	.long	fp_get_a0, fp_get_a1
+	.long	fp_get_a2, fp_get_a3
+	.long	fp_get_a4, fp_get_a5
+	.long	fp_get_a6, fp_get_a7
+
+fp_get_a0:
+	move.l	(PT_A0+8,%sp),%a0
+	printf	PREGISTER,"{a0->%08x}",1,%a0
+	rts
+
+fp_get_a1:
+	move.l	(PT_A1+8,%sp),%a0
+	printf	PREGISTER,"{a1->%08x}",1,%a0
+	rts
+
+fp_get_a2:
+	move.l	(PT_A2+8,%sp),%a0
+	printf	PREGISTER,"{a2->%08x}",1,%a0
+	rts
+
+fp_get_a3:
+	move.l	%a3,%a0
+	printf	PREGISTER,"{a3->%08x}",1,%a0
+	rts
+
+fp_get_a4:
+	move.l	%a4,%a0
+	printf	PREGISTER,"{a4->%08x}",1,%a0
+	rts
+
+fp_get_a5:
+	move.l	%a5,%a0
+	printf	PREGISTER,"{a5->%08x}",1,%a0
+	rts
+
+fp_get_a6:
+	move.l	%a6,%a0
+	printf	PREGISTER,"{a6->%08x}",1,%a0
+	rts
+
+fp_get_a7:
+	move.l	%usp,%a0
+	printf	PREGISTER,"{a7->%08x}",1,%a0
+	rts
+
+fp_put_addr_reg:
+	jmp	([0f:w,%pc,%d0.w*4])
+
+	.align	4
+0:
+	.long	fp_put_a0, fp_put_a1
+	.long	fp_put_a2, fp_put_a3
+	.long	fp_put_a4, fp_put_a5
+	.long	fp_put_a6, fp_put_a7
+
+fp_put_a0:
+	printf	PREGISTER,"{a0<-%08x}",1,%a0
+	move.l	%a0,(PT_A0+8,%sp)
+	rts
+
+fp_put_a1:
+	printf	PREGISTER,"{a1<-%08x}",1,%a0
+	move.l	%a0,(PT_A1+8,%sp)
+	rts
+
+fp_put_a2:
+	printf	PREGISTER,"{a2<-%08x}",1,%a0
+	move.l	%a0,(PT_A2+8,%sp)
+	rts
+
+fp_put_a3:
+	printf	PREGISTER,"{a3<-%08x}",1,%a0
+	move.l	%a0,%a3
+	rts
+
+fp_put_a4:
+	printf	PREGISTER,"{a4<-%08x}",1,%a0
+	move.l	%a0,%a4
+	rts
+
+fp_put_a5:
+	printf	PREGISTER,"{a5<-%08x}",1,%a0
+	move.l	%a0,%a5
+	rts
+
+fp_put_a6:
+	printf	PREGISTER,"{a6<-%08x}",1,%a0
+	move.l	%a0,%a6
+	rts
+
+fp_put_a7:
+	printf	PREGISTER,"{a7<-%08x}",1,%a0
+	move.l	%a0,%usp
+	rts
+
+	.data
+	.align	4
+
+fp_debugprint:
+|	.long	PMDECODE
+	.long	PMINSTR+PMDECODE+PMCONV+PMNORM
+|	.long	PMCONV+PMNORM+PMINSTR
+|	.long	0
diff --git a/arch/m68k/math-emu/fp_log.c b/arch/m68k/math-emu/fp_log.c
new file mode 100644
index 0000000..87b4f01
--- /dev/null
+++ b/arch/m68k/math-emu/fp_log.c
@@ -0,0 +1,223 @@
+/*
+
+  fp_log.c: floating-point math routines for the Linux-m68k
+  floating point emulator.
+
+  Copyright (c) 1998-1999 David Huggins-Daines / Roman Zippel.
+
+  I hereby give permission, free of charge, to copy, modify, and
+  redistribute this software, in source or binary form, provided that
+  the above copyright notice and the following disclaimer are included
+  in all such copies.
+
+  THIS SOFTWARE IS PROVIDED "AS IS", WITH ABSOLUTELY NO WARRANTY, REAL
+  OR IMPLIED.
+
+*/
+
+#include "fp_emu.h"
+
+static const struct fp_ext fp_one =
+{
+	.exp = 0x3fff,
+};
+
+extern struct fp_ext *fp_fadd(struct fp_ext *dest, const struct fp_ext *src);
+extern struct fp_ext *fp_fdiv(struct fp_ext *dest, const struct fp_ext *src);
+extern struct fp_ext *fp_fmul(struct fp_ext *dest, const struct fp_ext *src);
+
+struct fp_ext *
+fp_fsqrt(struct fp_ext *dest, struct fp_ext *src)
+{
+	struct fp_ext tmp, src2;
+	int i, exp;
+
+	dprint(PINSTR, "fsqrt\n");
+
+	fp_monadic_check(dest, src);
+
+	if (IS_ZERO(dest))
+		return dest;
+
+	if (dest->sign) {
+		fp_set_nan(dest);
+		return dest;
+	}
+	if (IS_INF(dest))
+		return dest;
+
+	/*
+	 *		 sqrt(m) * 2^(p)	, if e = 2*p
+	 * sqrt(m*2^e) =
+	 *		 sqrt(2*m) * 2^(p)	, if e = 2*p + 1
+	 *
+	 * So we use the last bit of the exponent to decide whether to
+	 * use m or 2*m.
+	 *
+	 * Since only the fractional part of the mantissa is stored and
+	 * the integer part is assumed to be one, we place a 1 or 2 into
+	 * the fixed point representation.
+	 */
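+	/* e.g. 12.0 = 1.5 * 2^3: e = 2*1 + 1 is odd, so compute
+	   sqrt(2 * 1.5) * 2^1 = sqrt(3) * 2 ~= 3.4641 = sqrt(12) */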
+	exp = dest->exp;
+	dest->exp = 0x3FFF;
+	if (!(exp & 1))		/* the unbiased exponent is odd: use 2*m */
+		dest->exp++;
+	fp_copy_ext(&src2, dest);
+
+	/*
+	 * The Taylor series around a for sqrt(x) is:
+	 *	sqrt(x) = sqrt(a) + 1/(2*sqrt(a))*(x-a) + R
+	 * With a=1 this gives:
+	 *	sqrt(x) = 1 + 1/2*(x-1)
+	 *		= 1/2*(1+x)
+	 */
+	fp_fadd(dest, &fp_one);
+	dest->exp--;		/* * 1/2 */
+
+	/*
+	 * We now apply Newton's method to the function
+	 *	f(x) := x^2 - r
+	 * which has a zero at x = sqrt(r).
+	 *
+	 * It gives:
+	 *	x' := x - f(x)/f'(x)
+	 *	    = x - (x^2 -r)/(2*x)
+	 *	    = x - (x - r/x)/2
+	 *          = (2*x - x + r/x)/2
+	 *	    = (x + r/x)/2
+	 */
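+	/* Newton-Raphson roughly doubles the number of correct bits per
+	   pass, so 9 iterations comfortably cover the 64-bit mantissa
+	   even from this coarse first guess */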
+	for (i = 0; i < 9; i++) {
+		fp_copy_ext(&tmp, &src2);
+
+		fp_fdiv(&tmp, dest);
+		fp_fadd(dest, &tmp);
+		dest->exp--;
+	}
+
+	dest->exp += (exp - 0x3FFF) / 2;
+
+	return dest;
+}
+
+struct fp_ext *
+fp_fetoxm1(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("fetoxm1\n");
+
+	fp_monadic_check(dest, src);
+
+	if (IS_ZERO(dest))
+		return dest;
+
+	return dest;
+}
+
+struct fp_ext *
+fp_fetox(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("fetox\n");
+
+	fp_monadic_check(dest, src);
+
+	return dest;
+}
+
+struct fp_ext *
+fp_ftwotox(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("ftwotox\n");
+
+	fp_monadic_check(dest, src);
+
+	return dest;
+}
+
+struct fp_ext *
+fp_ftentox(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("ftentox\n");
+
+	fp_monadic_check(dest, src);
+
+	return dest;
+}
+
+struct fp_ext *
+fp_flogn(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("flogn\n");
+
+	fp_monadic_check(dest, src);
+
+	return dest;
+}
+
+struct fp_ext *
+fp_flognp1(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("flognp1\n");
+
+	fp_monadic_check(dest, src);
+
+	return dest;
+}
+
+struct fp_ext *
+fp_flog10(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("flog10\n");
+
+	fp_monadic_check(dest, src);
+
+	return dest;
+}
+
+struct fp_ext *
+fp_flog2(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("flog2\n");
+
+	fp_monadic_check(dest, src);
+
+	return dest;
+}
+
+struct fp_ext *
+fp_fgetexp(struct fp_ext *dest, struct fp_ext *src)
+{
+	dprint(PINSTR, "fgetexp\n");
+
+	fp_monadic_check(dest, src);
+
+	if (IS_INF(dest)) {
+		fp_set_nan(dest);
+		return dest;
+	}
+	if (IS_ZERO(dest))
+		return dest;
+
+	fp_conv_long2ext(dest, (int)dest->exp - 0x3FFF);
+
+	fp_normalize_ext(dest);
+
+	return dest;
+}
+
+struct fp_ext *
+fp_fgetman(struct fp_ext *dest, struct fp_ext *src)
+{
+	dprint(PINSTR, "fgetman\n");
+
+	fp_monadic_check(dest, src);
+
+	if (IS_ZERO(dest))
+		return dest;
+
+	if (IS_INF(dest))
+		return dest;
+
+	dest->exp = 0x3FFF;
+
+	return dest;
+}
+
diff --git a/arch/m68k/math-emu/fp_move.S b/arch/m68k/math-emu/fp_move.S
new file mode 100644
index 0000000..71bdf83
--- /dev/null
+++ b/arch/m68k/math-emu/fp_move.S
@@ -0,0 +1,244 @@
+/*
+ * fp_move.S
+ *
+ * Copyright Roman Zippel, 1997.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * ALTERNATIVELY, this product may be distributed under the terms of
+ * the GNU General Public License, in which case the provisions of the GPL are
+ * required INSTEAD OF the above restrictions.  (This clause is
+ * necessary due to a potential bad interaction between the GPL and
+ * the restrictions contained in a BSD-style copyright.)
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fp_emu.h"
+#include "fp_decode.h"
+
+do_no_pc_mode=1
+
+	.globl	fp_fmove_fp2mem
+
+fp_fmove_fp2mem:
+	clr.b	(2+FPD_FPSR,FPDATA)
+	fp_decode_dest_format
+	move.w	%d0,%d1			| store data size twice in %d1
+	swap	%d1			| one can be trashed below
+	move.w	%d0,%d1
+#ifdef FPU_EMU_DEBUG
+	lea	0f,%a0
+	clr.l	%d0
+	move.b	(%a0,%d1.w),%d0
+	printf	PDECODE,"fmove.%c ",1,%d0
+	fp_decode_src_reg
+	printf	PDECODE,"fp%d,",1,%d0
+
+	.data
+0:	.byte	'l','s','x','p','w','d','b','p'
+	.previous
+#endif
+
+	| encode addressing mode for dest
+	fp_decode_addr_mode
+
+	.long	fp_data, fp_ill
+	.long	fp_indirect, fp_postinc
+	.long	fp_predecr, fp_disp16
+	.long	fp_extmode0, fp_extmode1
+
+	| addressing mode: data register direct
+fp_data:
+	fp_mode_data_direct
+	move.w	%d0,%d1
+	fp_decode_src_reg
+	fp_get_fp_reg
+	lea	(FPD_TEMPFP1,FPDATA),%a1
+	move.l	(%a0)+,(%a1)+
+	move.l	(%a0)+,(%a1)+
+	move.l	(%a0),(%a1)
+	lea	(-8,%a1),%a0
+	swap	%d1
+	move.l	%d1,%d2
+	printf	PDECODE,"\n"
+	jmp	([0f:w,%pc,%d1.w*4])
+
+	.align	4
+0:
+	.long	fp_data_long, fp_data_single
+	.long	fp_ill, fp_ill
+	.long	fp_data_word, fp_ill
+	.long	fp_data_byte, fp_ill
+
+fp_data_byte:
+	jsr	fp_normalize_ext
+	jsr	fp_conv_ext2byte
+	move.l	%d0,%d1
+	swap	%d2
+	move.w	%d2,%d0
+	jsr	fp_get_data_reg
+	move.b	%d1,%d0
+	move.w	%d2,%d1
+	jsr	fp_put_data_reg
+	jra	fp_final
+
+fp_data_word:
+	jsr	fp_normalize_ext
+	jsr	fp_conv_ext2short
+	move.l	%d0,%d1
+	swap	%d2
+	move.w	%d2,%d0
+	jsr	fp_get_data_reg
+	move.w	%d1,%d0
+	move.l	%d2,%d1
+	jsr	fp_put_data_reg
+	jra	fp_final
+
+fp_data_long:
+	jsr	fp_normalize_ext
+	jsr	fp_conv_ext2long
+	swap	%d2
+	move.w	%d2,%d1
+	jsr	fp_put_data_reg
+	jra	fp_final
+
+fp_data_single:
+	jsr	fp_normalize_ext
+	jsr	fp_conv_ext2single
+	swap	%d2
+	move.w	%d2,%d1
+	jsr	fp_put_data_reg
+	jra	fp_final
+
+	| addressing mode: address register indirect
+fp_indirect:
+	fp_mode_addr_indirect
+	jra	fp_putdest
+
+	| addressing mode: address register indirect with postincrement
+fp_postinc:
+	fp_mode_addr_indirect_postinc
+	jra	fp_putdest
+
+	| addressing mode: address register indirect with predecrement
+fp_predecr:
+	fp_mode_addr_indirect_predec
+	jra	fp_putdest
+
+	| addressing mode: address register indirect with 16bit displacement
+fp_disp16:
+	fp_mode_addr_indirect_disp16
+	jra     fp_putdest
+
+fp_extmode0:
+	fp_mode_addr_indirect_extmode0
+	jra	fp_putdest
+
+fp_extmode1:
+	fp_decode_addr_reg
+	jmp	([0f:w,%pc,%d0*4])
+
+	.align	4
+0:
+	.long	fp_abs_short, fp_abs_long
+	.long	fp_ill, fp_ill
+	.long	fp_ill, fp_ill
+	.long	fp_ill, fp_ill
+
+fp_abs_short:
+	fp_mode_abs_short
+	jra	fp_putdest
+
+fp_abs_long:
+	fp_mode_abs_long
+	jra	fp_putdest
+
+fp_putdest:
+	move.l	%a0,%a1
+	fp_decode_src_reg
+	move.l	%d1,%d2			| save size
+	fp_get_fp_reg
+	printf	PDECODE,"\n"
+	addq.l	#8,%a0
+	move.l	(%a0),-(%sp)
+	move.l	-(%a0),-(%sp)
+	move.l	-(%a0),-(%sp)
+	move.l	%sp,%a0
+	jsr	fp_normalize_ext
+
+	swap	%d2
+	jmp	([0f:w,%pc,%d2.w*4])
+
+	.align	4
+0:
+	.long	fp_format_long, fp_format_single
+	.long	fp_format_extended, fp_format_packed
+	.long	fp_format_word, fp_format_double
+	.long	fp_format_byte, fp_format_packed
+
+fp_format_long:
+	jsr	fp_conv_ext2long
+	putuser.l %d0,(%a1),fp_err_ua1,%a1
+	jra	fp_finish_move
+
+fp_format_single:
+	jsr	fp_conv_ext2single
+	putuser.l %d0,(%a1),fp_err_ua1,%a1
+	jra	fp_finish_move
+
+fp_format_extended:
+	move.l	(%a0)+,%d0
+	lsl.w	#1,%d0
+	lsl.l	#7,%d0
+	lsl.l	#8,%d0
+	putuser.l %d0,(%a1)+,fp_err_ua1,%a1
+	move.l	(%a0)+,%d0
+	putuser.l %d0,(%a1)+,fp_err_ua1,%a1
+	move.l	(%a0),%d0
+	putuser.l %d0,(%a1),fp_err_ua1,%a1
+	jra	fp_finish_move
+
+fp_format_packed:
+	/* not supported yet */
+	lea	(12,%sp),%sp
+	jra	fp_ill
+
+fp_format_word:
+	jsr	fp_conv_ext2short
+	putuser.w %d0,(%a1),fp_err_ua1,%a1
+	jra	fp_finish_move
+
+fp_format_double:
+	jsr	fp_conv_ext2double
+	jra	fp_finish_move
+
+fp_format_byte:
+	jsr	fp_conv_ext2byte
+	putuser.b %d0,(%a1),fp_err_ua1,%a1
+|	jra	fp_finish_move
+
+fp_finish_move:
+	lea	(12,%sp),%sp
+	jra	fp_final
diff --git a/arch/m68k/math-emu/fp_movem.S b/arch/m68k/math-emu/fp_movem.S
new file mode 100644
index 0000000..8354d39
--- /dev/null
+++ b/arch/m68k/math-emu/fp_movem.S
@@ -0,0 +1,368 @@
+/*
+ * fp_movem.S
+ *
+ * Copyright Roman Zippel, 1997.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * ALTERNATIVELY, this product may be distributed under the terms of
+ * the GNU General Public License, in which case the provisions of the GPL are
+ * required INSTEAD OF the above restrictions.  (This clause is
+ * necessary due to a potential bad interaction between the GPL and
+ * the restrictions contained in a BSD-style copyright.)
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fp_emu.h"
+#include "fp_decode.h"
+
+| set flags for decode macros for fmovem
+do_fmovem=1
+
+	.globl	fp_fmovem_fp, fp_fmovem_cr
+
+| %d1 contains the mask and count of the register list
+| for other register usage see fp_decode.h
+
+fp_fmovem_fp:
+	printf	PDECODE,"fmovem.x "
+	| get register list and count them
+	btst	#11,%d2
+	jne	1f
+	bfextu	%d2{#24,#8},%d0		| static register list
+	jra	2f
+1:	bfextu	%d2{#25,#3},%d0		| dynamic register list
+	jsr	fp_get_data_reg
+2:	move.l	%d0,%d1
+	swap	%d1
+	jra	2f
+1:	addq.w	#1,%d1			| count the # of registers in
+2:	lsr.b	#1,%d0			| register list and keep it in %d1
+	jcs	1b
+	jne	2b
+	printf	PDECODE,"#%08x",1,%d1
+#ifdef FPU_EMU_DEBUG
+	btst	#12,%d2
+	jne	1f
+	printf	PDECODE,"-"		| decremental move
+	jra	2f
+1:	printf	PDECODE,"+"		| incremental move
+2:	btst	#13,%d2
+	jeq	1f
+	printf	PDECODE,"->"		| fpu -> cpu
+	jra	2f
+1:	printf	PDECODE,"<-"		| fpu <- cpu
+2:
+#endif
+
+	| decode address mode
+	fp_decode_addr_mode
+
+	.long	fp_ill, fp_ill
+	.long	fpr_indirect, fpr_postinc
+	.long	fpr_predecr, fpr_disp16
+	.long	fpr_extmode0, fpr_extmode1
+
+	| addressing mode: address register indirect
+fpr_indirect:
+	fp_mode_addr_indirect
+	jra	fpr_do_movem
+
+	| addressing mode: address register indirect with postincrement
+fpr_postinc:
+	fp_mode_addr_indirect_postinc
+	jra	fpr_do_movem
+
+fpr_predecr:
+	fp_mode_addr_indirect_predec
+	jra	fpr_do_movem
+
+	| addressing mode: address register/program counter indirect
+	|		   with 16bit displacement
+fpr_disp16:
+	fp_mode_addr_indirect_disp16
+	jra	fpr_do_movem
+
+fpr_extmode0:
+	fp_mode_addr_indirect_extmode0
+	jra	fpr_do_movem
+
+fpr_extmode1:
+	fp_decode_addr_reg
+	jmp	([0f:w,%pc,%d0*4])
+
+	.align	4
+0:
+	.long	fpr_absolute_short, fpr_absolute_long
+	.long	fpr_disp16, fpr_extmode0
+	.long	fp_ill, fp_ill
+	.long	fp_ill, fp_ill
+
+fpr_absolute_short:
+	fp_mode_abs_short
+	jra	fpr_do_movem
+
+fpr_absolute_long:
+	fp_mode_abs_long
+|	jra	fpr_do_movem
+
+fpr_do_movem:
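+	| each fp register occupies 12 bytes, both in memory and in
+	| FPD_FPREG; %d0 becomes the stride: +12 for an incremental
+	| transfer, -12 (starting at fp7) for a decremental one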
+	swap	%d1			| get fpu register list
+	lea	(FPD_FPREG,FPDATA),%a1
+	moveq	#12,%d0
+	btst	#12,%d2
+	jne	1f
+	lea	(-12,%a1,%d0*8),%a1
+	neg.l	%d0
+1:	btst	#13,%d2
+	jne	4f
+	| move register from memory into fpu
+	jra	3f
+1:	printf	PMOVEM,"(%p>%p)",2,%a0,%a1
+	getuser.l (%a0)+,%d2,fp_err_ua1,%a0
+	lsr.l	#8,%d2
+	lsr.l	#7,%d2
+	lsr.w	#1,%d2
+	move.l	%d2,(%a1)+
+	getuser.l (%a0)+,%d2,fp_err_ua1,%a0
+	move.l	%d2,(%a1)+
+	getuser.l (%a0),%d2,fp_err_ua1,%a0
+	move.l	%d2,(%a1)
+	subq.l	#8,%a0
+	subq.l	#8,%a1
+	add.l	%d0,%a0
+2:	add.l	%d0,%a1
+3:	lsl.b	#1,%d1
+	jcs	1b
+	jne	2b
+	jra	5f
+	| move register from fpu into memory
+1:	printf	PMOVEM,"(%p>%p)",2,%a1,%a0
+	move.l	(%a1)+,%d2
+	lsl.w	#1,%d2
+	lsl.l	#7,%d2
+	lsl.l	#8,%d2
+	putuser.l %d2,(%a0)+,fp_err_ua1,%a0
+	move.l	(%a1)+,%d2
+	putuser.l %d2,(%a0)+,fp_err_ua1,%a0
+	move.l	(%a1),%d2
+	putuser.l %d2,(%a0),fp_err_ua1,%a0
+	subq.l	#8,%a1
+	subq.l	#8,%a0
+	add.l	%d0,%a0
+2:	add.l	%d0,%a1
+4:	lsl.b	#1,%d1
+	jcs	1b
+	jne	2b
+5:
+	printf	PDECODE,"\n"
+#if 0
+	lea	(FPD_FPREG,FPDATA),%a0
+	printf	PMOVEM,"fp:"
+	printx	PMOVEM,%a0@(0)
+	printx	PMOVEM,%a0@(12)
+	printf	PMOVEM,"\n   "
+	printx	PMOVEM,%a0@(24)
+	printx	PMOVEM,%a0@(36)
+	printf	PMOVEM,"\n   "
+	printx	PMOVEM,%a0@(48)
+	printx	PMOVEM,%a0@(60)
+	printf	PMOVEM,"\n   "
+	printx	PMOVEM,%a0@(72)
+	printx	PMOVEM,%a0@(84)
+	printf	PMOVEM,"\n"
+#endif
+	jra	fp_end
+
+| set flags for decode macros for fmovem control register
+do_fmovem=1
+do_fmovem_cr=1
+
+fp_fmovem_cr:
+	printf	PDECODE,"fmovem.cr "
+	| get register list and count them
+	bfextu	%d2{#19,#3},%d0
+	move.l	%d0,%d1
+	swap	%d1
+	jra	2f
+1:	addq.w	#1,%d1
+2:	lsr.l	#1,%d0
+	jcs	1b
+	jne	2b
+	printf	PDECODE,"#%08x",1,%d1
+#ifdef FPU_EMU_DEBUG
+	btst	#13,%d2
+	jeq	1f
+	printf	PDECODE,"->"		| fpu -> cpu
+	jra	2f
+1:	printf	PDECODE,"<-"		| fpu <- cpu
+2:
+#endif
+
+	| decode address mode
+	fp_decode_addr_mode
+
+	.long	fpc_data, fpc_addr
+	.long	fpc_indirect, fpc_postinc
+	.long	fpc_predecr, fpc_disp16
+	.long	fpc_extmode0, fpc_extmode1
+
+fpc_data:
+	fp_mode_data_direct
+	move.w	%d0,%d1
+	bfffo	%d2{#19,#3},%d0
+	sub.w	#19,%d0
+	lea	(FPD_FPCR,FPDATA,%d0.w*4),%a1
+	btst	#13,%d2
+	jne	1f
+	move.w	%d1,%d0
+	jsr	fp_get_data_reg
+	move.l	%d0,(%a1)
+	jra	fpc_movem_fin
+1:	move.l	(%a1),%d0
+	jsr	fp_put_data_reg
+	jra	fpc_movem_fin
+
+fpc_addr:
+	fp_decode_addr_reg
+	printf	PDECODE,"a%d",1,%d0
+	btst	#13,%d2
+	jne	1f
+	jsr	fp_get_addr_reg
+	move.l	%a0,(FPD_FPIAR,FPDATA)
+	jra	fpc_movem_fin
+1:	move.l	(FPD_FPIAR,FPDATA),%a0
+	jsr	fp_put_addr_reg
+	jra	fpc_movem_fin
+
+fpc_indirect:
+	fp_mode_addr_indirect
+	jra	fpc_do_movem
+
+fpc_postinc:
+	fp_mode_addr_indirect_postinc
+	jra	fpc_do_movem
+
+fpc_predecr:
+	fp_mode_addr_indirect_predec
+	jra	fpc_do_movem
+
+fpc_disp16:
+	fp_mode_addr_indirect_disp16
+	jra	fpc_do_movem
+
+fpc_extmode0:
+	fp_mode_addr_indirect_extmode0
+	jra	fpc_do_movem
+
+fpc_extmode1:
+	fp_decode_addr_reg
+	jmp	([0f:w,%pc,%d0*4])
+
+	.align	4
+0:
+	.long	fpc_absolute_short, fpc_absolute_long
+	.long	fpc_disp16, fpc_extmode0
+	.long	fpc_immediate, fp_ill
+	.long	fp_ill, fp_ill
+
+fpc_absolute_short:
+	fp_mode_abs_short
+	jra	fpc_do_movem
+
+fpc_absolute_long:
+	fp_mode_abs_long
+	jra	fpc_do_movem
+
+fpc_immediate:
+	fp_get_pc %a0
+	lea	(%a0,%d1.w*4),%a1
+	fp_put_pc %a1
+	printf	PDECODE,"#imm"
+|	jra	fpc_do_movem
+#if 0
+	swap	%d1
+	lsl.l	#5,%d1
+	lea	(FPD_FPCR,FPDATA),%a0
+	jra	3f
+1:	move.l	%d0,(%a0)
+2:	addq.l	#4,%a0
+3:	lsl.b	#1,%d1
+	jcs	1b
+	jne	2b
+	jra	fpc_movem_fin
+#endif
+
+fpc_do_movem:
+	swap	%d1			| get fpu register list
+	lsl.l	#5,%d1
+	lea	(FPD_FPCR,FPDATA),%a1
+1:	btst	#13,%d2
+	jne	4f
+
+	| move register from memory into fpu
+	jra	3f
+1:	printf	PMOVEM,"(%p>%p)",2,%a0,%a1
+	getuser.l (%a0)+,%d0,fp_err_ua1,%a0
+	move.l	%d0,(%a1)
+2:	addq.l	#4,%a1
+3:	lsl.b	#1,%d1
+	jcs	1b
+	jne	2b
+	jra	fpc_movem_fin
+
+	| move register from fpu into memory
+1:	printf	PMOVEM,"(%p>%p)",2,%a1,%a0
+	move.l	(%a1),%d0
+	putuser.l %d0,(%a0)+,fp_err_ua1,%a0
+2:	addq.l	#4,%a1
+4:	lsl.b	#1,%d1
+	jcs	1b
+	jne	2b
+
+fpc_movem_fin:
+	and.l	#0x0000fff0,(FPD_FPCR,FPDATA)
+	and.l	#0x0ffffff8,(FPD_FPSR,FPDATA)
+	move.l	(FPD_FPCR,FPDATA),%d0
+	lsr.l	#4,%d0
+	moveq	#3,%d1
+	and.l	%d0,%d1
+	move.w	%d1,(FPD_RND,FPDATA)
+	lsr.l	#2,%d0
+	moveq	#3,%d1
+	and.l	%d0,%d1
+	move.w	%d1,(FPD_PREC,FPDATA)
+	printf	PDECODE,"\n"
+#if 0
+	printf	PMOVEM,"fpcr : %08x\n",1,FPDATA@(FPD_FPCR)
+	printf	PMOVEM,"fpsr : %08x\n",1,FPDATA@(FPD_FPSR)
+	printf	PMOVEM,"fpiar: %08x\n",1,FPDATA@(FPD_FPIAR)
+	clr.l	%d0
+	move.w	(FPD_PREC,FPDATA),%d0
+	printf	PMOVEM,"prec : %04x\n",1,%d0
+	move.w	(FPD_RND,FPDATA),%d0
+	printf	PMOVEM,"rnd  : %04x\n",1,%d0
+#endif
+	jra	fp_end
diff --git a/arch/m68k/math-emu/fp_scan.S b/arch/m68k/math-emu/fp_scan.S
new file mode 100644
index 0000000..e4146ed
--- /dev/null
+++ b/arch/m68k/math-emu/fp_scan.S
@@ -0,0 +1,478 @@
+/*
+ * fp_scan.S
+ *
+ * Copyright Roman Zippel, 1997.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * ALTERNATIVELY, this product may be distributed under the terms of
+ * the GNU General Public License, in which case the provisions of the GPL are
+ * required INSTEAD OF the above restrictions.  (This clause is
+ * necessary due to a potential bad interaction between the GPL and
+ * the restrictions contained in a BSD-style copyright.)
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fp_emu.h"
+#include "fp_decode.h"
+
+	.globl	fp_scan, fp_datasize
+
+	.data
+
+| %d2 - first two instr words
+| %d1 - operand size
+
+/* operand formats are:
+
+	Long = 0,		i.e. fmove.l
+	Single,			i.e. fmove.s
+	Extended,		i.e. fmove.x
+	Packed-BCD,		i.e. fmove.p
+	Word,			i.e. fmove.w
+	Double,			i.e. fmove.d
+*/
+
+	.text
+
+| On entry:
+| FPDATA - base of emulated FPU registers
+
+fp_scan:
+| normal fpu instruction? (this excludes fsave/frestore)
+	fp_get_pc %a0
+	printf	PDECODE,"%08x: ",1,%a0
+	getuser.b (%a0),%d0,fp_err_ua1,%a0
+#if 1
+	cmp.b	#0xf2,%d0		| cpid = 1
+#else
+	cmp.b	#0xfc,%d0		| cpid = 6
+#endif
+	jne	fp_nonstd
+| first two instruction words are kept in %d2
+	getuser.l (%a0)+,%d2,fp_err_ua1,%a0
+	fp_put_pc %a0
+fp_decode_cond:				| separate conditional instr
+	fp_decode_cond_instr_type
+
+	.long	fp_decode_move, fp_fscc
+	.long	fp_fbccw, fp_fbccl
+
+fp_decode_move:				| separate move instr
+	fp_decode_move_instr_type
+
+	.long	fp_fgen_fp, fp_ill
+	.long	fp_fgen_ea, fp_fmove_fp2mem
+	.long	fp_fmovem_cr, fp_fmovem_cr
+	.long	fp_fmovem_fp, fp_fmovem_fp
+
+| now all arithmetic instr and a few move instr are left
+fp_fgen_fp:				| source is a fpu register
+	clr.b	(FPD_FPSR+2,FPDATA)	| clear the exception byte
+	fp_decode_sourcespec
+	printf	PDECODE,"f<op>.x fp%d",1,%d0
+	fp_get_fp_reg
+	lea	(FPD_TEMPFP1,FPDATA),%a1 | copy src into a temp location
+	move.l	(%a0)+,(%a1)+
+	move.l	(%a0)+,(%a1)+
+	move.l	(%a0),(%a1)
+	lea	(-8,%a1),%a0
+	jra	fp_getdest
+
+fp_fgen_ea:				| source is <ea>
+	clr.b	(FPD_FPSR+2,FPDATA)	| clear the exception byte
+	| sort out fmovecr, keep data size in %d1
+	fp_decode_sourcespec
+	cmp.w	#7,%d0
+	jeq	fp_fmovecr
+	move.w	%d0,%d1			| store data size twice in %d1
+	swap	%d1			| one can be trashed below
+	move.w	%d0,%d1
+#ifdef FPU_EMU_DEBUG
+	lea	0f,%a0
+	clr.l	%d0
+	move.b	(%a0,%d1.w),%d0
+	printf	PDECODE,"f<op>.%c ",1,%d0
+
+	.data
+0:	.byte	'l','s','x','p','w','d','b',0
+	.previous
+#endif
+
+/*
+	fp_getsource, fp_getdest
+
+	basically, we end up with a pointer to the source operand in
+	%a1, and a pointer to the destination operand in %a0.  both
+	are, of course, 96-bit extended floating point numbers.
+*/
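+
+/*
+	roughly, the "exploded" extended format used internally is (see
+	fp_emu.h for the authoritative definition):
+
+		struct fp_ext {
+			unsigned char	lowmant;	- guard byte used for rounding
+			unsigned char	sign;
+			unsigned short	exp;		- 15-bit biased exponent
+			unsigned long long mant;	- 64-bit mantissa, explicit msb
+		};
+*/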
+
+fp_getsource:
+	| decode addressing mode for source
+	fp_decode_addr_mode
+
+	.long	fp_data, fp_ill
+	.long	fp_indirect, fp_postinc
+	.long	fp_predecr, fp_disp16
+	.long	fp_extmode0, fp_extmode1
+
+	| addressing mode: data register direct
+fp_data:
+	fp_mode_data_direct
+	jsr	fp_get_data_reg
+	lea	(FPD_TEMPFP1,FPDATA),%a0
+	jmp	([0f:w,%pc,%d1.w*4])
+
+	.align	4
+0:
+	.long	fp_data_long, fp_data_single
+	.long	fp_ill, fp_ill
+	.long	fp_data_word, fp_ill
+	.long	fp_data_byte, fp_ill
+
+	| data types that fit in an integer data register
+fp_data_byte:
+	extb.l	%d0
+	jra	fp_data_long
+
+fp_data_word:
+	ext.l	%d0
+
+fp_data_long:
+	jsr	fp_conv_long2ext
+	jra	fp_getdest
+
+fp_data_single:
+	jsr	fp_conv_single2ext
+	jra	fp_getdest
+
+	| addressing mode: address register indirect
+fp_indirect:
+	fp_mode_addr_indirect
+	jra	fp_fetchsource
+
+	| addressing mode: address register indirect with postincrement
+fp_postinc:
+	fp_mode_addr_indirect_postinc
+	jra	fp_fetchsource
+
+	| addressing mode: address register indirect with predecrement
+fp_predecr:
+	fp_mode_addr_indirect_predec
+	jra	fp_fetchsource
+
+	| addressing mode: address register/program counter indirect
+	|		   with 16bit displacement
+fp_disp16:
+	fp_mode_addr_indirect_disp16
+	jra	fp_fetchsource
+
+	| all other indirect addressing modes will finally end up here
+fp_extmode0:
+	fp_mode_addr_indirect_extmode0
+	jra	fp_fetchsource
+
+| all pc relative addressing modes and immediate/absolute modes end up here
+| the first ones are sent to fp_extmode0 or fp_disp16
+| and only the latter are handled here
+fp_extmode1:
+	fp_decode_addr_reg
+	jmp	([0f:w,%pc,%d0*4])
+
+	.align	4
+0:
+	.long	fp_abs_short, fp_abs_long
+	.long	fp_disp16, fp_extmode0
+	.long	fp_immediate, fp_ill
+	.long	fp_ill, fp_ill
+
+	| addressing mode: absolute short
+fp_abs_short:
+	fp_mode_abs_short
+	jra	fp_fetchsource
+
+	| addressing mode: absolute long
+fp_abs_long:
+	fp_mode_abs_long
+	jra	fp_fetchsource
+
+	| addressing mode: immediate data
+fp_immediate:
+	printf	PDECODE,"#"
+	fp_get_pc %a0
+	move.w	(fp_datasize,%d1.w*2),%d0
+	addq.w	#1,%d0
+	and.w	#-2,%d0
+#ifdef FPU_EMU_DEBUG
+	movem.l	%d0/%d1,-(%sp)
+	move.l	%a0,%a1
+	clr.l	%d1
+	jra	2f
+1:	getuser.b (%a1)+,%d1,fp_err_ua1,%a1
+	printf	PDECODE,"%02x",1,%d1
+2:	dbra	%d0,1b
+	movem.l	(%sp)+,%d0/%d1
+#endif
+	lea	(%a0,%d0.w),%a1
+	fp_put_pc %a1
+|	jra	fp_fetchsource
+
+fp_fetchsource:
+	move.l	%a0,%a1
+	swap	%d1
+	lea	(FPD_TEMPFP1,FPDATA),%a0
+	jmp	([0f:w,%pc,%d1.w*4])
+
+	.align	4
+0:	.long	fp_long, fp_single
+	.long	fp_ext, fp_pack
+	.long	fp_word, fp_double
+	.long	fp_byte, fp_ill
+
+fp_long:
+	getuser.l (%a1),%d0,fp_err_ua1,%a1
+	jsr	fp_conv_long2ext
+	jra	fp_getdest
+
+fp_single:
+	getuser.l (%a1),%d0,fp_err_ua1,%a1
+	jsr	fp_conv_single2ext
+	jra	fp_getdest
+
+fp_ext:
+	getuser.l (%a1)+,%d0,fp_err_ua1,%a1
+	lsr.l	#8,%d0
+	lsr.l	#7,%d0
+	lsr.w	#1,%d0
+	move.l	%d0,(%a0)+
+	getuser.l (%a1)+,%d0,fp_err_ua1,%a1
+	move.l	%d0,(%a0)+
+	getuser.l (%a1),%d0,fp_err_ua1,%a1
+	move.l	%d0,(%a0)
+	subq.l	#8,%a0
+	jra	fp_getdest
+
+fp_pack:
+	/* not supported yet */
+	jra	fp_ill
+
+fp_word:
+	getuser.w (%a1),%d0,fp_err_ua1,%a1
+	ext.l	%d0
+	jsr	fp_conv_long2ext
+	jra	fp_getdest
+
+fp_double:
+	jsr	fp_conv_double2ext
+	jra	fp_getdest
+
+fp_byte:
+	getuser.b (%a1),%d0,fp_err_ua1,%a1
+	extb.l	%d0
+	jsr	fp_conv_long2ext
+|	jra	fp_getdest
+
+fp_getdest:
+	move.l	%a0,%a1
+	bfextu	%d2{#22,#3},%d0
+	printf	PDECODE,",fp%d\n",1,%d0
+	fp_get_fp_reg
+	movem.l	%a0/%a1,-(%sp)
+	pea	fp_finalrounding
+	bfextu	%d2{#25,#7},%d0
+	jmp	([0f:w,%pc,%d0*4])
+
+	.align	4
+0:
+	.long	fp_fmove_mem2fp, fp_fint, fp_fsinh, fp_fintrz
+	.long	fp_fsqrt, fp_ill, fp_flognp1, fp_ill
+	.long	fp_fetoxm1, fp_ftanh, fp_fatan, fp_ill
+	.long	fp_fasin, fp_fatanh, fp_fsin, fp_ftan
+	.long	fp_fetox, fp_ftwotox, fp_ftentox, fp_ill
+	.long	fp_flogn, fp_flog10, fp_flog2, fp_ill
+	.long	fp_fabs, fp_fcosh, fp_fneg, fp_ill
+	.long	fp_facos, fp_fcos, fp_fgetexp, fp_fgetman
+	.long	fp_fdiv, fp_fmod, fp_fadd, fp_fmul
+	.long	fpa_fsgldiv, fp_frem, fp_fscale, fpa_fsglmul
+	.long	fp_fsub, fp_ill, fp_ill, fp_ill
+	.long	fp_ill, fp_ill, fp_ill, fp_ill
+	.long	fp_fsincos0, fp_fsincos1, fp_fsincos2, fp_fsincos3
+	.long	fp_fsincos4, fp_fsincos5, fp_fsincos6, fp_fsincos7
+	.long	fp_fcmp, fp_ill, fp_ftst, fp_ill
+	.long	fp_ill, fp_ill, fp_ill, fp_ill
+	.long	fp_fsmove, fp_fssqrt, fp_ill, fp_ill
+	.long	fp_fdmove, fp_fdsqrt, fp_ill, fp_ill
+	.long	fp_ill, fp_ill, fp_ill, fp_ill
+	.long	fp_ill, fp_ill, fp_ill, fp_ill
+	.long	fp_ill, fp_ill, fp_ill, fp_ill
+	.long	fp_ill, fp_ill, fp_ill, fp_ill
+	.long	fp_fsabs, fp_ill, fp_fsneg, fp_ill
+	.long	fp_fdabs, fp_ill, fp_fdneg, fp_ill
+	.long	fp_fsdiv, fp_ill, fp_fsadd, fp_fsmul
+	.long	fp_fddiv, fp_ill, fp_fdadd, fp_fdmul
+	.long	fp_fssub, fp_ill, fp_ill, fp_ill
+	.long	fp_fdsub, fp_ill, fp_ill, fp_ill
+	.long	fp_ill, fp_ill, fp_ill, fp_ill
+	.long	fp_ill, fp_ill, fp_ill, fp_ill
+	.long	fp_ill, fp_ill, fp_ill, fp_ill
+	.long	fp_ill, fp_ill, fp_ill, fp_ill
+
+	| Instructions follow
+
+	| Move an (emulated) ROM constant
+fp_fmovecr:
+	bfextu	%d2{#27,#5},%d0
+	printf	PINSTR,"fp_fmovecr #%d",1,%d0
+	move.l	%d0,%d1
+	add.l	%d0,%d0
+	add.l	%d1,%d0
+	lea	(fp_constants,%d0*4),%a0
+	move.l	#0x801cc0ff,%d0
+	addq.l	#1,%d1
+	lsl.l	%d1,%d0
+	jcc	1f
+	fp_set_sr FPSR_EXC_INEX2			| INEX2 exception
+1:	moveq	#-128,%d0				| continue with fmove
+	and.l	%d0,%d2
+	jra	fp_getdest
+
+	.data
+	.align	4
+fp_constants:
+	.long	0x00004000,0xc90fdaa2,0x2168c235	| pi
+	.extend	0,0,0,0,0,0,0,0,0,0
+	.long	0x00003ffd,0x9a209a84,0xfbcff798	| log10(2)
+	.long	0x00004000,0xadf85458,0xa2bb4a9a	| e
+	.long	0x00003fff,0xb8aa3b29,0x5c17f0bc	| log2(e)
+	.long	0x00003ffd,0xde5bd8a9,0x37287195	| log10(e)
+	.long	0x00000000,0x00000000,0x00000000	| 0.0
+	.long	0x00003ffe,0xb17217f7,0xd1cf79ac	| ln(2)
+	.long	0x00004000,0x935d8ddd,0xaaa8ac17	| ln(10)
+	| read this as "1.0 * 2^0" - note the high bit in the mantissa
+	.long	0x00003fff,0x80000000,0x00000000	| 10^0
+	.long	0x00004002,0xa0000000,0x00000000	| 10^1
+	.long	0x00004005,0xc8000000,0x00000000	| 10^2
+	.long	0x0000400c,0x9c400000,0x00000000	| 10^4
+	.long	0x00004019,0xbebc2000,0x00000000	| 10^8
+	.long	0x00004034,0x8e1bc9bf,0x04000000	| 10^16
+	.long	0x00004069,0x9dc5ada8,0x2b70b59e	| 10^32
+	.long	0x000040d3,0xc2781f49,0xffcfa6d5	| 10^64
+	.long	0x000041a8,0x93ba47c9,0x80e98ce0	| 10^128
+	.long	0x00004351,0xaa7eebfb,0x9df9de8e	| 10^256
+	.long	0x000046a3,0xe319a0ae,0xa60e91c7	| 10^512
+	.long	0x00004d48,0xc9767586,0x81750c17	| 10^1024
+	.long	0x00005a92,0x9e8b3b5d,0xc53d5de5	| 10^2048
+	.long	0x00007525,0xc4605202,0x8a20979b	| 10^4096
+	.previous
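+	| decoding example for the table above: pi is stored with
+	| exp = 0x4000 (bias 0x3fff + 1) and mantissa 0xc90fdaa22168c235,
+	| i.e. 1.5707963... as a 1.63 fixed-point fraction, so the value
+	| is 1.5707963... * 2^1 = 3.14159265...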
+
+fp_fmove_mem2fp:
+	printf	PINSTR,"fmove %p,%p\n",2,%a0,%a1
+	move.l	(%a1)+,(%a0)+
+	move.l	(%a1)+,(%a0)+
+	move.l	(%a1),(%a0)
+	subq.l	#8,%a0
+	rts
+
+fpa_fsglmul:
+	move.l	#fp_finalrounding_single_fast,(%sp)
+	jra	fp_fsglmul
+
+fpa_fsgldiv:
+	move.l	#fp_finalrounding_single_fast,(%sp)
+	jra	fp_fsgldiv
+
+.macro	fp_dosingleprec instr
+	printf	PINSTR,"single "
+	move.l	#fp_finalrounding_single,(%sp)
+	jra	\instr
+.endm
+
+.macro	fp_dodoubleprec instr
+	printf	PINSTR,"double "
+	move.l	#fp_finalrounding_double,(%sp)
+	jra	\instr
+.endm
+
+fp_fsmove:
+	fp_dosingleprec fp_fmove_mem2fp
+
+fp_fssqrt:
+	fp_dosingleprec fp_fsqrt
+
+fp_fdmove:
+	fp_dodoubleprec fp_fmove_mem2fp
+
+fp_fdsqrt:
+	fp_dodoubleprec fp_fsqrt
+
+fp_fsabs:
+	fp_dosingleprec fp_fabs
+
+fp_fsneg:
+	fp_dosingleprec fp_fneg
+
+fp_fdabs:
+	fp_dodoubleprec fp_fabs
+
+fp_fdneg:
+	fp_dodoubleprec fp_fneg
+
+fp_fsdiv:
+	fp_dosingleprec fp_fdiv
+
+fp_fsadd:
+	fp_dosingleprec fp_fadd
+
+fp_fsmul:
+	fp_dosingleprec fp_fmul
+
+fp_fddiv:
+	fp_dodoubleprec fp_fdiv
+
+fp_fdadd:
+	fp_dodoubleprec fp_fadd
+
+fp_fdmul:
+	fp_dodoubleprec fp_fmul
+
+fp_fssub:
+	fp_dosingleprec fp_fsub
+
+fp_fdsub:
+	fp_dodoubleprec fp_fsub
+
+fp_nonstd:
+	fp_get_pc %a0
+	getuser.l (%a0),%d0,fp_err_ua1,%a0
+	printf	,"nonstd ((%08x)=%08x)\n",2,%a0,%d0
+	moveq	#-1,%d0
+	rts
+
+	.data
+	.align	4
+
+	| data sizes corresponding to the operand formats
+fp_datasize:
+	.word	4, 4, 12, 12, 2, 8, 1, 0
diff --git a/arch/m68k/math-emu/fp_trig.c b/arch/m68k/math-emu/fp_trig.c
new file mode 100644
index 0000000..6361d07
--- /dev/null
+++ b/arch/m68k/math-emu/fp_trig.c
@@ -0,0 +1,183 @@
+/*
+
+  fp_trig.c: floating-point math routines for the Linux-m68k
+  floating point emulator.
+
+  Copyright (c) 1998-1999 David Huggins-Daines / Roman Zippel.
+
+  I hereby give permission, free of charge, to copy, modify, and
+  redistribute this software, in source or binary form, provided that
+  the above copyright notice and the following disclaimer are included
+  in all such copies.
+
+  THIS SOFTWARE IS PROVIDED "AS IS", WITH ABSOLUTELY NO WARRANTY, REAL
+  OR IMPLIED.
+
+*/
+
+#include "fp_emu.h"
+#include "fp_trig.h"
+
+struct fp_ext *
+fp_fsin(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("fsin\n");
+
+	fp_monadic_check(dest, src);
+
+	return dest;
+}
+
+struct fp_ext *
+fp_fcos(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("fcos\n");
+
+	fp_monadic_check(dest, src);
+
+	return dest;
+}
+
+struct fp_ext *
+fp_ftan(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("ftan\n");
+
+	fp_monadic_check(dest, src);
+
+	return dest;
+}
+
+struct fp_ext *
+fp_fasin(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("fasin\n");
+
+	fp_monadic_check(dest, src);
+
+	return dest;
+}
+
+struct fp_ext *
+fp_facos(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("facos\n");
+
+	fp_monadic_check(dest, src);
+
+	return dest;
+}
+
+struct fp_ext *
+fp_fatan(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("fatan\n");
+
+	fp_monadic_check(dest, src);
+
+	return dest;
+}
+
+struct fp_ext *
+fp_fsinh(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("fsinh\n");
+
+	fp_monadic_check(dest, src);
+
+	return dest;
+}
+
+struct fp_ext *
+fp_fcosh(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("fcosh\n");
+
+	fp_monadic_check(dest, src);
+
+	return dest;
+}
+
+struct fp_ext *
+fp_ftanh(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("ftanh\n");
+
+	fp_monadic_check(dest, src);
+
+	return dest;
+}
+
+struct fp_ext *
+fp_fatanh(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("fatanh\n");
+
+	fp_monadic_check(dest, src);
+
+	return dest;
+}
+
+struct fp_ext *
+fp_fsincos0(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("fsincos0\n");
+
+	return dest;
+}
+
+struct fp_ext *
+fp_fsincos1(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("fsincos1\n");
+
+	return dest;
+}
+
+struct fp_ext *
+fp_fsincos2(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("fsincos2\n");
+
+	return dest;
+}
+
+struct fp_ext *
+fp_fsincos3(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("fsincos3\n");
+
+	return dest;
+}
+
+struct fp_ext *
+fp_fsincos4(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("fsincos4\n");
+
+	return dest;
+}
+
+struct fp_ext *
+fp_fsincos5(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("fsincos5\n");
+
+	return dest;
+}
+
+struct fp_ext *
+fp_fsincos6(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("fsincos6\n");
+
+	return dest;
+}
+
+struct fp_ext *
+fp_fsincos7(struct fp_ext *dest, struct fp_ext *src)
+{
+	uprint("fsincos7\n");
+
+	return dest;
+}
diff --git a/arch/m68k/math-emu/fp_trig.h b/arch/m68k/math-emu/fp_trig.h
new file mode 100644
index 0000000..af8b247
--- /dev/null
+++ b/arch/m68k/math-emu/fp_trig.h
@@ -0,0 +1,32 @@
+/*
+
+  fp_trig.h: floating-point math routines for the Linux-m68k
+  floating point emulator.
+
+  Copyright (c) 1998 David Huggins-Daines.
+
+  I hereby give permission, free of charge, to copy, modify, and
+  redistribute this software, in source or binary form, provided that
+  the above copyright notice and the following disclaimer are included
+  in all such copies.
+
+  THIS SOFTWARE IS PROVIDED "AS IS", WITH ABSOLUTELY NO WARRANTY, REAL
+  OR IMPLIED.
+
+*/
+
+#ifndef FP_TRIG_H
+#define FP_TRIG_H
+
+#include "fp_emu.h"
+
+/* floating point trigonometric instructions:
+
+   the arguments to these are in the "internal" extended format, that
+   is, an "exploded" version of the 96-bit extended fp format used by
+   the 68881.
+
+   they return a status code, which should end up in %d0, if all goes
+   well.  */
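+
+/* for illustration, the shape of these routines as defined in fp_trig.c:
+
+	struct fp_ext *fp_fsin(struct fp_ext *dest, struct fp_ext *src);
+*/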
+
+#endif /* FP_TRIG_H */
diff --git a/arch/m68k/math-emu/fp_util.S b/arch/m68k/math-emu/fp_util.S
new file mode 100644
index 0000000..a9f7f01
--- /dev/null
+++ b/arch/m68k/math-emu/fp_util.S
@@ -0,0 +1,1455 @@
+/*
+ * fp_util.S
+ *
+ * Copyright Roman Zippel, 1997.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, and the entire permission notice in its entirety,
+ *    including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ *    products derived from this software without specific prior
+ *    written permission.
+ *
+ * ALTERNATIVELY, this product may be distributed under the terms of
+ * the GNU General Public License, in which case the provisions of the GPL are
+ * required INSTEAD OF the above restrictions.  (This clause is
+ * necessary due to a potential bad interaction between the GPL and
+ * the restrictions contained in a BSD-style copyright.)
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/config.h>
+#include "fp_emu.h"
+
+/*
+ * Here are lots of conversion and normalization functions mainly
+ * used by fp_scan.S.
+ * Note that these functions are optimized for "normal" numbers;
+ * these are handled first and exit as fast as possible, which is
+ * especially important for fp_normalize_ext/fp_conv_ext2ext, as
+ * they are called very often.
+ * The register usage is optimized for fp_scan.S and depends on which
+ * register happens to be unused at that point, so be careful if you
+ * want to change something here. %d0 and %d1 are always usable,
+ * sometimes %d2 (or only its lower half); most functions have to
+ * return %a0 unmodified, so that the caller can reuse it immediately.
+ */
+
+	.globl	fp_ill, fp_end
+
+	| exits from fp_scan:
+	| illegal instruction
+fp_ill:
+	printf	,"fp_illegal\n"
+	rts
+	| completed instruction
+fp_end:
+	tst.l	(TASK_MM-8,%a2)
+	jmi	1f
+	tst.l	(TASK_MM-4,%a2)
+	jmi	1f
+	tst.l	(TASK_MM,%a2)
+	jpl	2f
+1:	printf	,"oops:%p,%p,%p\n",3,%a2@(TASK_MM-8),%a2@(TASK_MM-4),%a2@(TASK_MM)
+2:	clr.l	%d0
+	rts
+
+	.globl	fp_conv_long2ext, fp_conv_single2ext
+	.globl	fp_conv_double2ext, fp_conv_ext2ext
+	.globl	fp_normalize_ext, fp_normalize_double
+	.globl	fp_normalize_single, fp_normalize_single_fast
+	.globl	fp_conv_ext2double, fp_conv_ext2single
+	.globl	fp_conv_ext2long, fp_conv_ext2short
+	.globl	fp_conv_ext2byte
+	.globl	fp_finalrounding_single, fp_finalrounding_single_fast
+	.globl	fp_finalrounding_double
+	.globl	fp_finalrounding, fp_finaltest, fp_final
+
+/*
+ * First several conversion functions from a source operand
+ * into the extended format. Note, that only fp_conv_ext2ext
+ * normalizes the number and is always called after the other
+ * conversion functions, which only move the information into
+ * fp_ext structure.
+ */
+
+	| fp_conv_long2ext:
+	|
+	| args:	%d0 = source (32-bit long)
+	|	%a0 = destination (ptr to struct fp_ext)
+
+fp_conv_long2ext:
+	printf	PCONV,"l2e: %p -> %p(",2,%d0,%a0
+	clr.l	%d1			| sign defaults to zero
+	tst.l	%d0
+	jeq	fp_l2e_zero		| is source zero?
+	jpl	1f			| positive?
+	moveq	#1,%d1
+	neg.l	%d0
+1:	swap	%d1
+	move.w	#0x3fff+31,%d1
+	move.l	%d1,(%a0)+		| set sign / exp
+	move.l	%d0,(%a0)+		| set mantissa
+	clr.l	(%a0)
+	subq.l	#8,%a0			| restore %a0
+	printx	PCONV,%a0@
+	printf	PCONV,")\n"
+	rts
+	| source is zero
+fp_l2e_zero:
+	clr.l	(%a0)+
+	clr.l	(%a0)+
+	clr.l	(%a0)
+	subq.l	#8,%a0
+	printx	PCONV,%a0@
+	printf	PCONV,")\n"
+	rts
+
+	| fp_conv_single2ext
+	| args:	%d0 = source (single-precision fp value)
+	|	%a0 = dest (struct fp_ext *)
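+	|
+	| an IEEE single has 1 sign, 8 exponent and 23 mantissa bits; the
+	| mantissa bits are shifted up just below bit 31 of the high
+	| mantissa lword, the implicit leading one is made explicit by
+	| setting bit 31, and the exponent is re-biased from 127 (0x7f)
+	| to 0x3fff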
+
+fp_conv_single2ext:
+	printf	PCONV,"s2e: %p -> %p(",2,%d0,%a0
+	move.l	%d0,%d1
+	lsl.l	#8,%d0			| shift mantissa
+	lsr.l	#8,%d1			| exponent / sign
+	lsr.l	#7,%d1
+	lsr.w	#8,%d1
+	jeq	fp_s2e_small		| zero / denormal?
+	cmp.w	#0xff,%d1		| NaN / Inf?
+	jeq	fp_s2e_large
+	bset	#31,%d0			| set explicit bit
+	add.w	#0x3fff-0x7f,%d1	| re-bias the exponent.
+9:	move.l	%d1,(%a0)+		| fp_ext.sign, fp_ext.exp
+	move.l	%d0,(%a0)+		| high lword of fp_ext.mant
+	clr.l	(%a0)			| low lword = 0
+	subq.l	#8,%a0
+	printx	PCONV,%a0@
+	printf	PCONV,")\n"
+	rts
+	| zeros and denormalized
+fp_s2e_small:
+	| exponent is zero, so explicit bit is already zero too
+	tst.l	%d0
+	jeq	9b
+	move.w	#0x4000-0x7f,%d1
+	jra	9b
+	| infinities and NAN
+fp_s2e_large:
+	bclr	#31,%d0			| clear explicit bit
+	move.w	#0x7fff,%d1
+	jra	9b
+
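+	| fp_conv_double2ext
+	| args:	%a1 = source (double-precision fp value in user space)
+	|	%a0 = dest (struct fp_ext *)
+	| same idea as above for an IEEE double (1 sign, 11 exponent and
+	| 52 mantissa bits); the exponent is re-biased from 1023 (0x3ff)
+	| to 0x3fff
+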
+fp_conv_double2ext:
+#ifdef FPU_EMU_DEBUG
+	getuser.l %a1@(0),%d0,fp_err_ua2,%a1
+	getuser.l %a1@(4),%d1,fp_err_ua2,%a1
+	printf	PCONV,"d2e: %p%p -> %p(",3,%d0,%d1,%a0
+#endif
+	getuser.l (%a1)+,%d0,fp_err_ua2,%a1
+	move.l	%d0,%d1
+	lsl.l	#8,%d0			| shift high mantissa
+	lsl.l	#3,%d0
+	lsr.l	#8,%d1			| exponent / sign
+	lsr.l	#7,%d1
+	lsr.w	#5,%d1
+	jeq	fp_d2e_small		| zero / denormal?
+	cmp.w	#0x7ff,%d1		| NaN / Inf?
+	jeq	fp_d2e_large
+	bset	#31,%d0			| set explicit bit
+	add.w	#0x3fff-0x3ff,%d1	| re-bias the exponent.
+9:	move.l	%d1,(%a0)+		| fp_ext.sign, fp_ext.exp
+	move.l	%d0,(%a0)+
+	getuser.l (%a1)+,%d0,fp_err_ua2,%a1
+	move.l	%d0,%d1
+	lsl.l	#8,%d0
+	lsl.l	#3,%d0
+	move.l	%d0,(%a0)
+	moveq	#21,%d0
+	lsr.l	%d0,%d1
+	or.l	%d1,-(%a0)
+	subq.l	#4,%a0
+	printx	PCONV,%a0@
+	printf	PCONV,")\n"
+	rts
+	| zeros and denormalized
+fp_d2e_small:
+	| exponent is zero, so explicit bit is already zero too
+	tst.l	%d0
+	jeq	9b
+	move.w	#0x4000-0x3ff,%d1
+	jra	9b
+	| infinities and NAN
+fp_d2e_large:
+	bclr	#31,%d0			| clear explicit bit
+	move.w	#0x7fff,%d1
+	jra	9b
+
+	| fp_conv_ext2ext:
+	| originally used to get longdouble from userspace, now it's
+	| called before arithmetic operations to make sure the number
+	| is normalized [maybe rename it?].
+	| args:	%a0 = dest (struct fp_ext *)
+	| returns 0 in %d0 for a NaN, otherwise 1
+
+fp_conv_ext2ext:
+	printf	PCONV,"e2e: %p(",1,%a0
+	printx	PCONV,%a0@
+	printf	PCONV,"), "
+	move.l	(%a0)+,%d0
+	cmp.w	#0x7fff,%d0		| Inf / NaN?
+	jeq	fp_e2e_large
+	move.l	(%a0),%d0
+	jpl	fp_e2e_small		| zero / denorm?
+	| The high bit is set, so normalization is irrelevant.
+fp_e2e_checkround:
+	subq.l	#4,%a0
+#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC
+	move.b	(%a0),%d0
+	jne	fp_e2e_round
+#endif
+	printf	PCONV,"%p(",1,%a0
+	printx	PCONV,%a0@
+	printf	PCONV,")\n"
+	moveq	#1,%d0
+	rts
+#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC
+fp_e2e_round:
+	fp_set_sr FPSR_EXC_INEX2
+	clr.b	(%a0)
+	move.w	(FPD_RND,FPDATA),%d2
+	jne	fp_e2e_roundother	| %d2 == 0, round to nearest
+	tst.b	%d0			| test guard bit
+	jpl	9f			| zero is closer
+	btst	#0,(11,%a0)		| test lsb bit
+	jne	fp_e2e_doroundup	| round to infinity
+	lsl.b	#1,%d0			| check low bits
+	jeq	9f			| round to zero
+fp_e2e_doroundup:
+	addq.l	#1,(8,%a0)
+	jcc	9f
+	addq.l	#1,(4,%a0)
+	jcc	9f
+	move.w	#0x8000,(4,%a0)
+	addq.w	#1,(2,%a0)
+9:	printf	PNORM,"%p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,")\n"
+	rts
+fp_e2e_roundother:
+	subq.w	#2,%d2
+	jcs	9b			| %d2 < 2, round to zero
+	jhi	1f			| %d2 > 2, round to +infinity
+	tst.b	(1,%a0)			| to -inf
+	jne	fp_e2e_doroundup	| negative, round to infinity
+	jra	9b			| positive, round to zero
+1:	tst.b	(1,%a0)			| to +inf
+	jeq	fp_e2e_doroundup	| positive, round to infinity
+	jra	9b			| negative, round to zero
+#endif
+	| zeros and subnormals:
+	| try to normalize these anyway.
+fp_e2e_small:
+	jne	fp_e2e_small1		| high lword zero?
+	move.l	(4,%a0),%d0
+	jne	fp_e2e_small2
+#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC
+	clr.l	%d0
+	move.b	(-4,%a0),%d0
+	jne	fp_e2e_small3
+#endif
+	| Genuine zero.
+	clr.w	-(%a0)
+	subq.l	#2,%a0
+	printf	PNORM,"%p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,")\n"
+	moveq	#1,%d0
+	rts
+	| definitely subnormal, need to shift all 64 bits
+fp_e2e_small1:
+	bfffo	%d0{#0,#32},%d1
+	move.w	-(%a0),%d2
+	sub.w	%d1,%d2
+	jcc	1f
+	| Pathologically small, denormalize.
+	add.w	%d2,%d1
+	clr.w	%d2
+1:	move.w	%d2,(%a0)+
+	move.w	%d1,%d2
+	jeq	fp_e2e_checkround
+	| fancy 64-bit double-shift begins here
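+	| (the high lword is shifted left by %d2 and stored; the low lword
+	| is shifted by the same amount, and the bits that fell out of its
+	| top are or-ed into the bottom of the high lword)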
+	lsl.l	%d2,%d0
+	move.l	%d0,(%a0)+
+	move.l	(%a0),%d0
+	move.l	%d0,%d1
+	lsl.l	%d2,%d0
+	move.l	%d0,(%a0)
+	neg.w	%d2
+	and.w	#0x1f,%d2
+	lsr.l	%d2,%d1
+	or.l	%d1,-(%a0)
+#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC
+fp_e2e_extra1:
+	clr.l	%d0
+	move.b	(-4,%a0),%d0
+	neg.w	%d2
+	add.w	#24,%d2
+	jcc	1f
+	clr.b	(-4,%a0)
+	lsl.l	%d2,%d0
+	or.l	%d0,(4,%a0)
+	jra	fp_e2e_checkround
+1:	addq.w	#8,%d2
+	lsl.l	%d2,%d0
+	move.b	%d0,(-4,%a0)
+	lsr.l	#8,%d0
+	or.l	%d0,(4,%a0)
+#endif
+	jra	fp_e2e_checkround
+	| pathologically small subnormal
+fp_e2e_small2:
+	bfffo	%d0{#0,#32},%d1
+	add.w	#32,%d1
+	move.w	-(%a0),%d2
+	sub.w	%d1,%d2
+	jcc	1f
+	| Beyond pathologically small, denormalize.
+	add.w	%d2,%d1
+	clr.w	%d2
+1:	move.w	%d2,(%a0)+
+	ext.l	%d1
+	jeq	fp_e2e_checkround
+	clr.l	(4,%a0)
+	sub.w	#32,%d2
+	jcs	1f
+	lsl.l	%d1,%d0			| lower lword needs only to be shifted
+	move.l	%d0,(%a0)		| into the higher lword
+#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC
+	clr.l	%d0
+	move.b	(-4,%a0),%d0
+	clr.b	(-4,%a0)
+	neg.w	%d1
+	add.w	#32,%d1
+	bfins	%d0,(%a0){%d1,#8}
+#endif
+	jra	fp_e2e_checkround
+1:	neg.w	%d1			| lower lword is split between
+	bfins	%d0,(%a0){%d1,#32}	| higher and lower lword
+#ifndef CONFIG_M68KFPU_EMU_EXTRAPREC
+	jra	fp_e2e_checkround
+#else
+	move.w	%d1,%d2
+	jra	fp_e2e_extra1
+	| These are extremely small numbers that will mostly end up as zero
+	| anyway, so this is only important for correct rounding.
+fp_e2e_small3:
+	bfffo	%d0{#24,#8},%d1
+	add.w	#40,%d1
+	move.w	-(%a0),%d2
+	sub.w	%d1,%d2
+	jcc	1f
+	| Pathologically small, denormalize.
+	add.w	%d2,%d1
+	clr.w	%d2
+1:	move.w	%d2,(%a0)+
+	ext.l	%d1
+	jeq	fp_e2e_checkround
+	cmp.w	#8,%d1
+	jcs	2f
+1:	clr.b	(-4,%a0)
+	sub.w	#64,%d1
+	jcs	1f
+	add.w	#24,%d1
+	lsl.l	%d1,%d0
+	move.l	%d0,(%a0)
+	jra	fp_e2e_checkround
+1:	neg.w	%d1
+	bfins	%d0,(%a0){%d1,#8}
+	jra	fp_e2e_checkround
+2:	lsl.l	%d1,%d0
+	move.b	%d0,(-4,%a0)
+	lsr.l	#8,%d0
+	move.b	%d0,(7,%a0)
+	jra	fp_e2e_checkround
+#endif
+1:	move.l	%d0,%d1			| lower lword is split between
+	lsl.l	%d2,%d0			| higher and lower lword
+	move.l	%d0,(%a0)
+	move.l	%d1,%d0
+	neg.w	%d2
+	add.w	#32,%d2
+	lsr.l	%d2,%d0
+	move.l	%d0,-(%a0)
+	jra	fp_e2e_checkround
+	| Infinities and NaNs
+fp_e2e_large:
+	move.l	(%a0)+,%d0
+	jne	3f
+1:	tst.l	(%a0)
+	jne	4f
+	moveq	#1,%d0
+2:	subq.l	#8,%a0
+	printf	PCONV,"%p(",1,%a0
+	printx	PCONV,%a0@
+	printf	PCONV,")\n"
+	rts
+	| we have maybe a NaN, shift off the highest bit
+3:	lsl.l	#1,%d0
+	jeq	1b
+	| we have a NaN, clear the return value
+4:	clr.l	%d0
+	jra	2b
+
+
+/*
+ * Normalization functions.  Call these on the output of general
+ * FP operators, and before any conversion into the destination
+ * formats. fp_normalize_ext has always to be called first, the
+ * following conversion functions expect an already normalized
+ * number.
+ */
+
+	| fp_normalize_ext:
+	| normalize an extended in extended (unpacked) format; basically
+	| it does the same as fp_conv_ext2ext, but additionally it also
+	| does the necessary postprocessing checks.
+	| args:	%a0 (struct fp_ext *)
+	| NOTE: it does _not_ modify %a0/%a1 and the upper word of %d2
+
+fp_normalize_ext:
+	printf	PNORM,"ne: %p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,"), "
+	move.l	(%a0)+,%d0
+	cmp.w	#0x7fff,%d0		| Inf / NaN?
+	jeq	fp_ne_large
+	move.l	(%a0),%d0
+	jpl	fp_ne_small		| zero / denorm?
+	| The high bit is set, so normalization is irrelevant.
+fp_ne_checkround:
+	subq.l	#4,%a0
+#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC
+	move.b	(%a0),%d0
+	jne	fp_ne_round
+#endif
+	printf	PNORM,"%p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,")\n"
+	rts
+#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC
+fp_ne_round:
+	fp_set_sr FPSR_EXC_INEX2
+	clr.b	(%a0)
+	move.w	(FPD_RND,FPDATA),%d2
+	jne	fp_ne_roundother	| %d2 == 0, round to nearest
+	tst.b	%d0			| test guard bit
+	jpl	9f			| zero is closer
+	btst	#0,(11,%a0)		| test lsb bit
+	jne	fp_ne_doroundup		| round to infinity
+	lsl.b	#1,%d0			| check low bits
+	jeq	9f			| round to zero
+fp_ne_doroundup:
+	addq.l	#1,(8,%a0)
+	jcc	9f
+	addq.l	#1,(4,%a0)
+	jcc	9f
+	addq.w	#1,(2,%a0)
+	move.w	#0x8000,(4,%a0)
+9:	printf	PNORM,"%p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,")\n"
+	rts
+fp_ne_roundother:
+	subq.w	#2,%d2
+	jcs	9b			| %d2 < 2, round to zero
+	jhi	1f			| %d2 > 2, round to +infinity
+	tst.b	(1,%a0)			| to -inf
+	jne	fp_ne_doroundup		| negative, round to infinity
+	jra	9b			| positive, round to zero
+1:	tst.b	(1,%a0)			| to +inf
+	jeq	fp_ne_doroundup		| positive, round to infinity
+	jra	9b			| negative, round to zero
+#endif
+	| Zeros and subnormal numbers
+	| These are probably merely subnormal, rather than "denormalized"
+	|  numbers, so we will try to make them normal again.
+fp_ne_small:
+	jne	fp_ne_small1		| high lword zero?
+	move.l	(4,%a0),%d0
+	jne	fp_ne_small2
+#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC
+	clr.l	%d0
+	move.b	(-4,%a0),%d0
+	jne	fp_ne_small3
+#endif
+	| Genuine zero.
+	clr.w	-(%a0)
+	subq.l	#2,%a0
+	printf	PNORM,"%p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,")\n"
+	rts
+	| Subnormal.
+fp_ne_small1:
+	bfffo	%d0{#0,#32},%d1
+	move.w	-(%a0),%d2
+	sub.w	%d1,%d2
+	jcc	1f
+	| Pathologically small, denormalize.
+	add.w	%d2,%d1
+	clr.w	%d2
+	fp_set_sr FPSR_EXC_UNFL
+1:	move.w	%d2,(%a0)+
+	move.w	%d1,%d2
+	jeq	fp_ne_checkround
+	| This is exactly the same 64-bit double shift as seen above.
+	lsl.l	%d2,%d0
+	move.l	%d0,(%a0)+
+	move.l	(%a0),%d0
+	move.l	%d0,%d1
+	lsl.l	%d2,%d0
+	move.l	%d0,(%a0)
+	neg.w	%d2
+	and.w	#0x1f,%d2
+	lsr.l	%d2,%d1
+	or.l	%d1,-(%a0)
+#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC
+fp_ne_extra1:
+	clr.l	%d0
+	move.b	(-4,%a0),%d0
+	neg.w	%d2
+	add.w	#24,%d2
+	jcc	1f
+	clr.b	(-4,%a0)
+	lsl.l	%d2,%d0
+	or.l	%d0,(4,%a0)
+	jra	fp_ne_checkround
+1:	addq.w	#8,%d2
+	lsl.l	%d2,%d0
+	move.b	%d0,(-4,%a0)
+	lsr.l	#8,%d0
+	or.l	%d0,(4,%a0)
+#endif
+	jra	fp_ne_checkround
+	| May or may not be subnormal, if so, only 32 bits to shift.
+fp_ne_small2:
+	bfffo	%d0{#0,#32},%d1
+	add.w	#32,%d1
+	move.w	-(%a0),%d2
+	sub.w	%d1,%d2
+	jcc	1f
+	| Beyond pathologically small, denormalize.
+	add.w	%d2,%d1
+	clr.w	%d2
+	fp_set_sr FPSR_EXC_UNFL
+1:	move.w	%d2,(%a0)+
+	ext.l	%d1
+	jeq	fp_ne_checkround
+	clr.l	(4,%a0)
+	sub.w	#32,%d1
+	jcs	1f
+	lsl.l	%d1,%d0			| lower lword needs only to be shifted
+	move.l	%d0,(%a0)		| into the higher lword
+#ifdef CONFIG_M68KFPU_EMU_EXTRAPREC
+	clr.l	%d0
+	move.b	(-4,%a0),%d0
+	clr.b	(-4,%a0)
+	neg.w	%d1
+	add.w	#32,%d1
+	bfins	%d0,(%a0){%d1,#8}
+#endif
+	jra	fp_ne_checkround
+1:	neg.w	%d1			| lower lword is split between
+	bfins	%d0,(%a0){%d1,#32}	| higher and lower lword
+#ifndef CONFIG_M68KFPU_EMU_EXTRAPREC
+	jra	fp_ne_checkround
+#else
+	move.w	%d1,%d2
+	jra	fp_ne_extra1
+	| These are extremely small numbers that will mostly end up as zero
+	| anyway, so this is only important for correct rounding.
+fp_ne_small3:
+	bfffo	%d0{#24,#8},%d1
+	add.w	#40,%d1
+	move.w	-(%a0),%d2
+	sub.w	%d1,%d2
+	jcc	1f
+	| Pathologically small, denormalize.
+	add.w	%d2,%d1
+	clr.w	%d2
+1:	move.w	%d2,(%a0)+
+	ext.l	%d1
+	jeq	fp_ne_checkround
+	cmp.w	#8,%d1
+	jcs	2f
+1:	clr.b	(-4,%a0)
+	sub.w	#64,%d1
+	jcs	1f
+	add.w	#24,%d1
+	lsl.l	%d1,%d0
+	move.l	%d0,(%a0)
+	jra	fp_ne_checkround
+1:	neg.w	%d1
+	bfins	%d0,(%a0){%d1,#8}
+	jra	fp_ne_checkround
+2:	lsl.l	%d1,%d0
+	move.b	%d0,(-4,%a0)
+	lsr.l	#8,%d0
+	move.b	%d0,(7,%a0)
+	jra	fp_ne_checkround
+#endif
+	| Infinities and NaNs, again, same as above.
+fp_ne_large:
+	move.l	(%a0)+,%d0
+	jne	3f
+1:	tst.l	(%a0)
+	jne	4f
+2:	subq.l	#8,%a0
+	printf	PNORM,"%p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,")\n"
+	rts
+	| we have maybe a NaN, shift off the highest bit
+3:	move.l	%d0,%d1
+	lsl.l	#1,%d1
+	jne	4f
+	clr.l	(-4,%a0)
+	jra	1b
+	| we have a NaN, test if it is signaling
+4:	bset	#30,%d0
+	jne	2b
+	fp_set_sr FPSR_EXC_SNAN
+	move.l	%d0,(-4,%a0)
+	jra	2b
+
+	| these next two do rounding as per the IEEE standard.
+	| values for the rounding modes appear to be:
+	| 0:	Round to nearest
+	| 1:	Round to zero
+	| 2:	Round to -Infinity
+	| 3:	Round to +Infinity
+	| both functions expect that fp_normalize was already
+	| called (and the extended argument is already normalized
+	| as far as possible); they are used if a different
+	| rounding precision is selected and before converting
+	| into single/double
+
+	| fp_normalize_double:
+	| normalize an extended with double (52-bit) precision
+	| args:	 %a0 (struct fp_ext *)
+
+fp_normalize_double:
+	printf	PNORM,"nd: %p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,"), "
+	move.l	(%a0)+,%d2
+	tst.w	%d2
+	jeq	fp_nd_zero		| zero / denormalized
+	cmp.w	#0x7fff,%d2
+	jeq	fp_nd_huge		| NaN / infinity.
+	sub.w	#0x4000-0x3ff,%d2	| will the exponent fit?
+	jcs	fp_nd_small		| too small.
+	cmp.w	#0x7fe,%d2
+	jcc	fp_nd_large		| too big.
+	addq.l	#4,%a0
+	move.l	(%a0),%d0		| low lword of mantissa
+	| now, round off the low 11 bits.
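+	| (the internal mantissa has 64 bits, but a double keeps only 53
+	| significant bits, so the lowest 11 bits have to be rounded away)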
+fp_nd_round:
+	moveq	#21,%d1
+	lsl.l	%d1,%d0			| keep 11 low bits.
+	jne	fp_nd_checkround	| Are they non-zero?
+	| nothing to do here
+9:	subq.l	#8,%a0
+	printf	PNORM,"%p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,")\n"
+	rts
+	| Be careful with the X bit! It contains the lsb
+	| from the shift above, it is needed for round to nearest.
+fp_nd_checkround:
+	fp_set_sr FPSR_EXC_INEX2	| INEX2 bit
+	and.w	#0xf800,(2,%a0)		| clear bits 0-10
+	move.w	(FPD_RND,FPDATA),%d2	| rounding mode
+	jne	2f			| %d2 == 0, round to nearest
+	tst.l	%d0			| test guard bit
+	jpl	9b			| zero is closer
+	| here we test the X bit by adding it to %d2
+	clr.w	%d2			| first set z bit, addx only clears it
+	addx.w	%d2,%d2			| test lsb bit
+	| IEEE754-specified "round to even" behaviour.  If the guard
+	| bit is set, then the number is odd, so rounding works like
+	| in grade-school arithmetic (i.e. 1.5 rounds to 2.0)
+	| Otherwise, an equal distance rounds towards zero, so as not
+	| to produce an odd number.  This is strange, but it is what
+	| the standard says.
+	jne	fp_nd_doroundup		| round to infinity
+	lsl.l	#1,%d0			| check low bits
+	jeq	9b			| round to zero
+fp_nd_doroundup:
+	| round (the mantissa, that is) towards infinity
+	add.l	#0x800,(%a0)
+	jcc	9b			| no overflow, good.
+	addq.l	#1,-(%a0)		| extend to high lword
+	jcc	1f			| no overflow, good.
+	| Yow! we have managed to overflow the mantissa.  Since this
+	| only happens when %d1 was 0xfffff800, it is now zero, so
+	| reset the high bit, and increment the exponent.
+	move.w	#0x8000,(%a0)
+	addq.w	#1,-(%a0)
+	cmp.w	#0x43ff,(%a0)+		| exponent now overflown?
+	jeq	fp_nd_large		| yes, so make it infinity.
+1:	subq.l	#4,%a0
+	printf	PNORM,"%p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,")\n"
+	rts
+2:	subq.w	#2,%d2
+	jcs	9b			| %d2 < 2, round to zero
+	jhi	3f			| %d2 > 2, round to +infinity
+	| Round to +Inf or -Inf.  High word of %d2 contains the
+	| sign of the number, by the way.
+	swap	%d2			| to -inf
+	tst.b	%d2
+	jne	fp_nd_doroundup		| negative, round to infinity
+	jra	9b			| positive, round to zero
+3:	swap	%d2			| to +inf
+	tst.b	%d2
+	jeq	fp_nd_doroundup		| positive, round to infinity
+	jra	9b			| negative, round to zero
+	| Exponent underflow.  Try to make a denormal, and set it to
+	| the smallest possible fraction if this fails.
+fp_nd_small:
+	fp_set_sr FPSR_EXC_UNFL		| set UNFL bit
+	move.w	#0x3c01,(-2,%a0)	| 2**-1022
+	neg.w	%d2			| degree of underflow
+	cmp.w	#32,%d2			| single or double shift?
+	jcc	1f
+	| Again, another 64-bit double shift.
+	move.l	(%a0),%d0
+	move.l	%d0,%d1
+	lsr.l	%d2,%d0
+	move.l	%d0,(%a0)+
+	move.l	(%a0),%d0
+	lsr.l	%d2,%d0
+	neg.w	%d2
+	add.w	#32,%d2
+	lsl.l	%d2,%d1
+	or.l	%d1,%d0
+	move.l	(%a0),%d1
+	move.l	%d0,(%a0)
+	| Check to see if we shifted off any significant bits
+	lsl.l	%d2,%d1
+	jeq	fp_nd_round		| Nope, round.
+	bset	#0,%d0			| Yes, so set the "sticky bit".
+	jra	fp_nd_round		| Now, round.
+	| Another 64-bit single shift and store
+1:	sub.w	#32,%d2
+	cmp.w	#32,%d2			| Do we really need to shift?
+	jcc	2f			| No, the number is too small.
+	move.l	(%a0),%d0
+	clr.l	(%a0)+
+	move.l	%d0,%d1
+	lsr.l	%d2,%d0
+	neg.w	%d2
+	add.w	#32,%d2
+	| Again, check to see if we shifted off any significant bits.
+	tst.l	(%a0)
+	jeq	1f
+	bset	#0,%d0			| Sticky bit.
+1:	move.l	%d0,(%a0)
+	lsl.l	%d2,%d1
+	jeq	fp_nd_round
+	bset	#0,%d0
+	jra	fp_nd_round
+	| Sorry, the number is just too small.
+2:	clr.l	(%a0)+
+	clr.l	(%a0)
+	moveq	#1,%d0			| Smallest possible fraction,
+	jra	fp_nd_round		| round as desired.
+	| zero and denormalized
+fp_nd_zero:
+	tst.l	(%a0)+
+	jne	1f
+	tst.l	(%a0)
+	jne	1f
+	subq.l	#8,%a0
+	printf	PNORM,"%p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,")\n"
+	rts				| zero.  nothing to do.
+	| These are not merely subnormal numbers, but true denormals,
+	| i.e. pathologically small (exponent is 2**-16383) numbers.
+	| It is clearly impossible for even a normal extended number
+	| with that exponent to fit into double precision, so just
+	| write these ones off as "too darn small".
+1:	fp_set_sr FPSR_EXC_UNFL		| Set UNFL bit
+	clr.l	(%a0)
+	clr.l	-(%a0)
+	move.w	#0x3c01,-(%a0)		| i.e. 2**-1022
+	addq.l	#6,%a0
+	moveq	#1,%d0
+	jra	fp_nd_round		| round.
+	| Exponent overflow.  Just call it infinity.
+fp_nd_large:
+	move.w	#0x7ff,%d0
+	and.w	(6,%a0),%d0
+	jeq	1f
+	fp_set_sr FPSR_EXC_INEX2
+1:	fp_set_sr FPSR_EXC_OVFL
+	move.w	(FPD_RND,FPDATA),%d2
+	jne	3f			| %d2 = 0 round to nearest
+1:	move.w	#0x7fff,(-2,%a0)
+	clr.l	(%a0)+
+	clr.l	(%a0)
+2:	subq.l	#8,%a0
+	printf	PNORM,"%p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,")\n"
+	rts
+3:	subq.w	#2,%d2
+	jcs	5f			| %d2 < 2, round to zero
+	jhi	4f			| %d2 > 2, round to +infinity
+	tst.b	(-3,%a0)		| to -inf
+	jne	1b
+	jra	5f
+4:	tst.b	(-3,%a0)		| to +inf
+	jeq	1b
+5:	move.w	#0x43fe,(-2,%a0)
+	moveq	#-1,%d0
+	move.l	%d0,(%a0)+
+	move.w	#0xf800,%d0
+	move.l	%d0,(%a0)
+	jra	2b
+	| Infinities or NaNs
+fp_nd_huge:
+	subq.l	#4,%a0
+	printf	PNORM,"%p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,")\n"
+	rts
+
+	| fp_normalize_single:
+	| normalize an extended with single (23-bit) precision
+	| args:	 %a0 (struct fp_ext *)
+
+fp_normalize_single:
+	printf	PNORM,"ns: %p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,") "
+	addq.l	#2,%a0
+	move.w	(%a0)+,%d2
+	jeq	fp_ns_zero		| zero / denormalized
+	cmp.w	#0x7fff,%d2
+	jeq	fp_ns_huge		| NaN / infinity.
+	sub.w	#0x4000-0x7f,%d2	| will the exponent fit?
+	jcs	fp_ns_small		| too small.
+	cmp.w	#0xfe,%d2
+	jcc	fp_ns_large		| too big.
+	move.l	(%a0)+,%d0		| get high lword of mantissa
+fp_ns_round:
+	tst.l	(%a0)			| check the low lword
+	jeq	1f
+	| Set a sticky bit if it is non-zero.  This should only
+	| affect the rounding in what would otherwise be equal-
+	| distance situations, which is what we want it to do.
+	bset	#0,%d0
+1:	clr.l	(%a0)			| zap it from memory.
+	| now, round off the low 8 bits of the hi lword.
+	tst.b	%d0			| 8 low bits.
+	jne	fp_ns_checkround	| Are they non-zero?
+	| nothing to do here
+	subq.l	#8,%a0
+	printf	PNORM,"%p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,")\n"
+	rts
+fp_ns_checkround:
+	fp_set_sr FPSR_EXC_INEX2	| INEX2 bit
+	clr.b	-(%a0)			| clear low byte of high lword
+	subq.l	#3,%a0
+	move.w	(FPD_RND,FPDATA),%d2	| rounding mode
+	jne	2f			| %d2 == 0, round to nearest
+	tst.b	%d0			| test guard bit
+	jpl	9f			| zero is closer
+	btst	#8,%d0			| test lsb bit
+	| round to even behaviour, see above.
+	jne	fp_ns_doroundup		| round to infinity
+	lsl.b	#1,%d0			| check low bits
+	jeq	9f			| round to zero
+fp_ns_doroundup:
+	| round (the mantissa, that is) towards infinity
+	add.l	#0x100,(%a0)
+	jcc	9f			| no overflow, good.
+	| Overflow.  This means that the %d1 was 0xffffff00, so it
+	| is now zero.  We will set the mantissa to reflect this, and
+	| increment the exponent (checking for overflow there too)
+	move.w	#0x8000,(%a0)
+	addq.w	#1,-(%a0)
+	cmp.w	#0x407f,(%a0)+		| exponent now overflown?
+	jeq	fp_ns_large		| yes, so make it infinity.
+9:	subq.l	#4,%a0
+	printf	PNORM,"%p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,")\n"
+	rts
+	| check nondefault rounding modes
+2:	subq.w	#2,%d2
+	jcs	9b			| %d2 < 2, round to zero
+	jhi	3f			| %d2 > 2, round to +infinity
+	tst.b	(-3,%a0)		| to -inf
+	jne	fp_ns_doroundup		| negative, round to infinity
+	jra	9b			| positive, round to zero
+3:	tst.b	(-3,%a0)		| to +inf
+	jeq	fp_ns_doroundup		| positive, round to infinity
+	jra	9b			| negative, round to zero
+	| Exponent underflow.  Try to make a denormal, and set it to
+	| the smallest possible fraction if this fails.
+fp_ns_small:
+	fp_set_sr FPSR_EXC_UNFL		| set UNFL bit
+	move.w	#0x3f81,(-2,%a0)	| 2**-126
+	neg.w	%d2			| degree of underflow
+	cmp.w	#32,%d2			| single or double shift?
+	jcc	2f
+	| a 32-bit shift.
+	move.l	(%a0),%d0
+	move.l	%d0,%d1
+	lsr.l	%d2,%d0
+	move.l	%d0,(%a0)+
+	| Check to see if we shifted off any significant bits.
+	neg.w	%d2
+	add.w	#32,%d2
+	lsl.l	%d2,%d1
+	jeq	1f
+	bset	#0,%d0			| Sticky bit.
+	| Check the lower lword
+1:	tst.l	(%a0)
+	jeq	fp_ns_round
+	clr	(%a0)
+	bset	#0,%d0			| Sticky bit.
+	jra	fp_ns_round
+	| Sorry, the number is just too small.
+2:	clr.l	(%a0)+
+	clr.l	(%a0)
+	moveq	#1,%d0			| Smallest possible fraction,
+	jra	fp_ns_round		| round as desired.
+	| Exponent overflow.  Just call it infinity.
+fp_ns_large:
+	tst.b	(3,%a0)
+	jeq	1f
+	fp_set_sr FPSR_EXC_INEX2
+1:	fp_set_sr FPSR_EXC_OVFL
+	move.w	(FPD_RND,FPDATA),%d2
+	jne	3f			| %d2 = 0 round to nearest
+1:	move.w	#0x7fff,(-2,%a0)
+	clr.l	(%a0)+
+	clr.l	(%a0)
+2:	subq.l	#8,%a0
+	printf	PNORM,"%p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,")\n"
+	rts
+3:	subq.w	#2,%d2
+	jcs	5f			| %d2 < 2, round to zero
+	jhi	4f			| %d2 > 2, round to +infinity
+	tst.b	(-3,%a0)		| to -inf
+	jne	1b
+	jra	5f
+4:	tst.b	(-3,%a0)		| to +inf
+	jeq	1b
+5:	move.w	#0x407e,(-2,%a0)
+	move.l	#0xffffff00,(%a0)+
+	clr.l	(%a0)
+	jra	2b
+	| zero and denormalized
+fp_ns_zero:
+	tst.l	(%a0)+
+	jne	1f
+	tst.l	(%a0)
+	jne	1f
+	subq.l	#8,%a0
+	printf	PNORM,"%p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,")\n"
+	rts				| zero.  nothing to do.
+	| These are not merely subnormal numbers, but true denormals,
+	| i.e. pathologically small (exponent is 2**-16383) numbers.
+	| It is clearly impossible for even a normal extended number
+	| with that exponent to fit into single precision, so just
+	| write these ones off as "too darn small".
+1:	fp_set_sr FPSR_EXC_UNFL		| Set UNFL bit
+	clr.l	(%a0)
+	clr.l	-(%a0)
+	move.w	#0x3f81,-(%a0)		| i.e. 2**-126
+	addq.l	#6,%a0
+	moveq	#1,%d0
+	jra	fp_ns_round		| round.
+	| Infinities or NaNs
+fp_ns_huge:
+	subq.l	#4,%a0
+	printf	PNORM,"%p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,")\n"
+	rts
+
+	| fp_normalize_single_fast:
+	| normalize an extended with single (23-bit) precision
+	| this is only used by fsgldiv/fsglmul, where the
+	| operand is not completely normalized.
+	| args:	 %a0 (struct fp_ext *)
+
+fp_normalize_single_fast:
+	printf	PNORM,"nsf: %p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,") "
+	addq.l	#2,%a0
+	move.w	(%a0)+,%d2
+	cmp.w	#0x7fff,%d2
+	jeq	fp_nsf_huge		| NaN / infinity.
+	move.l	(%a0)+,%d0		| get high lword of mantissa
+fp_nsf_round:
+	tst.l	(%a0)			| check the low lword
+	jeq	1f
+	| Set a sticky bit if it is non-zero.  This should only
+	| affect the rounding in what would otherwise be equal-
+	| distance situations, which is what we want it to do.
+	bset	#0,%d0
+1:	clr.l	(%a0)			| zap it from memory.
+	| now, round off the low 8 bits of the hi lword.
+	tst.b	%d0			| 8 low bits.
+	jne	fp_nsf_checkround	| Are they non-zero?
+	| nothing to do here
+	subq.l	#8,%a0
+	printf	PNORM,"%p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,")\n"
+	rts
+fp_nsf_checkround:
+	fp_set_sr FPSR_EXC_INEX2	| INEX2 bit
+	clr.b	-(%a0)			| clear low byte of high lword
+	subq.l	#3,%a0
+	move.w	(FPD_RND,FPDATA),%d2	| rounding mode
+	jne	2f			| %d2 == 0, round to nearest
+	tst.b	%d0			| test guard bit
+	jpl	9f			| zero is closer
+	btst	#8,%d0			| test lsb bit
+	| round to even behaviour, see above.
+	jne	fp_nsf_doroundup		| round to infinity
+	lsl.b	#1,%d0			| check low bits
+	jeq	9f			| round to zero
+fp_nsf_doroundup:
+	| round (the mantissa, that is) towards infinity
+	add.l	#0x100,(%a0)
+	jcc	9f			| no overflow, good.
+	| Overflow.  This means that the %d1 was 0xffffff00, so it
+	| is now zero.  We will set the mantissa to reflect this, and
+	| increment the exponent (checking for overflow there too)
+	move.w	#0x8000,(%a0)
+	addq.w	#1,-(%a0)
+	cmp.w	#0x407f,(%a0)+		| exponent now overflown?
+	jeq	fp_nsf_large		| yes, so make it infinity.
+9:	subq.l	#4,%a0
+	printf	PNORM,"%p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,")\n"
+	rts
+	| check nondefault rounding modes
+2:	subq.w	#2,%d2
+	jcs	9b			| %d2 < 2, round to zero
+	jhi	3f			| %d2 > 2, round to +infinity
+	tst.b	(-3,%a0)		| to -inf
+	jne	fp_nsf_doroundup	| negative, round to infinity
+	jra	9b			| positive, round to zero
+3:	tst.b	(-3,%a0)		| to +inf
+	jeq	fp_nsf_doroundup		| positive, round to infinity
+	jra	9b			| negative, round to zero
+	| Exponent overflow.  Just call it infinity.
+fp_nsf_large:
+	tst.b	(3,%a0)
+	jeq	1f
+	fp_set_sr FPSR_EXC_INEX2
+1:	fp_set_sr FPSR_EXC_OVFL
+	move.w	(FPD_RND,FPDATA),%d2
+	jne	3f			| %d2 = 0 round to nearest
+1:	move.w	#0x7fff,(-2,%a0)
+	clr.l	(%a0)+
+	clr.l	(%a0)
+2:	subq.l	#8,%a0
+	printf	PNORM,"%p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,")\n"
+	rts
+3:	subq.w	#2,%d2
+	jcs	5f			| %d2 < 2, round to zero
+	jhi	4f			| %d2 > 2, round to +infinity
+	tst.b	(-3,%a0)		| to -inf
+	jne	1b
+	jra	5f
+4:	tst.b	(-3,%a0)		| to +inf
+	jeq	1b
+5:	move.w	#0x407e,(-2,%a0)
+	move.l	#0xffffff00,(%a0)+
+	clr.l	(%a0)
+	jra	2b
+	| Infinities or NaNs
+fp_nsf_huge:
+	subq.l	#4,%a0
+	printf	PNORM,"%p(",1,%a0
+	printx	PNORM,%a0@
+	printf	PNORM,")\n"
+	rts
+
+	| conv_ext2int (macro):
+	| Generates a subroutine that converts an extended value to an
+	| integer of a given size, again, with the appropriate type of
+	| rounding.
+
+	| Macro arguments:
+	| s:	size, as given in an assembly instruction.
+	| b:	number of bits in that size.
+
+	| Subroutine arguments:
+	| %a0:	source (struct fp_ext *)
+
+	| Returns the integer in %d0 (like it should)
+
+.macro conv_ext2int s,b
+	.set	inf,(1<<(\b-1))-1	| i.e. MAXINT
+	printf	PCONV,"e2i%d: %p(",2,#\b,%a0
+	printx	PCONV,%a0@
+	printf	PCONV,") "
+	addq.l	#2,%a0
+	move.w	(%a0)+,%d2		| exponent
+	jeq	fp_e2i_zero\b		| zero / denorm (== 0, here)
+	cmp.w	#0x7fff,%d2
+	jeq	fp_e2i_huge\b		| Inf / NaN
+	sub.w	#0x3ffe,%d2
+	jcs	fp_e2i_small\b
+	cmp.w	#\b,%d2
+	jhi	fp_e2i_large\b
+	move.l	(%a0),%d0
+	move.l	%d0,%d1
+	lsl.l	%d2,%d1
+	jne	fp_e2i_round\b
+	tst.l	(4,%a0)
+	jne	fp_e2i_round\b
+	neg.w	%d2
+	add.w	#32,%d2
+	lsr.l	%d2,%d0
+9:	tst.w	(-4,%a0)
+	jne	1f
+	tst.\s	%d0
+	jmi	fp_e2i_large\b
+	printf	PCONV,"-> %p\n",1,%d0
+	rts
+1:	neg.\s	%d0
+	jeq	1f
+	jpl	fp_e2i_large\b
+1:	printf	PCONV,"-> %p\n",1,%d0
+	rts
+fp_e2i_round\b:
+	fp_set_sr FPSR_EXC_INEX2	| INEX2 bit
+	neg.w	%d2
+	add.w	#32,%d2
+	.if	\b>16
+	jeq	5f
+	.endif
+	lsr.l	%d2,%d0
+	move.w	(FPD_RND,FPDATA),%d2	| rounding mode
+	jne	2f			| %d2 == 0, round to nearest
+	tst.l	%d1			| test guard bit
+	jpl	9b			| zero is closer
+	btst	%d2,%d0			| test lsb bit (%d2 still 0)
+	jne	fp_e2i_doroundup\b
+	lsl.l	#1,%d1			| check low bits
+	jne	fp_e2i_doroundup\b
+	tst.l	(4,%a0)
+	jeq	9b
+fp_e2i_doroundup\b:
+	addq.l	#1,%d0
+	jra	9b
+	| check nondefault rounding modes
+2:	subq.w	#2,%d2
+	jcs	9b			| %d2 < 2, round to zero
+	jhi	3f			| %d2 > 2, round to +infinity
+	tst.w	(-4,%a0)		| to -inf
+	jne	fp_e2i_doroundup\b	| negative, round to infinity
+	jra	9b			| positive, round to zero
+3:	tst.w	(-4,%a0)		| to +inf
+	jeq	fp_e2i_doroundup\b	| positive, round to infinity
+	jra	9b	| negative, round to zero
+	| we only want -2**127 to be rounded correctly here,
+	| since the guard bit is in the lower lword;
+	| everything else ends up as overflow anyway.
+	.if	\b>16
+5:	move.w	(FPD_RND,FPDATA),%d2	| rounding mode
+	jne	2b			| %d2 == 0, round to nearest
+	move.l	(4,%a0),%d1		| test guard bit
+	jpl	9b			| zero is closer
+	lsl.l	#1,%d1			| check low bits
+	jne	fp_e2i_doroundup\b
+	jra	9b
+	.endif
+fp_e2i_zero\b:
+	clr.l	%d0
+	tst.l	(%a0)+
+	jne	1f
+	tst.l	(%a0)
+	jeq	3f
+1:	subq.l	#4,%a0
+	fp_clr_sr FPSR_EXC_UNFL		| fp_normalize_ext has set this bit
+fp_e2i_small\b:
+	fp_set_sr FPSR_EXC_INEX2
+	clr.l	%d0
+	move.w	(FPD_RND,FPDATA),%d2	| rounding mode
+	subq.w	#2,%d2
+	jcs	3f			| %d2 < 2, round to nearest/zero
+	jhi	2f			| %d2 > 2, round to +infinity
+	tst.w	(-4,%a0)		| to -inf
+	jeq	3f
+	subq.\s	#1,%d0
+	jra	3f
+2:	tst.w	(-4,%a0)		| to +inf
+	jne	3f
+	addq.\s	#1,%d0
+3:	printf	PCONV,"-> %p\n",1,%d0
+	rts
+fp_e2i_large\b:
+	fp_set_sr FPSR_EXC_OPERR
+	move.\s	#inf,%d0
+	tst.w	(-4,%a0)
+	jeq	1f
+	addq.\s	#1,%d0
+1:	printf	PCONV,"-> %p\n",1,%d0
+	rts
+fp_e2i_huge\b:
+	move.\s	(%a0),%d0
+	tst.l	(%a0)
+	jne	1f
+	tst.l	(4,%a0)
+	jeq	fp_e2i_large\b
+	| fp_normalize_ext has set this bit already
+	| and made the number nonsignaling
+1:	fp_tst_sr FPSR_EXC_SNAN
+	jne	1f
+	fp_set_sr FPSR_EXC_OPERR
+1:	printf	PCONV,"-> %p\n",1,%d0
+	rts
+.endm
+
+fp_conv_ext2long:
+	conv_ext2int l,32
+
+fp_conv_ext2short:
+	conv_ext2int w,16
+
+fp_conv_ext2byte:
+	conv_ext2int b,8
+
+fp_conv_ext2double:
+	jsr	fp_normalize_double
+	printf	PCONV,"e2d: %p(",1,%a0
+	printx	PCONV,%a0@
+	printf	PCONV,"), "
+	move.l	(%a0)+,%d2
+	cmp.w	#0x7fff,%d2
+	jne	1f
+	move.w	#0x7ff,%d2
+	move.l	(%a0)+,%d0
+	jra	2f
+1:	sub.w	#0x3fff-0x3ff,%d2
+	move.l	(%a0)+,%d0
+	jmi	2f
+	clr.w	%d2
+2:	lsl.w	#5,%d2
+	lsl.l	#7,%d2
+	lsl.l	#8,%d2
+	move.l	%d0,%d1
+	lsl.l	#1,%d0
+	lsr.l	#4,%d0
+	lsr.l	#8,%d0
+	or.l	%d2,%d0
+	putuser.l %d0,(%a1)+,fp_err_ua2,%a1
+	moveq	#21,%d0
+	lsl.l	%d0,%d1
+	move.l	(%a0),%d0
+	lsr.l	#4,%d0
+	lsr.l	#7,%d0
+	or.l	%d1,%d0
+	putuser.l %d0,(%a1),fp_err_ua2,%a1
+#ifdef FPU_EMU_DEBUG
+	getuser.l %a1@(-4),%d0,fp_err_ua2,%a1
+	getuser.l %a1@(0),%d1,fp_err_ua2,%a1
+	printf	PCONV,"%p(%08x%08x)\n",3,%a1,%d0,%d1
+#endif
+	rts
+
+fp_conv_ext2single:
+	jsr	fp_normalize_single
+	printf	PCONV,"e2s: %p(",1,%a0
+	printx	PCONV,%a0@
+	printf	PCONV,"), "
+	move.l	(%a0)+,%d1
+	cmp.w	#0x7fff,%d1
+	jne	1f
+	move.w	#0xff,%d1
+	move.l	(%a0)+,%d0
+	jra	2f
+1:	sub.w	#0x3fff-0x7f,%d1
+	move.l	(%a0)+,%d0
+	jmi	2f
+	clr.w	%d1
+2:	lsl.w	#8,%d1
+	lsl.l	#7,%d1
+	lsl.l	#8,%d1
+	bclr	#31,%d0
+	lsr.l	#8,%d0
+	or.l	%d1,%d0
+	printf	PCONV,"%08x\n",1,%d0
+	rts
+
+	| special return addresses for instr that
+	| encode the rounding precision in the opcode
+	| (e.g. fsmove,fdmove)
+
+fp_finalrounding_single:
+	addq.l	#8,%sp
+	jsr	fp_normalize_ext
+	jsr	fp_normalize_single
+	jra	fp_finaltest
+
+fp_finalrounding_single_fast:
+	addq.l	#8,%sp
+	jsr	fp_normalize_ext
+	jsr	fp_normalize_single_fast
+	jra	fp_finaltest
+
+fp_finalrounding_double:
+	addq.l	#8,%sp
+	jsr	fp_normalize_ext
+	jsr	fp_normalize_double
+	jra	fp_finaltest
+
+	| fp_finaltest:
+	| set the emulated status register based on the outcome of an
+	| emulated instruction.
+
+fp_finalrounding:
+	addq.l	#8,%sp
+|	printf	,"f: %p\n",1,%a0
+	jsr	fp_normalize_ext
+	move.w	(FPD_PREC,FPDATA),%d0
+	subq.w	#1,%d0
+	jcs	fp_finaltest
+	jne	1f
+	jsr	fp_normalize_single
+	jra	2f
+1:	jsr	fp_normalize_double
+2:|	printf	,"f: %p\n",1,%a0
+fp_finaltest:
+	| First, we do some of the obvious tests for the exception
+	| status byte and condition code bytes of fp_sr here, so that
+	| they do not have to be handled individually by every
+	| emulated instruction.
+	clr.l	%d0
+	addq.l	#1,%a0
+	tst.b	(%a0)+			| sign
+	jeq	1f
+	bset	#FPSR_CC_NEG-24,%d0	| N bit
+1:	cmp.w	#0x7fff,(%a0)+		| exponent
+	jeq	2f
+	| test for zero
+	moveq	#FPSR_CC_Z-24,%d1
+	tst.l	(%a0)+
+	jne	9f
+	tst.l	(%a0)
+	jne	9f
+	jra	8f
+	| infinity and NaN
+2:	moveq	#FPSR_CC_NAN-24,%d1
+	move.l	(%a0)+,%d2
+	lsl.l	#1,%d2			| ignore high bit
+	jne	8f
+	tst.l	(%a0)
+	jne	8f
+	moveq	#FPSR_CC_INF-24,%d1
+8:	bset	%d1,%d0
+9:	move.b	%d0,(FPD_FPSR+0,FPDATA)	| set condition test result
+	| move instructions enter here
+	| Here, we test things in the exception status byte, and set
+	| other things in the accrued exception byte accordingly.
+	| Emulated instructions can set various things in the former,
+	| as defined in fp_emu.h.
+fp_final:
+	move.l	(FPD_FPSR,FPDATA),%d0
+#if 0
+	btst	#FPSR_EXC_SNAN,%d0	| EXC_SNAN
+	jne	1f
+	btst	#FPSR_EXC_OPERR,%d0	| EXC_OPERR
+	jeq	2f
+1:	bset	#FPSR_AEXC_IOP,%d0	| set IOP bit
+2:	btst	#FPSR_EXC_OVFL,%d0	| EXC_OVFL
+	jeq	1f
+	bset	#FPSR_AEXC_OVFL,%d0	| set OVFL bit
+1:	btst	#FPSR_EXC_UNFL,%d0	| EXC_UNFL
+	jeq	1f
+	btst	#FPSR_EXC_INEX2,%d0	| EXC_INEX2
+	jeq	1f
+	bset	#FPSR_AEXC_UNFL,%d0	| set UNFL bit
+1:	btst	#FPSR_EXC_DZ,%d0	| EXC_DZ
+	jeq	1f
+	bset	#FPSR_AEXC_DZ,%d0	| set DZ bit
+1:	btst	#FPSR_EXC_OVFL,%d0	| EXC_OVFL
+	jne	1f
+	btst	#FPSR_EXC_INEX2,%d0	| EXC_INEX2
+	jne	1f
+	btst	#FPSR_EXC_INEX1,%d0	| EXC_INEX1
+	jeq	2f
+1:	bset	#FPSR_AEXC_INEX,%d0	| set INEX bit
+2:	move.l	%d0,(FPD_FPSR,FPDATA)
+#else
+	| same as above, greatly optimized, but untested (yet)
+	move.l	%d0,%d2
+	lsr.l	#5,%d0
+	move.l	%d0,%d1
+	lsr.l	#4,%d1
+	or.l	%d0,%d1
+	and.b	#0x08,%d1
+	move.l	%d2,%d0
+	lsr.l	#6,%d0
+	or.l	%d1,%d0
+	move.l	%d2,%d1
+	lsr.l	#4,%d1
+	or.b	#0xdf,%d1
+	and.b	%d1,%d0
+	move.l	%d2,%d1
+	lsr.l	#7,%d1
+	and.b	#0x80,%d1
+	or.b	%d1,%d0
+	and.b	#0xf8,%d0
+	or.b	%d0,%d2
+	move.l	%d2,(FPD_FPSR,FPDATA)
+#endif
+	move.b	(FPD_FPSR+2,FPDATA),%d0
+	and.b	(FPD_FPCR+2,FPDATA),%d0
+	jeq	1f
+	printf	,"send signal!!!\n"
+1:	jra	fp_end
diff --git a/arch/m68k/math-emu/multi_arith.h b/arch/m68k/math-emu/multi_arith.h
new file mode 100644
index 0000000..02251e5
--- /dev/null
+++ b/arch/m68k/math-emu/multi_arith.h
@@ -0,0 +1,819 @@
+/* multi_arith.h: multi-precision integer arithmetic functions, needed
+   to do extended-precision floating point.
+
+   (c) 1998 David Huggins-Daines.
+
+   Somewhat based on arch/alpha/math-emu/ieee-math.c, which is (c)
+   David Mosberger-Tang.
+
+   You may copy, modify, and redistribute this file under the terms of
+   the GNU General Public License, version 2, or any later version, at
+   your convenience. */
+
+/* Note:
+
+   These are not general multi-precision math routines.  Rather, they
+   implement the subset of integer arithmetic that we need in order to
+   multiply, divide, and normalize 128-bit unsigned mantissae.  */
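+
+/* For instance, fp_multiplymant below multiplies two 64-bit mantissae
+   into a 128-bit product, which fp_putmant128 then packs back into the
+   extended-precision format, keeping a sticky bit for rounding. */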
+
+#ifndef MULTI_ARITH_H
+#define MULTI_ARITH_H
+
+#if 0	/* old code... */
+
+/* Unsigned only, because we don't need signs to multiply and divide. */
+typedef unsigned int int128[4];
+
+/* Word order */
+enum {
+	MSW128,
+	NMSW128,
+	NLSW128,
+	LSW128
+};
+
+/* big-endian */
+#define LO_WORD(ll) (((unsigned int *) &ll)[1])
+#define HI_WORD(ll) (((unsigned int *) &ll)[0])
+
+/* Convenience functions to stuff various integer values into int128s */
+
+static inline void zero128(int128 a)
+{
+	a[LSW128] = a[NLSW128] = a[NMSW128] = a[MSW128] = 0;
+}
+
+/* Human-readable word order in the arguments */
+static inline void set128(unsigned int i3, unsigned int i2, unsigned int i1,
+			  unsigned int i0, int128 a)
+{
+	a[LSW128] = i0;
+	a[NLSW128] = i1;
+	a[NMSW128] = i2;
+	a[MSW128] = i3;
+}
+
+/* Convenience functions (for testing as well) */
+static inline void int64_to_128(unsigned long long src, int128 dest)
+{
+	dest[LSW128] = (unsigned int) src;
+	dest[NLSW128] = src >> 32;
+	dest[NMSW128] = dest[MSW128] = 0;
+}
+
+static inline void int128_to_64(const int128 src, unsigned long long *dest)
+{
+	*dest = src[LSW128] | (long long) src[NLSW128] << 32;
+}
+
+static inline void put_i128(const int128 a)
+{
+	printk("%08x %08x %08x %08x\n", a[MSW128], a[NMSW128],
+	       a[NLSW128], a[LSW128]);
+}
+
+/* Internal shifters:
+
+   Note that these are only good for 0 < count < 32, since a count of 0
+   or of 32 and above would make the "count" or "32 - count" shift
+   amounts fall outside the valid 0..31 range; such counts are handled
+   by the word-moving wrappers below.
+ */
+
+static inline void _lsl128(unsigned int count, int128 a)
+{
+	a[MSW128] = (a[MSW128] << count) | (a[NMSW128] >> (32 - count));
+	a[NMSW128] = (a[NMSW128] << count) | (a[NLSW128] >> (32 - count));
+	a[NLSW128] = (a[NLSW128] << count) | (a[LSW128] >> (32 - count));
+	a[LSW128] <<= count;
+}
+
+static inline void _lsr128(unsigned int count, int128 a)
+{
+	a[LSW128] = (a[LSW128] >> count) | (a[NLSW128] << (32 - count));
+	a[NLSW128] = (a[NLSW128] >> count) | (a[NMSW128] << (32 - count));
+	a[NMSW128] = (a[NMSW128] >> count) | (a[MSW128] << (32 - count));
+	a[MSW128] >>= count;
+}
+
+/* Should be faster, one would hope */
+
+static inline void lslone128(int128 a)
+{
+	asm volatile ("lsl.l #1,%0\n"
+		      "roxl.l #1,%1\n"
+		      "roxl.l #1,%2\n"
+		      "roxl.l #1,%3\n"
+		      :
+		      "=d" (a[LSW128]),
+		      "=d"(a[NLSW128]),
+		      "=d"(a[NMSW128]),
+		      "=d"(a[MSW128])
+		      :
+		      "0"(a[LSW128]),
+		      "1"(a[NLSW128]),
+		      "2"(a[NMSW128]),
+		      "3"(a[MSW128]));
+}
+
+static inline void lsrone128(int128 a)
+{
+	asm volatile ("lsr.l #1,%0\n"
+		      "roxr.l #1,%1\n"
+		      "roxr.l #1,%2\n"
+		      "roxr.l #1,%3\n"
+		      :
+		      "=d" (a[MSW128]),
+		      "=d"(a[NMSW128]),
+		      "=d"(a[NLSW128]),
+		      "=d"(a[LSW128])
+		      :
+		      "0"(a[MSW128]),
+		      "1"(a[NMSW128]),
+		      "2"(a[NLSW128]),
+		      "3"(a[LSW128]));
+}
+
+/* Generalized 128-bit shifters:
+
+   These bit-shift to a multiple of 32, then move whole longwords.  */
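+
+/* For example, lsl128(40, a) first shifts every longword left by 8 bits
+   (40 % 32) and then moves each longword up by one position (40 / 32),
+   zero-filling the vacated low word. */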
+
+static inline void lsl128(unsigned int count, int128 a)
+{
+	int wordcount, i;
+
+	if (count % 32)
+		_lsl128(count % 32, a);
+
+	if (0 == (wordcount = count / 32))
+		return;
+
+	/* argh, gak, endian-sensitive */
+	for (i = 0; i < 4 - wordcount; i++) {
+		a[i] = a[i + wordcount];
+	}
+	for (i = 3; i >= 4 - wordcount; --i) {
+		a[i] = 0;
+	}
+}
+
+static inline void lsr128(unsigned int count, int128 a)
+{
+	int wordcount, i;
+
+	if (count % 32)
+		_lsr128(count % 32, a);
+
+	if (0 == (wordcount = count / 32))
+		return;
+
+	for (i = 3; i >= wordcount; --i) {
+		a[i] = a[i - wordcount];
+	}
+	for (i = 0; i < wordcount; i++) {
+		a[i] = 0;
+	}
+}
+
+static inline void orl128(int a, int128 b)
+{
+	b[LSW128] |= a;
+}
+
+static inline int btsthi128(const int128 a)
+{
+	return a[MSW128] & 0x80000000;
+}
+
+/* test bits (numbered from 0 = LSB) up to and including "top" */
+static inline int bftestlo128(int top, const int128 a)
+{
+	int r = 0;
+
+	if (top > 31)
+		r |= a[LSW128];
+	if (top > 63)
+		r |= a[NLSW128];
+	if (top > 95)
+		r |= a[NMSW128];
+
+	r |= a[3 - (top / 32)] & ((1 << (top % 32 + 1)) - 1);
+
+	return (r != 0);
+}
+
+/* Aargh.  We need these because GCC is broken */
+/* FIXME: do them in assembly, for goodness' sake! */
+static inline void mask64(int pos, unsigned long long *mask)
+{
+	*mask = 0;
+
+	if (pos < 32) {
+		LO_WORD(*mask) = (1 << pos) - 1;
+		return;
+	}
+	LO_WORD(*mask) = -1;
+	HI_WORD(*mask) = (1 << (pos - 32)) - 1;
+}
+
+static inline void bset64(int pos, unsigned long long *dest)
+{
+	/* This conditional will be optimized away.  Thanks, GCC! */
+	if (pos < 32)
+		asm volatile ("bset %1,%0":"=m"
+			      (LO_WORD(*dest)):"id"(pos));
+	else
+		asm volatile ("bset %1,%0":"=m"
+			      (HI_WORD(*dest)):"id"(pos - 32));
+}
+
+static inline int btst64(int pos, unsigned long long dest)
+{
+	if (pos < 32)
+		return (0 != (LO_WORD(dest) & (1 << pos)));
+	else
+		return (0 != (HI_WORD(dest) & (1 << (pos - 32))));
+}
+
+static inline void lsl64(int count, unsigned long long *dest)
+{
+	if (count < 32) {
+		HI_WORD(*dest) = (HI_WORD(*dest) << count)
+		    | (LO_WORD(*dest) >> count);
+		LO_WORD(*dest) <<= count;
+		return;
+	}
+	count -= 32;
+	HI_WORD(*dest) = LO_WORD(*dest) << count;
+	LO_WORD(*dest) = 0;
+}
+
+static inline void lsr64(int count, unsigned long long *dest)
+{
+	if (count < 32) {
+		LO_WORD(*dest) = (LO_WORD(*dest) >> count)
+		    | (HI_WORD(*dest) << (32 - count));
+		HI_WORD(*dest) >>= count;
+		return;
+	}
+	count -= 32;
+	LO_WORD(*dest) = HI_WORD(*dest) >> count;
+	HI_WORD(*dest) = 0;
+}
+#endif
+
+static inline void fp_denormalize(struct fp_ext *reg, unsigned int cnt)
+{
+	reg->exp += cnt;
+
+	switch (cnt) {
+	case 0 ... 8:
+		reg->lowmant = reg->mant.m32[1] << (8 - cnt);
+		reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) |
+				   (reg->mant.m32[0] << (32 - cnt));
+		reg->mant.m32[0] = reg->mant.m32[0] >> cnt;
+		break;
+	case 9 ... 32:
+		reg->lowmant = reg->mant.m32[1] >> (cnt - 8);
+		if (reg->mant.m32[1] << (40 - cnt))
+			reg->lowmant |= 1;
+		reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) |
+				   (reg->mant.m32[0] << (32 - cnt));
+		reg->mant.m32[0] = reg->mant.m32[0] >> cnt;
+		break;
+	case 33 ... 39:
+		asm volatile ("bfextu %1{%2,#8},%0" : "=d" (reg->lowmant)
+			: "m" (reg->mant.m32[0]), "d" (64 - cnt));
+		if (reg->mant.m32[1] << (40 - cnt))
+			reg->lowmant |= 1;
+		reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32);
+		reg->mant.m32[0] = 0;
+		break;
+	case 40 ... 71:
+		reg->lowmant = reg->mant.m32[0] >> (cnt - 40);
+		if ((reg->mant.m32[0] << (72 - cnt)) || reg->mant.m32[1])
+			reg->lowmant |= 1;
+		reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32);
+		reg->mant.m32[0] = 0;
+		break;
+	default:
+		reg->lowmant = reg->mant.m32[0] || reg->mant.m32[1];
+		reg->mant.m32[0] = 0;
+		reg->mant.m32[1] = 0;
+		break;
+	}
+}
+
+static inline int fp_overnormalize(struct fp_ext *reg)
+{
+	int shift;
+
+	if (reg->mant.m32[0]) {
+		asm ("bfffo %1{#0,#32},%0" : "=d" (shift) : "dm" (reg->mant.m32[0]));
+		reg->mant.m32[0] = (reg->mant.m32[0] << shift) | (reg->mant.m32[1] >> (32 - shift));
+		reg->mant.m32[1] = (reg->mant.m32[1] << shift);
+	} else {
+		asm ("bfffo %1{#0,#32},%0" : "=d" (shift) : "dm" (reg->mant.m32[1]));
+		reg->mant.m32[0] = (reg->mant.m32[1] << shift);
+		reg->mant.m32[1] = 0;
+		shift += 32;
+	}
+
+	return shift;
+}
+
+static inline int fp_addmant(struct fp_ext *dest, struct fp_ext *src)
+{
+	int carry;
+
+	/* we assume here that gcc only inserts a move and a clr instr */
+	asm volatile ("add.b %1,%0" : "=d,g" (dest->lowmant)
+		: "g,d" (src->lowmant), "0,0" (dest->lowmant));
+	asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[1])
+		: "d" (src->mant.m32[1]), "0" (dest->mant.m32[1]));
+	asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[0])
+		: "d" (src->mant.m32[0]), "0" (dest->mant.m32[0]));
+	asm volatile ("addx.l %0,%0" : "=d" (carry) : "0" (0));
+
+	return carry;
+}
+
+static inline int fp_addcarry(struct fp_ext *reg)
+{
+	if (++reg->exp == 0x7fff) {
+		if (reg->mant.m64)
+			fp_set_sr(FPSR_EXC_INEX2);
+		reg->mant.m64 = 0;
+		fp_set_sr(FPSR_EXC_OVFL);
+		return 0;
+	}
+	reg->lowmant = (reg->mant.m32[1] << 7) | (reg->lowmant ? 1 : 0);
+	reg->mant.m32[1] = (reg->mant.m32[1] >> 1) |
+			   (reg->mant.m32[0] << 31);
+	reg->mant.m32[0] = (reg->mant.m32[0] >> 1) | 0x80000000;
+
+	return 1;
+}
+
+static inline void fp_submant(struct fp_ext *dest, struct fp_ext *src1,
+			      struct fp_ext *src2)
+{
+	/* we assume here that gcc only inserts a move and a clr instr */
+	asm volatile ("sub.b %1,%0" : "=d,g" (dest->lowmant)
+		: "g,d" (src2->lowmant), "0,0" (src1->lowmant));
+	asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[1])
+		: "d" (src2->mant.m32[1]), "0" (src1->mant.m32[1]));
+	asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[0])
+		: "d" (src2->mant.m32[0]), "0" (src1->mant.m32[0]));
+}
+
+#define fp_mul64(desth, destl, src1, src2) ({				\
+	asm ("mulu.l %2,%1:%0" : "=d" (destl), "=d" (desth)		\
+		: "g" (src1), "0" (src2));				\
+})
+#define fp_div64(quot, rem, srch, srcl, div)				\
+	asm ("divu.l %2,%1:%0" : "=d" (quot), "=d" (rem)		\
+		: "dm" (div), "1" (srch), "0" (srcl))
+#define fp_add64(dest1, dest2, src1, src2) ({				\
+	asm ("add.l %1,%0" : "=d,dm" (dest2)				\
+		: "dm,d" (src2), "0,0" (dest2));			\
+	asm ("addx.l %1,%0" : "=d" (dest1)				\
+		: "d" (src1), "0" (dest1));				\
+})
+#define fp_addx96(dest, src) ({						\
+	/* we assume here that gcc only inserts a move and a clr instr */	\
+	asm volatile ("add.l %1,%0" : "=d,g" (dest->m32[2])		\
+		: "g,d" (temp.m32[1]), "0,0" (dest->m32[2]));		\
+	asm volatile ("addx.l %1,%0" : "=d" (dest->m32[1])		\
+		: "d" (temp.m32[0]), "0" (dest->m32[1]));		\
+	asm volatile ("addx.l %1,%0" : "=d" (dest->m32[0])		\
+		: "d" (0), "0" (dest->m32[0]));				\
+})
+#define fp_sub64(dest, src) ({						\
+	asm ("sub.l %1,%0" : "=d,dm" (dest.m32[1])			\
+		: "dm,d" (src.m32[1]), "0,0" (dest.m32[1]));		\
+	asm ("subx.l %1,%0" : "=d" (dest.m32[0])			\
+		: "d" (src.m32[0]), "0" (dest.m32[0]));			\
+})
+#define fp_sub96c(dest, srch, srcm, srcl) ({				\
+	char carry;							\
+	asm ("sub.l %1,%0" : "=d,dm" (dest.m32[2])			\
+		: "dm,d" (srcl), "0,0" (dest.m32[2]));			\
+	asm ("subx.l %1,%0" : "=d" (dest.m32[1])			\
+		: "d" (srcm), "0" (dest.m32[1]));			\
+	asm ("subx.l %2,%1; scs %0" : "=d" (carry), "=d" (dest.m32[0])	\
+		: "d" (srch), "1" (dest.m32[0]));			\
+	carry;								\
+})
+
+static inline void fp_multiplymant(union fp_mant128 *dest, struct fp_ext *src1,
+				   struct fp_ext *src2)
+{
+	union fp_mant64 temp;
+
+	fp_mul64(dest->m32[0], dest->m32[1], src1->mant.m32[0], src2->mant.m32[0]);
+	fp_mul64(dest->m32[2], dest->m32[3], src1->mant.m32[1], src2->mant.m32[1]);
+
+	fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[0], src2->mant.m32[1]);
+	fp_addx96(dest, temp);
+
+	fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[1], src2->mant.m32[0]);
+	fp_addx96(dest, temp);
+}
+
+static inline void fp_dividemant(union fp_mant128 *dest, struct fp_ext *src,
+				 struct fp_ext *div)
+{
+	union fp_mant128 tmp;
+	union fp_mant64 tmp64;
+	unsigned long *mantp = dest->m32;
+	unsigned long fix, rem, first, dummy;
+	int i;
+
+	/* the algorithm below requires src to be smaller than div,
+	   but both have the high bit set */
+	if (src->mant.m64 >= div->mant.m64) {
+		fp_sub64(src->mant, div->mant);
+		*mantp = 1;
+	} else
+		*mantp = 0;
+	mantp++;
+
+	/* Basic idea behind this algorithm: we can't divide two 64-bit
+	   numbers (AB/CD) directly, but we can calculate AB/C0 instead.
+	   That quotient is off by a factor of C0/CD, so we multiply the
+	   first result by this factor to fix it; after that we have
+	   nearly the correct result and only a few corrections are
+	   needed. */
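+
+	/* As a rough decimal analogy (illustrative numbers only): to
+	   estimate 7654/23, first compute 7654/20 = 382, then scale by
+	   the factor 20/23 (about 0.87) to get roughly 332, which here
+	   is already the exact quotient; the compare-and-subtract loop
+	   below absorbs any remaining error. */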
+
+	/* C0/CD could be precalculated, but that would be another 64-bit
+	   division.  We can make it easier by dividing through by C first,
+	   giving 10/1D; then only a single shift is needed and the value
+	   fits into 32 bits. */
+	fix = 0x80000000;
+	dummy = div->mant.m32[1] / div->mant.m32[0] + 1;
+	dummy = (dummy >> 1) | fix;
+	fp_div64(fix, dummy, fix, 0, dummy);
+	fix--;
+
+	for (i = 0; i < 3; i++, mantp++) {
+		if (src->mant.m32[0] == div->mant.m32[0]) {
+			fp_div64(first, rem, 0, src->mant.m32[1], div->mant.m32[0]);
+
+			fp_mul64(*mantp, dummy, first, fix);
+			*mantp += fix;
+		} else {
+			fp_div64(first, rem, src->mant.m32[0], src->mant.m32[1], div->mant.m32[0]);
+
+			fp_mul64(*mantp, dummy, first, fix);
+		}
+
+		fp_mul64(tmp.m32[0], tmp.m32[1], div->mant.m32[0], first - *mantp);
+		fp_add64(tmp.m32[0], tmp.m32[1], 0, rem);
+		tmp.m32[2] = 0;
+
+		fp_mul64(tmp64.m32[0], tmp64.m32[1], *mantp, div->mant.m32[1]);
+		fp_sub96c(tmp, 0, tmp64.m32[0], tmp64.m32[1]);
+
+		src->mant.m32[0] = tmp.m32[1];
+		src->mant.m32[1] = tmp.m32[2];
+
+		while (!fp_sub96c(tmp, 0, div->mant.m32[0], div->mant.m32[1])) {
+			src->mant.m32[0] = tmp.m32[1];
+			src->mant.m32[1] = tmp.m32[2];
+			*mantp += 1;
+		}
+	}
+}
+
+#if 0
+static inline unsigned int fp_fls128(union fp_mant128 *src)
+{
+	unsigned long data;
+	unsigned int res, off;
+
+	if ((data = src->m32[0]))
+		off = 0;
+	else if ((data = src->m32[1]))
+		off = 32;
+	else if ((data = src->m32[2]))
+		off = 64;
+	else if ((data = src->m32[3]))
+		off = 96;
+	else
+		return 128;
+
+	asm ("bfffo %1{#0,#32},%0" : "=d" (res) : "dm" (data));
+	return res + off;
+}
+
+static inline void fp_shiftmant128(union fp_mant128 *src, int shift)
+{
+	unsigned long sticky;
+
+	switch (shift) {
+	case 0:
+		return;
+	case 1:
+		asm volatile ("lsl.l #1,%0"
+			: "=d" (src->m32[3]) : "0" (src->m32[3]));
+		asm volatile ("roxl.l #1,%0"
+			: "=d" (src->m32[2]) : "0" (src->m32[2]));
+		asm volatile ("roxl.l #1,%0"
+			: "=d" (src->m32[1]) : "0" (src->m32[1]));
+		asm volatile ("roxl.l #1,%0"
+			: "=d" (src->m32[0]) : "0" (src->m32[0]));
+		return;
+	case 2 ... 31:
+		src->m32[0] = (src->m32[0] << shift) | (src->m32[1] >> (32 - shift));
+		src->m32[1] = (src->m32[1] << shift) | (src->m32[2] >> (32 - shift));
+		src->m32[2] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
+		src->m32[3] = (src->m32[3] << shift);
+		return;
+	case 32 ... 63:
+		shift -= 32;
+		src->m32[0] = (src->m32[1] << shift) | (src->m32[2] >> (32 - shift));
+		src->m32[1] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
+		src->m32[2] = (src->m32[3] << shift);
+		src->m32[3] = 0;
+		return;
+	case 64 ... 95:
+		shift -= 64;
+		src->m32[0] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
+		src->m32[1] = (src->m32[3] << shift);
+		src->m32[2] = src->m32[3] = 0;
+		return;
+	case 96 ... 127:
+		shift -= 96;
+		src->m32[0] = (src->m32[3] << shift);
+		src->m32[1] = src->m32[2] = src->m32[3] = 0;
+		return;
+	case -31 ... -1:
+		shift = -shift;
+		sticky = 0;
+		if (src->m32[3] << (32 - shift))
+			sticky = 1;
+		src->m32[3] = (src->m32[3] >> shift) | (src->m32[2] << (32 - shift)) | sticky;
+		src->m32[2] = (src->m32[2] >> shift) | (src->m32[1] << (32 - shift));
+		src->m32[1] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift));
+		src->m32[0] = (src->m32[0] >> shift);
+		return;
+	case -63 ... -32:
+		shift = -shift - 32;
+		sticky = 0;
+		if ((src->m32[2] << (32 - shift)) || src->m32[3])
+			sticky = 1;
+		src->m32[3] = (src->m32[2] >> shift) | (src->m32[1] << (32 - shift)) | sticky;
+		src->m32[2] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift));
+		src->m32[1] = (src->m32[0] >> shift);
+		src->m32[0] = 0;
+		return;
+	case -95 ... -64:
+		shift = -shift - 64;
+		sticky = 0;
+		if ((src->m32[1] << (32 - shift)) || src->m32[2] || src->m32[3])
+			sticky = 1;
+		src->m32[3] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift)) | sticky;
+		src->m32[2] = (src->m32[0] >> shift);
+		src->m32[1] = src->m32[0] = 0;
+		return;
+	case -127 ... -96:
+		shift = -shift - 96;
+		sticky = 0;
+		if ((src->m32[0] << (32 - shift)) || src->m32[1] || src->m32[2] || src->m32[3])
+			sticky = 1;
+		src->m32[3] = (src->m32[0] >> shift) | sticky;
+		src->m32[2] = src->m32[1] = src->m32[0] = 0;
+		return;
+	}
+
+	if (shift < 0 && (src->m32[0] || src->m32[1] || src->m32[2] || src->m32[3]))
+		src->m32[3] = 1;
+	else
+		src->m32[3] = 0;
+	src->m32[2] = 0;
+	src->m32[1] = 0;
+	src->m32[0] = 0;
+}
+#endif
+
+static inline void fp_putmant128(struct fp_ext *dest, union fp_mant128 *src,
+				 int shift)
+{
+	unsigned long tmp;
+
+	switch (shift) {
+	case 0:
+		dest->mant.m64 = src->m64[0];
+		dest->lowmant = src->m32[2] >> 24;
+		if (src->m32[3] || (src->m32[2] << 8))
+			dest->lowmant |= 1;
+		break;
+	case 1:
+		asm volatile ("lsl.l #1,%0"
+			: "=d" (tmp) : "0" (src->m32[2]));
+		asm volatile ("roxl.l #1,%0"
+			: "=d" (dest->mant.m32[1]) : "0" (src->m32[1]));
+		asm volatile ("roxl.l #1,%0"
+			: "=d" (dest->mant.m32[0]) : "0" (src->m32[0]));
+		dest->lowmant = tmp >> 24;
+		if (src->m32[3] || (tmp << 8))
+			dest->lowmant |= 1;
+		break;
+	case 31:
+		asm volatile ("lsr.l #1,%1; roxr.l #1,%0"
+			: "=d" (dest->mant.m32[0])
+			: "d" (src->m32[0]), "0" (src->m32[1]));
+		asm volatile ("roxr.l #1,%0"
+			: "=d" (dest->mant.m32[1]) : "0" (src->m32[2]));
+		asm volatile ("roxr.l #1,%0"
+			: "=d" (tmp) : "0" (src->m32[3]));
+		dest->lowmant = tmp >> 24;
+		if (src->m32[3] << 7)
+			dest->lowmant |= 1;
+		break;
+	case 32:
+		dest->mant.m32[0] = src->m32[1];
+		dest->mant.m32[1] = src->m32[2];
+		dest->lowmant = src->m32[3] >> 24;
+		if (src->m32[3] << 8)
+			dest->lowmant |= 1;
+		break;
+	}
+}
+
+#if 0 /* old code... */
+static inline int fls(unsigned int a)
+{
+	int r;
+
+	asm volatile ("bfffo %1{#0,#32},%0"
+		      : "=d" (r) : "md" (a));
+	return r;
+}
+
+/* fls = "find last set" (cf. ffs(3)) */
+static inline int fls128(const int128 a)
+{
+	if (a[MSW128])
+		return fls(a[MSW128]);
+	if (a[NMSW128])
+		return fls(a[NMSW128]) + 32;
+	/* XXX: it probably never gets beyond this point in actual
+	   use, but that's indicative of a more general problem in the
+	   algorithm (i.e. as per the actual 68881 implementation, we
+	   really only need at most 67 bits of precision [plus
+	   overflow]) so I'm not going to fix it. */
+	if (a[NLSW128])
+		return fls(a[NLSW128]) + 64;
+	if (a[LSW128])
+		return fls(a[LSW128]) + 96;
+	else
+		return -1;
+}
+
+static inline int zerop128(const int128 a)
+{
+	return !(a[LSW128] | a[NLSW128] | a[NMSW128] | a[MSW128]);
+}
+
+static inline int nonzerop128(const int128 a)
+{
+	return (a[LSW128] | a[NLSW128] | a[NMSW128] | a[MSW128]);
+}
+
+/* Addition and subtraction */
+/* Do these in "pure" assembly, because "extended" asm is unmanageable
+   here */
+static inline void add128(const int128 a, int128 b)
+{
+	/* rotating carry flags */
+	unsigned int carry[2];
+
+	carry[0] = a[LSW128] > (0xffffffff - b[LSW128]);
+	b[LSW128] += a[LSW128];
+
+	carry[1] = a[NLSW128] > (0xffffffff - b[NLSW128] - carry[0]);
+	b[NLSW128] = a[NLSW128] + b[NLSW128] + carry[0];
+
+	carry[0] = a[NMSW128] > (0xffffffff - b[NMSW128] - carry[1]);
+	b[NMSW128] = a[NMSW128] + b[NMSW128] + carry[1];
+
+	b[MSW128] = a[MSW128] + b[MSW128] + carry[0];
+}
+
+/* Note: assembler semantics: "b -= a" */
+static inline void sub128(const int128 a, int128 b)
+{
+	/* rotating borrow flags */
+	unsigned int borrow[2];
+
+	borrow[0] = b[LSW128] < a[LSW128];
+	b[LSW128] -= a[LSW128];
+
+	borrow[1] = b[NLSW128] < a[NLSW128] + borrow[0];
+	b[NLSW128] = b[NLSW128] - a[NLSW128] - borrow[0];
+
+	borrow[0] = b[NMSW128] < a[NMSW128] + borrow[1];
+	b[NMSW128] = b[NMSW128] - a[NMSW128] - borrow[1];
+
+	b[MSW128] = b[MSW128] - a[MSW128] - borrow[0];
+}
+
+/* Poor man's 64-bit expanding multiply */
+static inline void mul64(unsigned long long a, unsigned long long b, int128 c)
+{
+	unsigned long long acc;
+	int128 acc128;
+
+	zero128(acc128);
+	zero128(c);
+
+	/* first the low words */
+	if (LO_WORD(a) && LO_WORD(b)) {
+		acc = (long long) LO_WORD(a) * LO_WORD(b);
+		c[NLSW128] = HI_WORD(acc);
+		c[LSW128] = LO_WORD(acc);
+	}
+	/* Next the high words */
+	if (HI_WORD(a) && HI_WORD(b)) {
+		acc = (long long) HI_WORD(a) * HI_WORD(b);
+		c[MSW128] = HI_WORD(acc);
+		c[NMSW128] = LO_WORD(acc);
+	}
+	/* The middle words */
+	if (LO_WORD(a) && HI_WORD(b)) {
+		acc = (long long) LO_WORD(a) * HI_WORD(b);
+		acc128[NMSW128] = HI_WORD(acc);
+		acc128[NLSW128] = LO_WORD(acc);
+		add128(acc128, c);
+	}
+	/* The first and last words */
+	if (HI_WORD(a) && LO_WORD(b)) {
+		acc = (long long) HI_WORD(a) * LO_WORD(b);
+		acc128[NMSW128] = HI_WORD(acc);
+		acc128[NLSW128] = LO_WORD(acc);
+		add128(acc128, c);
+	}
+}
+
+/* Note: unsigned */
+static inline int cmp128(int128 a, int128 b)
+{
+	if (a[MSW128] < b[MSW128])
+		return -1;
+	if (a[MSW128] > b[MSW128])
+		return 1;
+	if (a[NMSW128] < b[NMSW128])
+		return -1;
+	if (a[NMSW128] > b[NMSW128])
+		return 1;
+	if (a[NLSW128] < b[NLSW128])
+		return -1;
+	if (a[NLSW128] > b[NLSW128])
+		return 1;
+
+	return (signed) a[LSW128] - b[LSW128];
+}
+
+static inline void div128(int128 a, int128 b, int128 c)
+{
+	int128 mask;
+
+	/* Algorithm:
+
+	   Shift the divisor until it's at least as big as the
+	   dividend, keeping track of the position to which we've
+	   shifted it, i.e. the power of 2 which we've multiplied it
+	   by.
+
+	   Then, for this power of 2 (the mask) and every smaller one,
+	   subtract the raised divisor from the dividend and add the
+	   mask to the quotient whenever the dividend is at least as
+	   big as the raised divisor.  After each step, halve the
+	   raised divisor and the mask (i.e. shift them one place to
+	   the right).  Lather, rinse, and repeat, until there are no
+	   more powers of 2 left. */
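+
+	/* A small illustration: for 13/3 the divisor is doubled three
+	   times (3, 6, 12, 24) and the mask ends at 8; stepping back
+	   down, only the mask == 4 step can subtract (13 - 12 = 1), so
+	   the quotient comes out as 4 with remainder 1 left in a. */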
+
+	/* FIXME: needless to say, there's room for improvement here too. */
+
+	/* Shift up */
+	/* XXX: since it just has to be "at least as big", we can
+	   probably eliminate this horribly wasteful loop.  I will
+	   have to prove this first, though */
+	set128(0, 0, 0, 1, mask);
+	while (cmp128(b, a) < 0 && !btsthi128(b)) {
+		lslone128(b);
+		lslone128(mask);
+	}
+
+	/* Shift down */
+	zero128(c);
+	do {
+		if (cmp128(a, b) >= 0) {
+			sub128(b, a);
+			add128(mask, c);
+		}
+		lsrone128(mask);
+		lsrone128(b);
+	} while (nonzerop128(mask));
+
+	/* The remainder is in a... */
+}
+#endif
+
+#endif	/* MULTI_ARITH_H */