MIPS: Support for 64-bit FP with O32 binaries

CPUs implementing MIPS32 R2 may include a 64-bit FPU, just as MIPS64 CPUs
do. In order to preserve backwards compatibility a 64-bit FPU will act
like a 32-bit FPU (by accessing doubles from the least significant 32
bits of an even-odd pair of FP registers) when the Status.FR bit is
zero, again just like a mips64 CPU. The standard O32 ABI is defined
expecting a 32-bit FPU, however recent toolchains support use of a
64-bit FPU from an O32 MIPS32 executable. When an ELF executable is
built to use a 64-bit FPU a new flag (EF_MIPS_FP64) is set in the ELF
header.

With this patch the kernel will check the EF_MIPS_FP64 flag when
executing an O32 binary, and set Status.FR accordingly. The addition
of O32 64-bit FP support lessens the opportunity for optimisation in
the FPU emulator, so a CONFIG_MIPS_O32_FP64_SUPPORT Kconfig option is
introduced to allow this support to be disabled for those that don't
require it.

Inspired by an earlier patch by Leonid Yegoshin, but implemented more
cleanly & correctly.

Signed-off-by: Paul Burton <paul.burton@imgtec.com>
Cc: linux-mips@linux-mips.org
Cc: Paul Burton <paul.burton@imgtec.com>
Patchwork: https://patchwork.linux-mips.org/patch/6154/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 650de39..c12d9c8 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2335,6 +2335,23 @@
 
 	  This feature requires gcc version 4.2 or above.
 
+config MIPS_O32_FP64_SUPPORT
+	bool "Support for O32 binaries using 64-bit FP"
+	depends on 32BIT || MIPS32_O32
+	default y
+	help
+	  When this is enabled, the kernel will support use of 64-bit floating
+	  point registers with binaries using the O32 ABI along with the
+	  EF_MIPS_FP64 ELF header flag (typically built with -mfp64). On
+	  32-bit MIPS systems this support is at the cost of increasing the
+	  size and complexity of the compiled FPU emulator. Thus if you are
+	  running a MIPS32 system and know that none of your userland binaries
+	  will require 64-bit floating point, you may wish to reduce the size
+	  of your kernel & potentially improve FP emulation performance by
+	  saying N here.
+
+	  If unsure, say Y.
+
 config USE_OF
 	bool
 	select OF
diff --git a/arch/mips/include/asm/asmmacro-32.h b/arch/mips/include/asm/asmmacro-32.h
index 2413afe..70e1f17 100644
--- a/arch/mips/include/asm/asmmacro-32.h
+++ b/arch/mips/include/asm/asmmacro-32.h
@@ -12,27 +12,6 @@
 #include <asm/fpregdef.h>
 #include <asm/mipsregs.h>
 
-	.macro	fpu_save_double thread status tmp1=t0
-	cfc1	\tmp1,  fcr31
-	sdc1	$f0,  THREAD_FPR0(\thread)
-	sdc1	$f2,  THREAD_FPR2(\thread)
-	sdc1	$f4,  THREAD_FPR4(\thread)
-	sdc1	$f6,  THREAD_FPR6(\thread)
-	sdc1	$f8,  THREAD_FPR8(\thread)
-	sdc1	$f10, THREAD_FPR10(\thread)
-	sdc1	$f12, THREAD_FPR12(\thread)
-	sdc1	$f14, THREAD_FPR14(\thread)
-	sdc1	$f16, THREAD_FPR16(\thread)
-	sdc1	$f18, THREAD_FPR18(\thread)
-	sdc1	$f20, THREAD_FPR20(\thread)
-	sdc1	$f22, THREAD_FPR22(\thread)
-	sdc1	$f24, THREAD_FPR24(\thread)
-	sdc1	$f26, THREAD_FPR26(\thread)
-	sdc1	$f28, THREAD_FPR28(\thread)
-	sdc1	$f30, THREAD_FPR30(\thread)
-	sw	\tmp1, THREAD_FCR31(\thread)
-	.endm
-
 	.macro	fpu_save_single thread tmp=t0
 	cfc1	\tmp,  fcr31
 	swc1	$f0,  THREAD_FPR0(\thread)
@@ -70,27 +49,6 @@
 	sw	\tmp, THREAD_FCR31(\thread)
 	.endm
 
-	.macro	fpu_restore_double thread status tmp=t0
-	lw	\tmp, THREAD_FCR31(\thread)
-	ldc1	$f0,  THREAD_FPR0(\thread)
-	ldc1	$f2,  THREAD_FPR2(\thread)
-	ldc1	$f4,  THREAD_FPR4(\thread)
-	ldc1	$f6,  THREAD_FPR6(\thread)
-	ldc1	$f8,  THREAD_FPR8(\thread)
-	ldc1	$f10, THREAD_FPR10(\thread)
-	ldc1	$f12, THREAD_FPR12(\thread)
-	ldc1	$f14, THREAD_FPR14(\thread)
-	ldc1	$f16, THREAD_FPR16(\thread)
-	ldc1	$f18, THREAD_FPR18(\thread)
-	ldc1	$f20, THREAD_FPR20(\thread)
-	ldc1	$f22, THREAD_FPR22(\thread)
-	ldc1	$f24, THREAD_FPR24(\thread)
-	ldc1	$f26, THREAD_FPR26(\thread)
-	ldc1	$f28, THREAD_FPR28(\thread)
-	ldc1	$f30, THREAD_FPR30(\thread)
-	ctc1	\tmp, fcr31
-	.endm
-
 	.macro	fpu_restore_single thread tmp=t0
 	lw	\tmp, THREAD_FCR31(\thread)
 	lwc1	$f0,  THREAD_FPR0(\thread)
diff --git a/arch/mips/include/asm/asmmacro-64.h b/arch/mips/include/asm/asmmacro-64.h
index 08a527d..38ea609 100644
--- a/arch/mips/include/asm/asmmacro-64.h
+++ b/arch/mips/include/asm/asmmacro-64.h
@@ -13,102 +13,6 @@
 #include <asm/fpregdef.h>
 #include <asm/mipsregs.h>
 
-	.macro	fpu_save_16even thread tmp=t0
-	cfc1	\tmp, fcr31
-	sdc1	$f0,  THREAD_FPR0(\thread)
-	sdc1	$f2,  THREAD_FPR2(\thread)
-	sdc1	$f4,  THREAD_FPR4(\thread)
-	sdc1	$f6,  THREAD_FPR6(\thread)
-	sdc1	$f8,  THREAD_FPR8(\thread)
-	sdc1	$f10, THREAD_FPR10(\thread)
-	sdc1	$f12, THREAD_FPR12(\thread)
-	sdc1	$f14, THREAD_FPR14(\thread)
-	sdc1	$f16, THREAD_FPR16(\thread)
-	sdc1	$f18, THREAD_FPR18(\thread)
-	sdc1	$f20, THREAD_FPR20(\thread)
-	sdc1	$f22, THREAD_FPR22(\thread)
-	sdc1	$f24, THREAD_FPR24(\thread)
-	sdc1	$f26, THREAD_FPR26(\thread)
-	sdc1	$f28, THREAD_FPR28(\thread)
-	sdc1	$f30, THREAD_FPR30(\thread)
-	sw	\tmp, THREAD_FCR31(\thread)
-	.endm
-
-	.macro	fpu_save_16odd thread
-	sdc1	$f1,  THREAD_FPR1(\thread)
-	sdc1	$f3,  THREAD_FPR3(\thread)
-	sdc1	$f5,  THREAD_FPR5(\thread)
-	sdc1	$f7,  THREAD_FPR7(\thread)
-	sdc1	$f9,  THREAD_FPR9(\thread)
-	sdc1	$f11, THREAD_FPR11(\thread)
-	sdc1	$f13, THREAD_FPR13(\thread)
-	sdc1	$f15, THREAD_FPR15(\thread)
-	sdc1	$f17, THREAD_FPR17(\thread)
-	sdc1	$f19, THREAD_FPR19(\thread)
-	sdc1	$f21, THREAD_FPR21(\thread)
-	sdc1	$f23, THREAD_FPR23(\thread)
-	sdc1	$f25, THREAD_FPR25(\thread)
-	sdc1	$f27, THREAD_FPR27(\thread)
-	sdc1	$f29, THREAD_FPR29(\thread)
-	sdc1	$f31, THREAD_FPR31(\thread)
-	.endm
-
-	.macro	fpu_save_double thread status tmp
-	sll	\tmp, \status, 5
-	bgez	\tmp, 2f
-	fpu_save_16odd \thread
-2:
-	fpu_save_16even \thread \tmp
-	.endm
-
-	.macro	fpu_restore_16even thread tmp=t0
-	lw	\tmp, THREAD_FCR31(\thread)
-	ldc1	$f0,  THREAD_FPR0(\thread)
-	ldc1	$f2,  THREAD_FPR2(\thread)
-	ldc1	$f4,  THREAD_FPR4(\thread)
-	ldc1	$f6,  THREAD_FPR6(\thread)
-	ldc1	$f8,  THREAD_FPR8(\thread)
-	ldc1	$f10, THREAD_FPR10(\thread)
-	ldc1	$f12, THREAD_FPR12(\thread)
-	ldc1	$f14, THREAD_FPR14(\thread)
-	ldc1	$f16, THREAD_FPR16(\thread)
-	ldc1	$f18, THREAD_FPR18(\thread)
-	ldc1	$f20, THREAD_FPR20(\thread)
-	ldc1	$f22, THREAD_FPR22(\thread)
-	ldc1	$f24, THREAD_FPR24(\thread)
-	ldc1	$f26, THREAD_FPR26(\thread)
-	ldc1	$f28, THREAD_FPR28(\thread)
-	ldc1	$f30, THREAD_FPR30(\thread)
-	ctc1	\tmp, fcr31
-	.endm
-
-	.macro	fpu_restore_16odd thread
-	ldc1	$f1,  THREAD_FPR1(\thread)
-	ldc1	$f3,  THREAD_FPR3(\thread)
-	ldc1	$f5,  THREAD_FPR5(\thread)
-	ldc1	$f7,  THREAD_FPR7(\thread)
-	ldc1	$f9,  THREAD_FPR9(\thread)
-	ldc1	$f11, THREAD_FPR11(\thread)
-	ldc1	$f13, THREAD_FPR13(\thread)
-	ldc1	$f15, THREAD_FPR15(\thread)
-	ldc1	$f17, THREAD_FPR17(\thread)
-	ldc1	$f19, THREAD_FPR19(\thread)
-	ldc1	$f21, THREAD_FPR21(\thread)
-	ldc1	$f23, THREAD_FPR23(\thread)
-	ldc1	$f25, THREAD_FPR25(\thread)
-	ldc1	$f27, THREAD_FPR27(\thread)
-	ldc1	$f29, THREAD_FPR29(\thread)
-	ldc1	$f31, THREAD_FPR31(\thread)
-	.endm
-
-	.macro	fpu_restore_double thread status tmp
-	sll	\tmp, \status, 5
-	bgez	\tmp, 1f				# 16 register mode?
-
-	fpu_restore_16odd \thread
-1:	fpu_restore_16even \thread \tmp
-	.endm
-
 	.macro	cpu_save_nonscratch thread
 	LONG_S	s0, THREAD_REG16(\thread)
 	LONG_S	s1, THREAD_REG17(\thread)
diff --git a/arch/mips/include/asm/asmmacro.h b/arch/mips/include/asm/asmmacro.h
index 6c8342a..3220c93 100644
--- a/arch/mips/include/asm/asmmacro.h
+++ b/arch/mips/include/asm/asmmacro.h
@@ -62,6 +62,113 @@
 	.endm
 #endif /* CONFIG_MIPS_MT_SMTC */
 
+	.macro	fpu_save_16even thread tmp=t0
+	cfc1	\tmp, fcr31
+	sdc1	$f0,  THREAD_FPR0(\thread)
+	sdc1	$f2,  THREAD_FPR2(\thread)
+	sdc1	$f4,  THREAD_FPR4(\thread)
+	sdc1	$f6,  THREAD_FPR6(\thread)
+	sdc1	$f8,  THREAD_FPR8(\thread)
+	sdc1	$f10, THREAD_FPR10(\thread)
+	sdc1	$f12, THREAD_FPR12(\thread)
+	sdc1	$f14, THREAD_FPR14(\thread)
+	sdc1	$f16, THREAD_FPR16(\thread)
+	sdc1	$f18, THREAD_FPR18(\thread)
+	sdc1	$f20, THREAD_FPR20(\thread)
+	sdc1	$f22, THREAD_FPR22(\thread)
+	sdc1	$f24, THREAD_FPR24(\thread)
+	sdc1	$f26, THREAD_FPR26(\thread)
+	sdc1	$f28, THREAD_FPR28(\thread)
+	sdc1	$f30, THREAD_FPR30(\thread)
+	sw	\tmp, THREAD_FCR31(\thread)
+	.endm
+
+	.macro	fpu_save_16odd thread
+	.set	push
+	.set	mips64r2
+	sdc1	$f1,  THREAD_FPR1(\thread)
+	sdc1	$f3,  THREAD_FPR3(\thread)
+	sdc1	$f5,  THREAD_FPR5(\thread)
+	sdc1	$f7,  THREAD_FPR7(\thread)
+	sdc1	$f9,  THREAD_FPR9(\thread)
+	sdc1	$f11, THREAD_FPR11(\thread)
+	sdc1	$f13, THREAD_FPR13(\thread)
+	sdc1	$f15, THREAD_FPR15(\thread)
+	sdc1	$f17, THREAD_FPR17(\thread)
+	sdc1	$f19, THREAD_FPR19(\thread)
+	sdc1	$f21, THREAD_FPR21(\thread)
+	sdc1	$f23, THREAD_FPR23(\thread)
+	sdc1	$f25, THREAD_FPR25(\thread)
+	sdc1	$f27, THREAD_FPR27(\thread)
+	sdc1	$f29, THREAD_FPR29(\thread)
+	sdc1	$f31, THREAD_FPR31(\thread)
+	.set	pop
+	.endm
+
+	.macro	fpu_save_double thread status tmp
+#if defined(CONFIG_MIPS64) || defined(CONFIG_CPU_MIPS32_R2)
+	sll	\tmp, \status, 5
+	bgez	\tmp, 10f
+	fpu_save_16odd \thread
+10:
+#endif
+	fpu_save_16even \thread \tmp
+	.endm
+
+	.macro	fpu_restore_16even thread tmp=t0
+	lw	\tmp, THREAD_FCR31(\thread)
+	ldc1	$f0,  THREAD_FPR0(\thread)
+	ldc1	$f2,  THREAD_FPR2(\thread)
+	ldc1	$f4,  THREAD_FPR4(\thread)
+	ldc1	$f6,  THREAD_FPR6(\thread)
+	ldc1	$f8,  THREAD_FPR8(\thread)
+	ldc1	$f10, THREAD_FPR10(\thread)
+	ldc1	$f12, THREAD_FPR12(\thread)
+	ldc1	$f14, THREAD_FPR14(\thread)
+	ldc1	$f16, THREAD_FPR16(\thread)
+	ldc1	$f18, THREAD_FPR18(\thread)
+	ldc1	$f20, THREAD_FPR20(\thread)
+	ldc1	$f22, THREAD_FPR22(\thread)
+	ldc1	$f24, THREAD_FPR24(\thread)
+	ldc1	$f26, THREAD_FPR26(\thread)
+	ldc1	$f28, THREAD_FPR28(\thread)
+	ldc1	$f30, THREAD_FPR30(\thread)
+	ctc1	\tmp, fcr31
+	.endm
+
+	.macro	fpu_restore_16odd thread
+	.set	push
+	.set	mips64r2
+	ldc1	$f1,  THREAD_FPR1(\thread)
+	ldc1	$f3,  THREAD_FPR3(\thread)
+	ldc1	$f5,  THREAD_FPR5(\thread)
+	ldc1	$f7,  THREAD_FPR7(\thread)
+	ldc1	$f9,  THREAD_FPR9(\thread)
+	ldc1	$f11, THREAD_FPR11(\thread)
+	ldc1	$f13, THREAD_FPR13(\thread)
+	ldc1	$f15, THREAD_FPR15(\thread)
+	ldc1	$f17, THREAD_FPR17(\thread)
+	ldc1	$f19, THREAD_FPR19(\thread)
+	ldc1	$f21, THREAD_FPR21(\thread)
+	ldc1	$f23, THREAD_FPR23(\thread)
+	ldc1	$f25, THREAD_FPR25(\thread)
+	ldc1	$f27, THREAD_FPR27(\thread)
+	ldc1	$f29, THREAD_FPR29(\thread)
+	ldc1	$f31, THREAD_FPR31(\thread)
+	.set	pop
+	.endm
+
+	.macro	fpu_restore_double thread status tmp
+#if defined(CONFIG_MIPS64) || defined(CONFIG_CPU_MIPS32_R2)
+	sll	\tmp, \status, 5
+	bgez	\tmp, 10f				# 16 register mode?
+
+	fpu_restore_16odd \thread
+10:
+#endif
+	fpu_restore_16even \thread \tmp
+	.endm
+
 /*
  * Temporary until all gas have MT ASE support
  */
diff --git a/arch/mips/include/asm/elf.h b/arch/mips/include/asm/elf.h
index a66359e..d414405 100644
--- a/arch/mips/include/asm/elf.h
+++ b/arch/mips/include/asm/elf.h
@@ -36,6 +36,7 @@
 #define EF_MIPS_ABI2		0x00000020
 #define EF_MIPS_OPTIONS_FIRST	0x00000080
 #define EF_MIPS_32BITMODE	0x00000100
+#define EF_MIPS_FP64		0x00000200
 #define EF_MIPS_ABI		0x0000f000
 #define EF_MIPS_ARCH		0xf0000000
 
@@ -176,6 +177,18 @@
 #ifdef CONFIG_32BIT
 
 /*
+ * In order to be sure that we don't attempt to execute an O32 binary which
+ * requires 64 bit FP (FR=1) on a system which does not support it we refuse
+ * to execute any binary which has bits specified by the following macro set
+ * in its ELF header flags.
+ */
+#ifdef CONFIG_MIPS_O32_FP64_SUPPORT
+# define __MIPS_O32_FP64_MUST_BE_ZERO	0
+#else
+# define __MIPS_O32_FP64_MUST_BE_ZERO	EF_MIPS_FP64
+#endif
+
+/*
  * This is used to ensure we don't load something for the wrong architecture.
  */
 #define elf_check_arch(hdr)						\
@@ -192,6 +205,8 @@
 	if (((__h->e_flags & EF_MIPS_ABI) != 0) &&			\
 	    ((__h->e_flags & EF_MIPS_ABI) != EF_MIPS_ABI_O32))		\
 		__res = 0;						\
+	if (__h->e_flags & __MIPS_O32_FP64_MUST_BE_ZERO)		\
+		__res = 0;						\
 									\
 	__res;								\
 })
@@ -249,6 +264,11 @@
 
 #define SET_PERSONALITY(ex)						\
 do {									\
+	if ((ex).e_flags & EF_MIPS_FP64)				\
+		clear_thread_flag(TIF_32BIT_FPREGS);			\
+	else								\
+		set_thread_flag(TIF_32BIT_FPREGS);			\
+									\
 	if (personality(current->personality) != PER_LINUX)		\
 		set_personality(PER_LINUX);				\
 									\
@@ -271,14 +291,18 @@
 #endif
 
 #ifdef CONFIG_MIPS32_O32
-#define __SET_PERSONALITY32_O32()					\
+#define __SET_PERSONALITY32_O32(ex)					\
 	do {								\
 		set_thread_flag(TIF_32BIT_REGS);			\
 		set_thread_flag(TIF_32BIT_ADDR);			\
+									\
+		if (!((ex).e_flags & EF_MIPS_FP64))			\
+			set_thread_flag(TIF_32BIT_FPREGS);		\
+									\
 		current->thread.abi = &mips_abi_32;			\
 	} while (0)
 #else
-#define __SET_PERSONALITY32_O32()					\
+#define __SET_PERSONALITY32_O32(ex)					\
 	do { } while (0)
 #endif
 
@@ -289,7 +313,7 @@
 	     ((ex).e_flags & EF_MIPS_ABI) == 0)				\
 		__SET_PERSONALITY32_N32();				\
 	else								\
-		__SET_PERSONALITY32_O32();				\
+		__SET_PERSONALITY32_O32(ex);                            \
 } while (0)
 #else
 #define __SET_PERSONALITY32(ex) do { } while (0)
@@ -300,6 +324,7 @@
 	unsigned int p;							\
 									\
 	clear_thread_flag(TIF_32BIT_REGS);				\
+	clear_thread_flag(TIF_32BIT_FPREGS);				\
 	clear_thread_flag(TIF_32BIT_ADDR);				\
 									\
 	if ((ex).e_ident[EI_CLASS] == ELFCLASS32)			\
diff --git a/arch/mips/include/asm/fpu.h b/arch/mips/include/asm/fpu.h
index 3bf023f..cfe092f 100644
--- a/arch/mips/include/asm/fpu.h
+++ b/arch/mips/include/asm/fpu.h
@@ -33,11 +33,48 @@
 extern void _save_fp(struct task_struct *);
 extern void _restore_fp(struct task_struct *);
 
-#define __enable_fpu()							\
-do {									\
-	set_c0_status(ST0_CU1);						\
-	enable_fpu_hazard();						\
-} while (0)
+/*
+ * This enum specifies a mode in which we want the FPU to operate, for cores
+ * which implement the Status.FR bit. Note that FPU_32BIT & FPU_64BIT
+ * purposefully have the values 0 & 1 respectively, so that an integer value
+ * of Status.FR can be trivially casted to the corresponding enum fpu_mode.
+ */
+enum fpu_mode {
+	FPU_32BIT = 0,		/* FR = 0 */
+	FPU_64BIT,		/* FR = 1 */
+	FPU_AS_IS,
+};
+
+static inline int __enable_fpu(enum fpu_mode mode)
+{
+	int fr;
+
+	switch (mode) {
+	case FPU_AS_IS:
+		/* just enable the FPU in its current mode */
+		set_c0_status(ST0_CU1);
+		enable_fpu_hazard();
+		return 0;
+
+	case FPU_64BIT:
+#if !(defined(CONFIG_CPU_MIPS32_R2) || defined(CONFIG_MIPS64))
+		/* we only have a 32-bit FPU */
+		return SIGFPE;
+#endif
+		/* fall through */
+	case FPU_32BIT:
+		/* set CU1 & change FR appropriately */
+		fr = (int)mode;
+		change_c0_status(ST0_CU1 | ST0_FR, ST0_CU1 | (fr ? ST0_FR : 0));
+		enable_fpu_hazard();
+
+		/* check FR has the desired value */
+		return (!!(read_c0_status() & ST0_FR) == !!fr) ? 0 : SIGFPE;
+
+	default:
+		BUG();
+	}
+}
 
 #define __disable_fpu()							\
 do {									\
@@ -57,27 +94,46 @@
 	return cpu_has_fpu && __is_fpu_owner();
 }
 
-static inline void __own_fpu(void)
+static inline int __own_fpu(void)
 {
-	__enable_fpu();
+	enum fpu_mode mode;
+	int ret;
+
+	mode = !test_thread_flag(TIF_32BIT_FPREGS);
+	ret = __enable_fpu(mode);
+	if (ret)
+		return ret;
+
 	KSTK_STATUS(current) |= ST0_CU1;
+	if (mode == FPU_64BIT)
+		KSTK_STATUS(current) |= ST0_FR;
+	else /* mode == FPU_32BIT */
+		KSTK_STATUS(current) &= ~ST0_FR;
+
 	set_thread_flag(TIF_USEDFPU);
+	return 0;
 }
 
-static inline void own_fpu_inatomic(int restore)
+static inline int own_fpu_inatomic(int restore)
 {
+	int ret = 0;
+
 	if (cpu_has_fpu && !__is_fpu_owner()) {
-		__own_fpu();
-		if (restore)
+		ret = __own_fpu();
+		if (restore && !ret)
 			_restore_fp(current);
 	}
+	return ret;
 }
 
-static inline void own_fpu(int restore)
+static inline int own_fpu(int restore)
 {
+	int ret;
+
 	preempt_disable();
-	own_fpu_inatomic(restore);
+	ret = own_fpu_inatomic(restore);
 	preempt_enable();
+	return ret;
 }
 
 static inline void lose_fpu(int save)
@@ -93,16 +149,21 @@
 	preempt_enable();
 }
 
-static inline void init_fpu(void)
+static inline int init_fpu(void)
 {
+	int ret = 0;
+
 	preempt_disable();
 	if (cpu_has_fpu) {
-		__own_fpu();
-		_init_fpu();
+		ret = __own_fpu();
+		if (!ret)
+			_init_fpu();
 	} else {
 		fpu_emulator_init_fpu();
 	}
+
 	preempt_enable();
+	return ret;
 }
 
 static inline void save_fp(struct task_struct *tsk)
diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h
index 4f58ef6..24846f9 100644
--- a/arch/mips/include/asm/thread_info.h
+++ b/arch/mips/include/asm/thread_info.h
@@ -110,11 +110,12 @@
 #define TIF_NOHZ		19	/* in adaptive nohz mode */
 #define TIF_FIXADE		20	/* Fix address errors in software */
 #define TIF_LOGADE		21	/* Log address errors to syslog */
-#define TIF_32BIT_REGS		22	/* also implies 16/32 fprs */
+#define TIF_32BIT_REGS		22	/* 32-bit general purpose registers */
 #define TIF_32BIT_ADDR		23	/* 32-bit address space (o32/n32) */
 #define TIF_FPUBOUND		24	/* thread bound to FPU-full CPU set */
 #define TIF_LOAD_WATCH		25	/* If set, load watch registers */
 #define TIF_SYSCALL_TRACEPOINT	26	/* syscall tracepoint instrumentation */
+#define TIF_32BIT_FPREGS	27	/* 32-bit floating point registers */
 #define TIF_SYSCALL_TRACE	31	/* syscall trace active */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
@@ -131,6 +132,7 @@
 #define _TIF_32BIT_ADDR		(1<<TIF_32BIT_ADDR)
 #define _TIF_FPUBOUND		(1<<TIF_FPUBOUND)
 #define _TIF_LOAD_WATCH		(1<<TIF_LOAD_WATCH)
+#define _TIF_32BIT_FPREGS	(1<<TIF_32BIT_FPREGS)
 #define _TIF_SYSCALL_TRACEPOINT	(1<<TIF_SYSCALL_TRACEPOINT)
 
 #define _TIF_WORK_SYSCALL_ENTRY	(_TIF_NOHZ | _TIF_SYSCALL_TRACE |	\
diff --git a/arch/mips/kernel/binfmt_elfo32.c b/arch/mips/kernel/binfmt_elfo32.c
index 202e581..7faf5f2 100644
--- a/arch/mips/kernel/binfmt_elfo32.c
+++ b/arch/mips/kernel/binfmt_elfo32.c
@@ -28,6 +28,18 @@
 typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG];
 
 /*
+ * In order to be sure that we don't attempt to execute an O32 binary which
+ * requires 64 bit FP (FR=1) on a system which does not support it we refuse
+ * to execute any binary which has bits specified by the following macro set
+ * in its ELF header flags.
+ */
+#ifdef CONFIG_MIPS_O32_FP64_SUPPORT
+# define __MIPS_O32_FP64_MUST_BE_ZERO	0
+#else
+# define __MIPS_O32_FP64_MUST_BE_ZERO	EF_MIPS_FP64
+#endif
+
+/*
  * This is used to ensure we don't load something for the wrong architecture.
  */
 #define elf_check_arch(hdr)						\
@@ -44,6 +56,8 @@
 	if (((__h->e_flags & EF_MIPS_ABI) != 0) &&			\
 	    ((__h->e_flags & EF_MIPS_ABI) != EF_MIPS_ABI_O32))		\
 		__res = 0;						\
+	if (__h->e_flags & __MIPS_O32_FP64_MUST_BE_ZERO)		\
+		__res = 0;						\
 									\
 	__res;								\
 })
diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c
index c814287..e2b2d20 100644
--- a/arch/mips/kernel/cpu-probe.c
+++ b/arch/mips/kernel/cpu-probe.c
@@ -112,7 +112,7 @@
 	unsigned long tmp, fpu_id;
 
 	tmp = read_c0_status();
-	__enable_fpu();
+	__enable_fpu(FPU_AS_IS);
 	fpu_id = read_32bit_cp1_register(CP1_REVISION);
 	write_c0_status(tmp);
 	return fpu_id;
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index ddc7610..747a6cf 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -60,9 +60,6 @@
 
 	/* New thread loses kernel privileges. */
 	status = regs->cp0_status & ~(ST0_CU0|ST0_CU1|ST0_FR|KU_MASK);
-#ifdef CONFIG_64BIT
-	status |= test_thread_flag(TIF_32BIT_REGS) ? 0 : ST0_FR;
-#endif
 	status |= KU_USER;
 	regs->cp0_status = status;
 	clear_used_math();
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index b52e1d2..7da9b76 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -137,13 +137,13 @@
 		if (cpu_has_mipsmt) {
 			unsigned int vpflags = dvpe();
 			flags = read_c0_status();
-			__enable_fpu();
+			__enable_fpu(FPU_AS_IS);
 			__asm__ __volatile__("cfc1\t%0,$0" : "=r" (tmp));
 			write_c0_status(flags);
 			evpe(vpflags);
 		} else {
 			flags = read_c0_status();
-			__enable_fpu();
+			__enable_fpu(FPU_AS_IS);
 			__asm__ __volatile__("cfc1\t%0,$0" : "=r" (tmp));
 			write_c0_status(flags);
 		}
@@ -408,6 +408,7 @@
 	/* Read the word at location addr in the USER area. */
 	case PTRACE_PEEKUSR: {
 		struct pt_regs *regs;
+		fpureg_t *fregs;
 		unsigned long tmp = 0;
 
 		regs = task_pt_regs(child);
@@ -418,26 +419,28 @@
 			tmp = regs->regs[addr];
 			break;
 		case FPR_BASE ... FPR_BASE + 31:
-			if (tsk_used_math(child)) {
-				fpureg_t *fregs = get_fpu_regs(child);
+			if (!tsk_used_math(child)) {
+				/* FP not yet used */
+				tmp = -1;
+				break;
+			}
+			fregs = get_fpu_regs(child);
 
 #ifdef CONFIG_32BIT
+			if (test_thread_flag(TIF_32BIT_FPREGS)) {
 				/*
 				 * The odd registers are actually the high
 				 * order bits of the values stored in the even
 				 * registers - unless we're using r2k_switch.S.
 				 */
 				if (addr & 1)
-					tmp = (unsigned long) (fregs[((addr & ~1) - 32)] >> 32);
+					tmp = fregs[(addr & ~1) - 32] >> 32;
 				else
-					tmp = (unsigned long) (fregs[(addr - 32)] & 0xffffffff);
-#endif
-#ifdef CONFIG_64BIT
-				tmp = fregs[addr - FPR_BASE];
-#endif
-			} else {
-				tmp = -1;	/* FP not yet used  */
+					tmp = fregs[addr - 32];
+				break;
 			}
+#endif
+			tmp = fregs[addr - FPR_BASE];
 			break;
 		case PC:
 			tmp = regs->cp0_epc;
@@ -483,13 +486,13 @@
 			if (cpu_has_mipsmt) {
 				unsigned int vpflags = dvpe();
 				flags = read_c0_status();
-				__enable_fpu();
+				__enable_fpu(FPU_AS_IS);
 				__asm__ __volatile__("cfc1\t%0,$0": "=r" (tmp));
 				write_c0_status(flags);
 				evpe(vpflags);
 			} else {
 				flags = read_c0_status();
-				__enable_fpu();
+				__enable_fpu(FPU_AS_IS);
 				__asm__ __volatile__("cfc1\t%0,$0": "=r" (tmp));
 				write_c0_status(flags);
 			}
@@ -554,22 +557,25 @@
 				child->thread.fpu.fcr31 = 0;
 			}
 #ifdef CONFIG_32BIT
-			/*
-			 * The odd registers are actually the high order bits
-			 * of the values stored in the even registers - unless
-			 * we're using r2k_switch.S.
-			 */
-			if (addr & 1) {
-				fregs[(addr & ~1) - FPR_BASE] &= 0xffffffff;
-				fregs[(addr & ~1) - FPR_BASE] |= ((unsigned long long) data) << 32;
-			} else {
-				fregs[addr - FPR_BASE] &= ~0xffffffffLL;
-				fregs[addr - FPR_BASE] |= data;
+			if (test_thread_flag(TIF_32BIT_FPREGS)) {
+				/*
+				 * The odd registers are actually the high
+				 * order bits of the values stored in the even
+				 * registers - unless we're using r2k_switch.S.
+				 */
+				if (addr & 1) {
+					fregs[(addr & ~1) - FPR_BASE] &=
+						0xffffffff;
+					fregs[(addr & ~1) - FPR_BASE] |=
+						((u64)data) << 32;
+				} else {
+					fregs[addr - FPR_BASE] &= ~0xffffffffLL;
+					fregs[addr - FPR_BASE] |= data;
+				}
+				break;
 			}
 #endif
-#ifdef CONFIG_64BIT
 			fregs[addr - FPR_BASE] = data;
-#endif
 			break;
 		}
 		case PC:
diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c
index 9486055..b8aa2dd 100644
--- a/arch/mips/kernel/ptrace32.c
+++ b/arch/mips/kernel/ptrace32.c
@@ -80,6 +80,7 @@
 	/* Read the word at location addr in the USER area. */
 	case PTRACE_PEEKUSR: {
 		struct pt_regs *regs;
+		fpureg_t *fregs;
 		unsigned int tmp;
 
 		regs = task_pt_regs(child);
@@ -90,21 +91,25 @@
 			tmp = regs->regs[addr];
 			break;
 		case FPR_BASE ... FPR_BASE + 31:
-			if (tsk_used_math(child)) {
-				fpureg_t *fregs = get_fpu_regs(child);
-
+			if (!tsk_used_math(child)) {
+				/* FP not yet used */
+				tmp = -1;
+				break;
+			}
+			fregs = get_fpu_regs(child);
+			if (test_thread_flag(TIF_32BIT_FPREGS)) {
 				/*
 				 * The odd registers are actually the high
 				 * order bits of the values stored in the even
 				 * registers - unless we're using r2k_switch.S.
 				 */
 				if (addr & 1)
-					tmp = (unsigned long) (fregs[((addr & ~1) - 32)] >> 32);
+					tmp = fregs[(addr & ~1) - 32] >> 32;
 				else
-					tmp = (unsigned long) (fregs[(addr - 32)] & 0xffffffff);
-			} else {
-				tmp = -1;	/* FP not yet used  */
+					tmp = fregs[addr - 32];
+				break;
 			}
+			tmp = fregs[addr - FPR_BASE];
 			break;
 		case PC:
 			tmp = regs->cp0_epc;
@@ -147,13 +152,13 @@
 			if (cpu_has_mipsmt) {
 				unsigned int vpflags = dvpe();
 				flags = read_c0_status();
-				__enable_fpu();
+				__enable_fpu(FPU_AS_IS);
 				__asm__ __volatile__("cfc1\t%0,$0": "=r" (tmp));
 				write_c0_status(flags);
 				evpe(vpflags);
 			} else {
 				flags = read_c0_status();
-				__enable_fpu();
+				__enable_fpu(FPU_AS_IS);
 				__asm__ __volatile__("cfc1\t%0,$0": "=r" (tmp));
 				write_c0_status(flags);
 			}
@@ -236,20 +241,24 @@
 				       sizeof(child->thread.fpu));
 				child->thread.fpu.fcr31 = 0;
 			}
-			/*
-			 * The odd registers are actually the high order bits
-			 * of the values stored in the even registers - unless
-			 * we're using r2k_switch.S.
-			 */
-			if (addr & 1) {
-				fregs[(addr & ~1) - FPR_BASE] &= 0xffffffff;
-				fregs[(addr & ~1) - FPR_BASE] |= ((unsigned long long) data) << 32;
-			} else {
-				fregs[addr - FPR_BASE] &= ~0xffffffffLL;
-				/* Must cast, lest sign extension fill upper
-				   bits!  */
-				fregs[addr - FPR_BASE] |= (unsigned int)data;
+			if (test_thread_flag(TIF_32BIT_FPREGS)) {
+				/*
+				 * The odd registers are actually the high
+				 * order bits of the values stored in the even
+				 * registers - unless we're using r2k_switch.S.
+				 */
+				if (addr & 1) {
+					fregs[(addr & ~1) - FPR_BASE] &=
+						0xffffffff;
+					fregs[(addr & ~1) - FPR_BASE] |=
+						((u64)data) << 32;
+				} else {
+					fregs[addr - FPR_BASE] &= ~0xffffffffLL;
+					fregs[addr - FPR_BASE] |= data;
+				}
+				break;
 			}
+			fregs[addr - FPR_BASE] = data;
 			break;
 		}
 		case PC:
diff --git a/arch/mips/kernel/r4k_fpu.S b/arch/mips/kernel/r4k_fpu.S
index 55ffe14..253b2fb 100644
--- a/arch/mips/kernel/r4k_fpu.S
+++ b/arch/mips/kernel/r4k_fpu.S
@@ -35,7 +35,15 @@
 LEAF(_save_fp_context)
 	cfc1	t1, fcr31
 
-#ifdef CONFIG_64BIT
+#if defined(CONFIG_64BIT) || defined(CONFIG_MIPS32_R2)
+	.set	push
+#ifdef CONFIG_MIPS32_R2
+	.set	mips64r2
+	mfc0	t0, CP0_STATUS
+	sll	t0, t0, 5
+	bgez	t0, 1f			# skip storing odd if FR=0
+	 nop
+#endif
 	/* Store the 16 odd double precision registers */
 	EX	sdc1 $f1, SC_FPREGS+8(a0)
 	EX	sdc1 $f3, SC_FPREGS+24(a0)
@@ -53,6 +61,7 @@
 	EX	sdc1 $f27, SC_FPREGS+216(a0)
 	EX	sdc1 $f29, SC_FPREGS+232(a0)
 	EX	sdc1 $f31, SC_FPREGS+248(a0)
+1:	.set	pop
 #endif
 
 	/* Store the 16 even double precision registers */
@@ -82,7 +91,31 @@
 LEAF(_save_fp_context32)
 	cfc1	t1, fcr31
 
-	EX	sdc1 $f0, SC32_FPREGS+0(a0)
+	mfc0	t0, CP0_STATUS
+	sll	t0, t0, 5
+	bgez	t0, 1f			# skip storing odd if FR=0
+	 nop
+
+	/* Store the 16 odd double precision registers */
+	EX      sdc1 $f1, SC32_FPREGS+8(a0)
+	EX      sdc1 $f3, SC32_FPREGS+24(a0)
+	EX      sdc1 $f5, SC32_FPREGS+40(a0)
+	EX      sdc1 $f7, SC32_FPREGS+56(a0)
+	EX      sdc1 $f9, SC32_FPREGS+72(a0)
+	EX      sdc1 $f11, SC32_FPREGS+88(a0)
+	EX      sdc1 $f13, SC32_FPREGS+104(a0)
+	EX      sdc1 $f15, SC32_FPREGS+120(a0)
+	EX      sdc1 $f17, SC32_FPREGS+136(a0)
+	EX      sdc1 $f19, SC32_FPREGS+152(a0)
+	EX      sdc1 $f21, SC32_FPREGS+168(a0)
+	EX      sdc1 $f23, SC32_FPREGS+184(a0)
+	EX      sdc1 $f25, SC32_FPREGS+200(a0)
+	EX      sdc1 $f27, SC32_FPREGS+216(a0)
+	EX      sdc1 $f29, SC32_FPREGS+232(a0)
+	EX      sdc1 $f31, SC32_FPREGS+248(a0)
+
+	/* Store the 16 even double precision registers */
+1:	EX	sdc1 $f0, SC32_FPREGS+0(a0)
 	EX	sdc1 $f2, SC32_FPREGS+16(a0)
 	EX	sdc1 $f4, SC32_FPREGS+32(a0)
 	EX	sdc1 $f6, SC32_FPREGS+48(a0)
@@ -114,7 +147,16 @@
  */
 LEAF(_restore_fp_context)
 	EX	lw t0, SC_FPC_CSR(a0)
-#ifdef CONFIG_64BIT
+
+#if defined(CONFIG_64BIT) || defined(CONFIG_MIPS32_R2)
+	.set	push
+#ifdef CONFIG_MIPS32_R2
+	.set	mips64r2
+	mfc0	t0, CP0_STATUS
+	sll	t0, t0, 5
+	bgez	t0, 1f			# skip loading odd if FR=0
+	 nop
+#endif
 	EX	ldc1 $f1, SC_FPREGS+8(a0)
 	EX	ldc1 $f3, SC_FPREGS+24(a0)
 	EX	ldc1 $f5, SC_FPREGS+40(a0)
@@ -131,6 +173,7 @@
 	EX	ldc1 $f27, SC_FPREGS+216(a0)
 	EX	ldc1 $f29, SC_FPREGS+232(a0)
 	EX	ldc1 $f31, SC_FPREGS+248(a0)
+1:	.set pop
 #endif
 	EX	ldc1 $f0, SC_FPREGS+0(a0)
 	EX	ldc1 $f2, SC_FPREGS+16(a0)
@@ -157,7 +200,30 @@
 LEAF(_restore_fp_context32)
 	/* Restore an o32 sigcontext.  */
 	EX	lw t0, SC32_FPC_CSR(a0)
-	EX	ldc1 $f0, SC32_FPREGS+0(a0)
+
+	mfc0	t0, CP0_STATUS
+	sll	t0, t0, 5
+	bgez	t0, 1f			# skip loading odd if FR=0
+	 nop
+
+	EX      ldc1 $f1, SC32_FPREGS+8(a0)
+	EX      ldc1 $f3, SC32_FPREGS+24(a0)
+	EX      ldc1 $f5, SC32_FPREGS+40(a0)
+	EX      ldc1 $f7, SC32_FPREGS+56(a0)
+	EX      ldc1 $f9, SC32_FPREGS+72(a0)
+	EX      ldc1 $f11, SC32_FPREGS+88(a0)
+	EX      ldc1 $f13, SC32_FPREGS+104(a0)
+	EX      ldc1 $f15, SC32_FPREGS+120(a0)
+	EX      ldc1 $f17, SC32_FPREGS+136(a0)
+	EX      ldc1 $f19, SC32_FPREGS+152(a0)
+	EX      ldc1 $f21, SC32_FPREGS+168(a0)
+	EX      ldc1 $f23, SC32_FPREGS+184(a0)
+	EX      ldc1 $f25, SC32_FPREGS+200(a0)
+	EX      ldc1 $f27, SC32_FPREGS+216(a0)
+	EX      ldc1 $f29, SC32_FPREGS+232(a0)
+	EX      ldc1 $f31, SC32_FPREGS+248(a0)
+
+1:	EX	ldc1 $f0, SC32_FPREGS+0(a0)
 	EX	ldc1 $f2, SC32_FPREGS+16(a0)
 	EX	ldc1 $f4, SC32_FPREGS+32(a0)
 	EX	ldc1 $f6, SC32_FPREGS+48(a0)
diff --git a/arch/mips/kernel/r4k_switch.S b/arch/mips/kernel/r4k_switch.S
index 078de5ea..cc78dd9 100644
--- a/arch/mips/kernel/r4k_switch.S
+++ b/arch/mips/kernel/r4k_switch.S
@@ -123,7 +123,7 @@
  * Save a thread's fp context.
  */
 LEAF(_save_fp)
-#ifdef CONFIG_64BIT
+#if defined(CONFIG_64BIT) || defined(CONFIG_CPU_MIPS32_R2)
 	mfc0	t0, CP0_STATUS
 #endif
 	fpu_save_double a0 t0 t1		# clobbers t1
@@ -134,7 +134,7 @@
  * Restore a thread's fp context.
  */
 LEAF(_restore_fp)
-#ifdef CONFIG_64BIT
+#if defined(CONFIG_64BIT) || defined(CONFIG_CPU_MIPS32_R2)
 	mfc0	t0, CP0_STATUS
 #endif
 	fpu_restore_double a0 t0 t1		# clobbers t1
@@ -228,6 +228,47 @@
 	mtc1	t1, $f29
 	mtc1	t1, $f30
 	mtc1	t1, $f31
+
+#ifdef CONFIG_CPU_MIPS32_R2
+	.set    push
+	.set    mips64r2
+	sll     t0, t0, 5			# is Status.FR set?
+	bgez    t0, 1f				# no: skip setting upper 32b
+
+	mthc1   t1, $f0
+	mthc1   t1, $f1
+	mthc1   t1, $f2
+	mthc1   t1, $f3
+	mthc1   t1, $f4
+	mthc1   t1, $f5
+	mthc1   t1, $f6
+	mthc1   t1, $f7
+	mthc1   t1, $f8
+	mthc1   t1, $f9
+	mthc1   t1, $f10
+	mthc1   t1, $f11
+	mthc1   t1, $f12
+	mthc1   t1, $f13
+	mthc1   t1, $f14
+	mthc1   t1, $f15
+	mthc1   t1, $f16
+	mthc1   t1, $f17
+	mthc1   t1, $f18
+	mthc1   t1, $f19
+	mthc1   t1, $f20
+	mthc1   t1, $f21
+	mthc1   t1, $f22
+	mthc1   t1, $f23
+	mthc1   t1, $f24
+	mthc1   t1, $f25
+	mthc1   t1, $f26
+	mthc1   t1, $f27
+	mthc1   t1, $f28
+	mthc1   t1, $f29
+	mthc1   t1, $f30
+	mthc1   t1, $f31
+1:	.set    pop
+#endif /* CONFIG_CPU_MIPS32_R2 */
 #else
 	.set	mips3
 	dmtc1	t1, $f0
diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c
index 2f285ab..5199563 100644
--- a/arch/mips/kernel/signal.c
+++ b/arch/mips/kernel/signal.c
@@ -71,8 +71,9 @@
 	int err;
 	while (1) {
 		lock_fpu_owner();
-		own_fpu_inatomic(1);
-		err = save_fp_context(sc); /* this might fail */
+		err = own_fpu_inatomic(1);
+		if (!err)
+			err = save_fp_context(sc); /* this might fail */
 		unlock_fpu_owner();
 		if (likely(!err))
 			break;
@@ -91,8 +92,9 @@
 	int err, tmp __maybe_unused;
 	while (1) {
 		lock_fpu_owner();
-		own_fpu_inatomic(0);
-		err = restore_fp_context(sc); /* this might fail */
+		err = own_fpu_inatomic(0);
+		if (!err)
+			err = restore_fp_context(sc); /* this might fail */
 		unlock_fpu_owner();
 		if (likely(!err))
 			break;
diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c
index 1905a41..3d60f77 100644
--- a/arch/mips/kernel/signal32.c
+++ b/arch/mips/kernel/signal32.c
@@ -85,8 +85,9 @@
 	int err;
 	while (1) {
 		lock_fpu_owner();
-		own_fpu_inatomic(1);
-		err = save_fp_context32(sc); /* this might fail */
+		err = own_fpu_inatomic(1);
+		if (!err)
+			err = save_fp_context32(sc); /* this might fail */
 		unlock_fpu_owner();
 		if (likely(!err))
 			break;
@@ -105,8 +106,9 @@
 	int err, tmp __maybe_unused;
 	while (1) {
 		lock_fpu_owner();
-		own_fpu_inatomic(0);
-		err = restore_fp_context32(sc); /* this might fail */
+		err = own_fpu_inatomic(0);
+		if (!err)
+			err = restore_fp_context32(sc); /* this might fail */
 		unlock_fpu_owner();
 		if (likely(!err))
 			break;
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index f9c8746..f40f688 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -1080,7 +1080,7 @@
 	unsigned long old_epc, old31;
 	unsigned int opcode;
 	unsigned int cpid;
-	int status;
+	int status, err;
 	unsigned long __maybe_unused flags;
 
 	prev_state = exception_enter();
@@ -1153,19 +1153,19 @@
 
 	case 1:
 		if (used_math())	/* Using the FPU again.	 */
-			own_fpu(1);
+			err = own_fpu(1);
 		else {			/* First time FPU user.	 */
-			init_fpu();
+			err = init_fpu();
 			set_used_math();
 		}
 
-		if (!raw_cpu_has_fpu) {
+		if (!raw_cpu_has_fpu || err) {
 			int sig;
 			void __user *fault_addr = NULL;
 			sig = fpu_emulator_cop1Handler(regs,
 						       &current->thread.fpu,
 						       0, &fault_addr);
-			if (!process_fpemu_return(sig, fault_addr))
+			if (!process_fpemu_return(sig, fault_addr) && !err)
 				mt_ase_fp_affinity();
 		}
 
diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c
index 0e47ae2..506925b 100644
--- a/arch/mips/math-emu/cp1emu.c
+++ b/arch/mips/math-emu/cp1emu.c
@@ -859,20 +859,20 @@
  * In the Linux kernel, we support selection of FPR format on the
  * basis of the Status.FR bit.	If an FPU is not present, the FR bit
  * is hardwired to zero, which would imply a 32-bit FPU even for
- * 64-bit CPUs so we rather look at TIF_32BIT_REGS.
+ * 64-bit CPUs so we rather look at TIF_32BIT_FPREGS.
  * FPU emu is slow and bulky and optimizing this function offers fairly
  * sizeable benefits so we try to be clever and make this function return
  * a constant whenever possible, that is on 64-bit kernels without O32
- * compatibility enabled and on 32-bit kernels.
+ * compatibility enabled and on 32-bit without 64-bit FPU support.
  */
 static inline int cop1_64bit(struct pt_regs *xcp)
 {
 #if defined(CONFIG_64BIT) && !defined(CONFIG_MIPS32_O32)
 	return 1;
-#elif defined(CONFIG_64BIT) && defined(CONFIG_MIPS32_O32)
-	return !test_thread_flag(TIF_32BIT_REGS);
-#else
+#elif defined(CONFIG_32BIT) && !defined(CONFIG_MIPS_O32_FP64_SUPPORT)
 	return 0;
+#else
+	return !test_thread_flag(TIF_32BIT_FPREGS);
 #endif
 }
 
diff --git a/arch/mips/math-emu/kernel_linkage.c b/arch/mips/math-emu/kernel_linkage.c
index 1c58657..3aeae07 100644
--- a/arch/mips/math-emu/kernel_linkage.c
+++ b/arch/mips/math-emu/kernel_linkage.c
@@ -89,8 +89,9 @@
 {
 	int i;
 	int err = 0;
+	int inc = test_thread_flag(TIF_32BIT_FPREGS) ? 2 : 1;
 
-	for (i = 0; i < 32; i+=2) {
+	for (i = 0; i < 32; i += inc) {
 		err |=
 		    __put_user(current->thread.fpu.fpr[i], &sc->sc_fpregs[i]);
 	}
@@ -103,8 +104,9 @@
 {
 	int i;
 	int err = 0;
+	int inc = test_thread_flag(TIF_32BIT_FPREGS) ? 2 : 1;
 
-	for (i = 0; i < 32; i+=2) {
+	for (i = 0; i < 32; i += inc) {
 		err |=
 		    __get_user(current->thread.fpu.fpr[i], &sc->sc_fpregs[i]);
 	}