[PATCH] vdso: randomize the i386 vDSO by moving it into a vma

Move the i386 VDSO down into a vma and thus randomize it.

Besides the security benefit, this feature also helps debuggers, which can
COW a vma-backed VDSO just like a normal DSO and can thus single-step through
it and use other debugging features on it.

It's good for hypervisors (Xen, VMware) too, which typically live in the same
high-mapped address range as the VDSO: whenever the VDSO is used they take
lots of guest pagefaults and have to fix up those guest accesses, which slows
things down instead of speeding them up (the primary purpose of the VDSO).

There is a new CONFIG_COMPAT_VDSO option (default=y) which provides support
for older glibcs that still rely on a prelinked, high-mapped VDSO.  Newer
distributions (using glibc 2.3.3 or later) can turn this option off.  Turning
it off is also recommended for security reasons: attackers can no longer use
the predictable high-mapped VDSO page as a syscall trampoline.
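
Newer glibcs do not need the fixed address because they locate the VDSO
through the ELF auxiliary vector (AT_SYSINFO / AT_SYSINFO_EHDR, filled in by
ARCH_DLINFO below) instead of assuming a prelinked address.  Purely as an
illustration - this program is not part of the patch - userspace can perform
the same lookup like this:

	/* illustration only: find the (possibly randomized) VDSO via auxv */
	#include <elf.h>
	#include <stdio.h>

	int main(void)
	{
		Elf32_auxv_t aux;
		FILE *f = fopen("/proc/self/auxv", "rb");

		if (!f)
			return 1;
		while (fread(&aux, sizeof(aux), 1, f) == 1) {
			if (aux.a_type == AT_SYSINFO)
				printf("AT_SYSINFO:      %#lx\n",
				       (unsigned long)aux.a_un.a_val);
			if (aux.a_type == AT_SYSINFO_EHDR)
				printf("AT_SYSINFO_EHDR: %#lx\n",
				       (unsigned long)aux.a_un.a_val);
		}
		fclose(f);
		return 0;
	}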

There is a new vdso=[0|1] boot option as well, and a runtime
/proc/sys/vm/vdso_enabled sysctl switch, which allow the VDSO to be turned
on and off.
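
The sysctl hookup itself lives in kernel/sysctl.c and is outside this
excerpt; as a rough sketch (the exact entry may differ), the VM_VDSO_ENABLED
id added to sysctl.h below would be wired to the vdso_enabled flag declared
in asm-i386/elf.h roughly like this:

	/* sketch: entry added to the vm.* table in kernel/sysctl.c */
	{
		.ctl_name	= VM_VDSO_ENABLED,
		.procname	= "vdso_enabled",
		.data		= &vdso_enabled,
		.maxlen		= sizeof(vdso_enabled),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},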

(This version of the VDSO-randomization patch also has working ELF
coredumping; the previous version crashed in the coredumping code.)

This code combines the exec-shield VDSO randomization code with Gerd
Hoffmann's hypervisor-centric VDSO patch.  Rusty Russell started this patch
and I completed it.

[akpm@osdl.org: cleanups]
[akpm@osdl.org: compile fix]
[akpm@osdl.org: compile fix 2]
[akpm@osdl.org: compile fix 3]
[akpm@osdl.org: revert MAXMEM change]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Cc: Gerd Hoffmann <kraxel@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/include/asm-i386/elf.h b/include/asm-i386/elf.h
index 4153d80..1eac92c 100644
--- a/include/asm-i386/elf.h
+++ b/include/asm-i386/elf.h
@@ -10,6 +10,7 @@
 #include <asm/processor.h>
 #include <asm/system.h>		/* for savesegment */
 #include <asm/auxvec.h>
+#include <asm/desc.h>
 
 #include <linux/utsname.h>
 
@@ -129,15 +130,41 @@
 #define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs)
 #define ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs) dump_task_extended_fpu(tsk, elf_xfpregs)
 
-#define VSYSCALL_BASE	(__fix_to_virt(FIX_VSYSCALL))
-#define VSYSCALL_EHDR	((const struct elfhdr *) VSYSCALL_BASE)
-#define VSYSCALL_ENTRY	((unsigned long) &__kernel_vsyscall)
+#define VDSO_HIGH_BASE		(__fix_to_virt(FIX_VDSO))
+#define VDSO_BASE		((unsigned long)current->mm->context.vdso)
+
+#ifdef CONFIG_COMPAT_VDSO
+# define VDSO_COMPAT_BASE	VDSO_HIGH_BASE
+# define VDSO_PRELINK		VDSO_HIGH_BASE
+#else
+# define VDSO_COMPAT_BASE	VDSO_BASE
+# define VDSO_PRELINK		0
+#endif
+
+#define VDSO_COMPAT_SYM(x) \
+		(VDSO_COMPAT_BASE + (unsigned long)(x) - VDSO_PRELINK)
+
+#define VDSO_SYM(x) \
+		(VDSO_BASE + (unsigned long)(x) - VDSO_PRELINK)
+
+#define VDSO_HIGH_EHDR		((const struct elfhdr *) VDSO_HIGH_BASE)
+#define VDSO_EHDR		((const struct elfhdr *) VDSO_COMPAT_BASE)
+
 extern void __kernel_vsyscall;
 
+#define VDSO_ENTRY		VDSO_SYM(&__kernel_vsyscall)
+
+#define ARCH_HAS_SETUP_ADDITIONAL_PAGES
+struct linux_binprm;
+extern int arch_setup_additional_pages(struct linux_binprm *bprm,
+                                       int executable_stack);
+
+extern unsigned int vdso_enabled;
+
 #define ARCH_DLINFO						\
-do {								\
-		NEW_AUX_ENT(AT_SYSINFO,	VSYSCALL_ENTRY);	\
-		NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE);	\
+do if (vdso_enabled) {						\
+		NEW_AUX_ENT(AT_SYSINFO,	VDSO_ENTRY);		\
+		NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_COMPAT_BASE);	\
 } while (0)
 
 /*
@@ -148,15 +175,15 @@
  * Dumping its extra ELF program headers includes all the other information
  * a debugger needs to easily find how the vsyscall DSO was being used.
  */
-#define ELF_CORE_EXTRA_PHDRS		(VSYSCALL_EHDR->e_phnum)
+#define ELF_CORE_EXTRA_PHDRS		(VDSO_HIGH_EHDR->e_phnum)
 #define ELF_CORE_WRITE_EXTRA_PHDRS					      \
 do {									      \
 	const struct elf_phdr *const vsyscall_phdrs =			      \
-		(const struct elf_phdr *) (VSYSCALL_BASE		      \
-					   + VSYSCALL_EHDR->e_phoff);	      \
+		(const struct elf_phdr *) (VDSO_HIGH_BASE		      \
+					   + VDSO_HIGH_EHDR->e_phoff);    \
 	int i;								      \
 	Elf32_Off ofs = 0;						      \
-	for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) {			      \
+	for (i = 0; i < VDSO_HIGH_EHDR->e_phnum; ++i) {		      \
 		struct elf_phdr phdr = vsyscall_phdrs[i];		      \
 		if (phdr.p_type == PT_LOAD) {				      \
 			BUG_ON(ofs != 0);				      \
@@ -174,10 +201,10 @@
 #define ELF_CORE_WRITE_EXTRA_DATA					      \
 do {									      \
 	const struct elf_phdr *const vsyscall_phdrs =			      \
-		(const struct elf_phdr *) (VSYSCALL_BASE		      \
-					   + VSYSCALL_EHDR->e_phoff);	      \
+		(const struct elf_phdr *) (VDSO_HIGH_BASE		      \
+					   + VDSO_HIGH_EHDR->e_phoff);    \
 	int i;								      \
-	for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) {			      \
+	for (i = 0; i < VDSO_HIGH_EHDR->e_phnum; ++i) {		      \
 		if (vsyscall_phdrs[i].p_type == PT_LOAD)		      \
 			DUMP_WRITE((void *) vsyscall_phdrs[i].p_vaddr,	      \
 				   PAGE_ALIGN(vsyscall_phdrs[i].p_memsz));    \
diff --git a/include/asm-i386/fixmap.h b/include/asm-i386/fixmap.h
index f7e068f..a48cc3f 100644
--- a/include/asm-i386/fixmap.h
+++ b/include/asm-i386/fixmap.h
@@ -51,7 +51,7 @@
  */
 enum fixed_addresses {
 	FIX_HOLE,
-	FIX_VSYSCALL,
+	FIX_VDSO,
 #ifdef CONFIG_X86_LOCAL_APIC
 	FIX_APIC_BASE,	/* local (CPU) APIC) -- required for SMP or not */
 #endif
@@ -115,14 +115,6 @@
 #define __fix_to_virt(x)	(FIXADDR_TOP - ((x) << PAGE_SHIFT))
 #define __virt_to_fix(x)	((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
 
-/*
- * This is the range that is readable by user mode, and things
- * acting like user mode such as get_user_pages.
- */
-#define FIXADDR_USER_START	(__fix_to_virt(FIX_VSYSCALL))
-#define FIXADDR_USER_END	(FIXADDR_USER_START + PAGE_SIZE)
-
-
 extern void __this_fixmap_does_not_exist(void);
 
 /*
diff --git a/include/asm-i386/mmu.h b/include/asm-i386/mmu.h
index f431a0b..8358dd3 100644
--- a/include/asm-i386/mmu.h
+++ b/include/asm-i386/mmu.h
@@ -12,6 +12,7 @@
 	int size;
 	struct semaphore sem;
 	void *ldt;
+	void *vdso;
 } mm_context_t;
 
 #endif
diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h
index e3a552f..f5bf544 100644
--- a/include/asm-i386/page.h
+++ b/include/asm-i386/page.h
@@ -96,6 +96,8 @@
 
 #ifndef __ASSEMBLY__
 
+struct vm_area_struct;
+
 /*
  * This much address space is reserved for vmalloc() and iomap()
  * as well as fixmap mappings.
@@ -139,6 +141,7 @@
 #include <asm-generic/memory_model.h>
 #include <asm-generic/page.h>
 
+#define __HAVE_ARCH_GATE_AREA 1
 #endif /* __KERNEL__ */
 
 #endif /* _I386_PAGE_H */
diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h
index ff1e2b1..2833fa2 100644
--- a/include/asm-i386/thread_info.h
+++ b/include/asm-i386/thread_info.h
@@ -37,6 +37,7 @@
 					 	   0-0xBFFFFFFF for user-thead
 						   0-0xFFFFFFFF for kernel-thread
 						*/
+	void			*sysenter_return;
 	struct restart_block    restart_block;
 
 	unsigned long           previous_esp;   /* ESP of the previous stack in case
diff --git a/include/asm-i386/unwind.h b/include/asm-i386/unwind.h
index d480f2e..69f0f1d 100644
--- a/include/asm-i386/unwind.h
+++ b/include/asm-i386/unwind.h
@@ -78,8 +78,8 @@
 	return user_mode_vm(&info->regs);
 #else
 	return info->regs.eip < PAGE_OFFSET
-	       || (info->regs.eip >= __fix_to_virt(FIX_VSYSCALL)
-	            && info->regs.eip < __fix_to_virt(FIX_VSYSCALL) + PAGE_SIZE)
+	       || (info->regs.eip >= __fix_to_virt(FIX_VDSO)
+	            && info->regs.eip < __fix_to_virt(FIX_VDSO) + PAGE_SIZE)
 	       || info->regs.esp < PAGE_OFFSET;
 #endif
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a929ea1..ff1fa87 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1065,5 +1065,7 @@
 extern int randomize_va_space;
 #endif
 
+const char *arch_vma_name(struct vm_area_struct *vma);
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 349ef90..bee12a7 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -189,6 +189,7 @@
 	VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */
 	VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */
 	VM_PANIC_ON_OOM=33,	/* panic at out-of-memory */
+	VM_VDSO_ENABLED=34,	/* map VDSO into new processes? */
 };