Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 pti fixes from Thomas Gleixner:
"A set of updates for the x86/pti related code:
- Preserve r8-r11 in int $0x80. r8-r11 need to be preserved, but the
int$80 entry code removed that quite some time ago. Make it correct
again.
- A set of fixes for the Global Bit work which went into 4.17 and
caused a bunch of interesting regressions:
- Triggering a BUG in the page attribute code due to a missing
check for early boot stage
- Warnings in the page attribute code about holes in the kernel
text mapping which are caused by the freeing of the init code.
Handle such holes gracefully.
- Reduce the amount of kernel memory which is set global to the
actual text and do not incidentally overlap with data.
- Disable the global bit when RANDSTRUCT is enabled as it
partially defeats the hardening.
- Make the page protection setup correct for vma->page_prot
population again. The adjustment of the protections fell through
the crack during the Global bit rework and triggers warnings on
machines which do not support certain features, e.g. NX"
* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/entry/64/compat: Preserve r8-r11 in int $0x80
x86/pti: Filter at vma->vm_page_prot population
x86/pti: Disallow global kernel text with RANDSTRUCT
x86/pti: Reduce amount of kernel text allowed to be Global
x86/pti: Fix boot warning from Global-bit setting
x86/pti: Fix boot problems from Global-bit setting
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 00fcf81..c07f492 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -52,6 +52,7 @@
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
+ select ARCH_HAS_FILTER_PGPROT
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_KCOV if X86_64
@@ -273,6 +274,9 @@
config ARCH_HAS_CACHE_LINE_SIZE
def_bool y
+config ARCH_HAS_FILTER_PGPROT
+ def_bool y
+
config HAVE_SETUP_PER_CPU_AREA
def_bool y
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 9af927e..9de7f1e 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -84,13 +84,13 @@
pushq %rdx /* pt_regs->dx */
pushq %rcx /* pt_regs->cx */
pushq $-ENOSYS /* pt_regs->ax */
- pushq $0 /* pt_regs->r8 = 0 */
+ pushq %r8 /* pt_regs->r8 */
xorl %r8d, %r8d /* nospec r8 */
- pushq $0 /* pt_regs->r9 = 0 */
+ pushq %r9 /* pt_regs->r9 */
xorl %r9d, %r9d /* nospec r9 */
- pushq $0 /* pt_regs->r10 = 0 */
+ pushq %r10 /* pt_regs->r10 */
xorl %r10d, %r10d /* nospec r10 */
- pushq $0 /* pt_regs->r11 = 0 */
+ pushq %r11 /* pt_regs->r11 */
xorl %r11d, %r11d /* nospec r11 */
pushq %rbx /* pt_regs->rbx */
xorl %ebx, %ebx /* nospec rbx */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5f49b4f..f1633de 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -601,6 +601,11 @@
#define canon_pgprot(p) __pgprot(massage_pgprot(p))
+static inline pgprot_t arch_filter_pgprot(pgprot_t prot)
+{
+ return canon_pgprot(prot);
+}
+
static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
enum page_cache_mode pcm,
enum page_cache_mode new_pcm)
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 0f3d50f..3bded76e 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -93,6 +93,18 @@
static inline void split_page_count(int level) { }
#endif
+static inline int
+within(unsigned long addr, unsigned long start, unsigned long end)
+{
+ return addr >= start && addr < end;
+}
+
+static inline int
+within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
+{
+ return addr >= start && addr <= end;
+}
+
#ifdef CONFIG_X86_64
static inline unsigned long highmap_start_pfn(void)
@@ -106,20 +118,25 @@
return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT;
}
+static bool __cpa_pfn_in_highmap(unsigned long pfn)
+{
+ /*
+ * Kernel text has an alias mapping at a high address, known
+ * here as "highmap".
+ */
+ return within_inclusive(pfn, highmap_start_pfn(), highmap_end_pfn());
+}
+
+#else
+
+static bool __cpa_pfn_in_highmap(unsigned long pfn)
+{
+ /* There is no highmap on 32-bit */
+ return false;
+}
+
#endif
-static inline int
-within(unsigned long addr, unsigned long start, unsigned long end)
-{
- return addr >= start && addr < end;
-}
-
-static inline int
-within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
-{
- return addr >= start && addr <= end;
-}
-
/*
* Flushing functions
*/
@@ -172,7 +189,7 @@
static void cpa_flush_all(unsigned long cache)
{
- BUG_ON(irqs_disabled());
+ BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}
@@ -236,7 +253,7 @@
unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
#endif
- BUG_ON(irqs_disabled());
+ BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
@@ -1183,6 +1200,10 @@
cpa->numpages = 1;
cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
return 0;
+
+ } else if (__cpa_pfn_in_highmap(cpa->pfn)) {
+ /* Faults in the highmap are OK, so do not warn: */
+ return -EFAULT;
} else {
WARN(1, KERN_WARNING "CPA: called for zero pte. "
"vaddr = %lx cpa->vaddr = %lx\n", vaddr,
@@ -1335,8 +1356,7 @@
* to touch the high mapped kernel as well:
*/
if (!within(vaddr, (unsigned long)_text, _brk_end) &&
- within_inclusive(cpa->pfn, highmap_start_pfn(),
- highmap_end_pfn())) {
+ __cpa_pfn_in_highmap(cpa->pfn)) {
unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
__START_KERNEL_map - phys_base;
alias_cpa = *cpa;
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index f1fd52f..4d418e7 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -421,6 +421,16 @@
if (boot_cpu_has(X86_FEATURE_K8))
return false;
+ /*
+ * RANDSTRUCT derives its hardening benefits from the
+ * attacker's lack of knowledge about the layout of kernel
+ * data structures. Keep the kernel image non-global in
+ * cases where RANDSTRUCT is in use to help keep the layout a
+ * secret.
+ */
+ if (IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT))
+ return false;
+
return true;
}
@@ -430,12 +440,24 @@
*/
void pti_clone_kernel_text(void)
{
+ /*
+ * rodata is part of the kernel image and is normally
+ * readable on the filesystem or on the web. But, do not
+ * clone the areas past rodata, they might contain secrets.
+ */
unsigned long start = PFN_ALIGN(_text);
- unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE);
+ unsigned long end = (unsigned long)__end_rodata_hpage_align;
if (!pti_kernel_image_global_ok())
return;
+ pr_debug("mapping partial kernel image into user address space\n");
+
+ /*
+ * Note that this will undo _some_ of the work that
+ * pti_set_kernel_image_nonglobal() did to clear the
+ * global bit.
+ */
pti_clone_pmds(start, end, _PAGE_RW);
}
@@ -458,8 +480,6 @@
if (pti_kernel_image_global_ok())
return;
- pr_debug("set kernel image non-global\n");
-
set_memory_nonglobal(start, (end - start) >> PAGE_SHIFT);
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 188f195..9d5968d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -100,11 +100,20 @@
__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};
+#ifndef CONFIG_ARCH_HAS_FILTER_PGPROT
+static inline pgprot_t arch_filter_pgprot(pgprot_t prot)
+{
+ return prot;
+}
+#endif
+
pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
- return __pgprot(pgprot_val(protection_map[vm_flags &
+ pgprot_t ret = __pgprot(pgprot_val(protection_map[vm_flags &
(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) |
pgprot_val(arch_vm_get_page_prot(vm_flags)));
+
+ return arch_filter_pgprot(ret);
}
EXPORT_SYMBOL(vm_get_page_prot);
diff --git a/tools/testing/selftests/x86/test_syscall_vdso.c b/tools/testing/selftests/x86/test_syscall_vdso.c
index 4037035..c9c3281 100644
--- a/tools/testing/selftests/x86/test_syscall_vdso.c
+++ b/tools/testing/selftests/x86/test_syscall_vdso.c
@@ -100,12 +100,19 @@
" shl $32, %r8\n"
" orq $0x7f7f7f7f, %r8\n"
" movq %r8, %r9\n"
- " movq %r8, %r10\n"
- " movq %r8, %r11\n"
- " movq %r8, %r12\n"
- " movq %r8, %r13\n"
- " movq %r8, %r14\n"
- " movq %r8, %r15\n"
+ " incq %r9\n"
+ " movq %r9, %r10\n"
+ " incq %r10\n"
+ " movq %r10, %r11\n"
+ " incq %r11\n"
+ " movq %r11, %r12\n"
+ " incq %r12\n"
+ " movq %r12, %r13\n"
+ " incq %r13\n"
+ " movq %r13, %r14\n"
+ " incq %r14\n"
+ " movq %r14, %r15\n"
+ " incq %r15\n"
" ret\n"
" .code32\n"
" .popsection\n"
@@ -128,12 +135,13 @@
int err = 0;
int num = 8;
uint64_t *r64 = ®s64.r8;
+ uint64_t expected = 0x7f7f7f7f7f7f7f7fULL;
if (!kernel_is_64bit)
return 0;
do {
- if (*r64 == 0x7f7f7f7f7f7f7f7fULL)
+ if (*r64 == expected++)
continue; /* register did not change */
if (syscall_addr != (long)&int80) {
/*
@@ -147,18 +155,17 @@
continue;
}
} else {
- /* INT80 syscall entrypoint can be used by
+ /*
+ * INT80 syscall entrypoint can be used by
* 64-bit programs too, unlike SYSCALL/SYSENTER.
* Therefore it must preserve R12+
* (they are callee-saved registers in 64-bit C ABI).
*
- * This was probably historically not intended,
- * but R8..11 are clobbered (cleared to 0).
- * IOW: they are the only registers which aren't
- * preserved across INT80 syscall.
+ * Starting in Linux 4.17 (and any kernel that
+ * backports the change), R8..11 are preserved.
+ * Historically (and probably unintentionally), they
+ * were clobbered or zeroed.
*/
- if (*r64 == 0 && num <= 11)
- continue;
}
printf("[FAIL]\tR%d has changed:%016llx\n", num, *r64);
err++;