x86: clean up the page table dumper and add 32-bit support
Clean up the page table dumper: fix boundary conditions, make the
address ranges table-driven, and adjust the formatting now that the
output goes to a separate virtual file rather than the kernel log.
Also generalize the dumper to 32 bits.
[ mingo@elte.hu: x86: fix the pagetable dumper ]
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index cb7002e..7ce8e70 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -56,7 +56,7 @@
config X86_PTDUMP
bool "Export kernel pagetable layout to userspace via debugfs"
- depends on X86_64
+ depends on DEBUG_KERNEL
select DEBUG_FS
help
Say Y here if you want to show the kernel pagetable layout in a
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 28632f4..9ab9889 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -3,6 +3,7 @@
obj-$(CONFIG_X86_32) += pgtable_32.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
obj-$(CONFIG_HIGHMEM) += highmem_32.o
@@ -12,5 +13,4 @@
obj-$(CONFIG_NUMA) += numa_64.o
obj-$(CONFIG_K8_NUMA) += k8topology_64.o
obj-$(CONFIG_ACPI_NUMA) += srat_64.o
-obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
endif
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 5e7f643..6d84033 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -12,9 +12,10 @@
* of the License.
*/
+#include <linux/debugfs.h>
+#include <linux/mm.h>
#include <linux/module.h>
#include <linux/seq_file.h>
-#include <linux/debugfs.h>
#include <asm/pgtable.h>
@@ -28,73 +29,107 @@
pgprot_t current_prot;
unsigned long start_address;
unsigned long current_address;
- int printed_vmalloc;
- int printed_modules;
- int printed_vmemmap;
- int printed_highmap;
+ const struct addr_marker *marker;
+};
+
+struct addr_marker {
+ unsigned long start_address;
+ const char *name;
+};
+
+/* Address space marker hints */
+static struct addr_marker address_markers[] = {
+ { 0, "User Space" },
+#ifdef CONFIG_X86_64
+ { 0x8000000000000000UL, "Kernel Space" },
+ { 0xffff810000000000UL, "Low Kernel Mapping" },
+ { VMALLOC_START, "vmalloc() Area" },
+ { MODULES_VADDR, "Modules" },
+ { MODULES_END, "End Modules" },
+ { VMEMMAP_START, "Vmemmap" },
+ { __START_KERNEL_map, "High Kernel Mapping" },
+#else
+ { PAGE_OFFSET, "Kernel Mapping" },
+ { 0/* VMALLOC_START */, "vmalloc() Area" },
+ { 0/*VMALLOC_END*/, "vmalloc() End" },
+# ifdef CONFIG_HIGHMEM
+ { 0/*PKMAP_BASE*/, "Persisent kmap() Area" },
+# endif
+ { 0/*FIXADDR_START*/, "Fixmap Area" },
+#endif
+ { -1, NULL } /* End of list */
};
/* Multipliers for offsets within the PTEs */
-#define LEVEL_4_MULT (PAGE_SIZE)
-#define LEVEL_3_MULT (512UL * LEVEL_4_MULT)
-#define LEVEL_2_MULT (512UL * LEVEL_3_MULT)
-#define LEVEL_1_MULT (512UL * LEVEL_2_MULT)
-
+#define PTE_LEVEL_MULT (PAGE_SIZE)
+#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
+#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
+#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
/*
* Print a readable form of a pgprot_t to the seq_file
*/
static void printk_prot(struct seq_file *m, pgprot_t prot, int level)
{
- unsigned long pr = pgprot_val(prot);
+ pgprotval_t pr = pgprot_val(prot);
+ static const char * const level_name[] =
+ { "cr3", "pgd", "pud", "pmd", "pte" };
- if (pr & _PAGE_USER)
- seq_printf(m, "USR ");
- else
- seq_printf(m, " ");
- if (pr & _PAGE_RW)
- seq_printf(m, "RW ");
- else
- seq_printf(m, "ro ");
- if (pr & _PAGE_PWT)
- seq_printf(m, "PWT ");
- else
- seq_printf(m, " ");
- if (pr & _PAGE_PCD)
- seq_printf(m, "PCD ");
- else
- seq_printf(m, " ");
-
- /* Bit 9 has a different meaning on level 3 vs 4 */
- if (level <= 3) {
- if (pr & _PAGE_PSE)
- seq_printf(m, "PSE ");
- else
- seq_printf(m, " ");
+ if (!pgprot_val(prot)) {
+ /* Not present */
+ seq_printf(m, " ");
} else {
- if (pr & _PAGE_PAT)
- seq_printf(m, "pat ");
+ if (pr & _PAGE_USER)
+ seq_printf(m, "USR ");
else
seq_printf(m, " ");
+ if (pr & _PAGE_RW)
+ seq_printf(m, "RW ");
+ else
+ seq_printf(m, "ro ");
+ if (pr & _PAGE_PWT)
+ seq_printf(m, "PWT ");
+ else
+ seq_printf(m, " ");
+ if (pr & _PAGE_PCD)
+ seq_printf(m, "PCD ");
+ else
+ seq_printf(m, " ");
+
+ /* Bit 7 means PSE at level <= 3 but PAT at level 4 */
+ if (level <= 3) {
+ if (pr & _PAGE_PSE)
+ seq_printf(m, "PSE ");
+ else
+ seq_printf(m, " ");
+ } else {
+ if (pr & _PAGE_PAT)
+ seq_printf(m, "pat ");
+ else
+ seq_printf(m, " ");
+ }
+ if (pr & _PAGE_GLOBAL)
+ seq_printf(m, "GLB ");
+ else
+ seq_printf(m, " ");
+ if (pr & _PAGE_NX)
+ seq_printf(m, "NX ");
+ else
+ seq_printf(m, "x ");
}
- if (pr & _PAGE_GLOBAL)
- seq_printf(m, "GLB ");
- else
- seq_printf(m, " ");
- if (pr & _PAGE_NX)
- seq_printf(m, "NX ");
- else
- seq_printf(m, "x ");
+ seq_printf(m, "%s\n", level_name[level]);
}
/*
- * Sign-extend the 48 bit address to 64 bit
+ * On 64-bit, sign-extend the 48-bit address to 64 bits; no-op on 32-bit
*/
-static unsigned long sign_extend(unsigned long u)
+static unsigned long normalize_addr(unsigned long u)
{
- if (u>>47)
- u = u | (0xffffUL << 48);
+#ifdef CONFIG_X86_64
+ return (signed long)(u << 16) >> 16;
+#else
return u;
+#endif
}
/*
@@ -103,81 +138,62 @@
* print what we collected so far.
*/
static void note_page(struct seq_file *m, struct pg_state *st,
- pgprot_t new_prot, int level)
+ pgprot_t new_prot, int level)
{
- unsigned long prot, cur;
+ pgprotval_t prot, cur;
+ static const char units[] = "KMGTPE";
/*
* If we have a "break" in the series, we need to flush the state that
- * we have now. "break" is either changing perms or a different level.
+ * we have now. "break" is either changing perms, levels or
+ * address space marker.
*/
prot = pgprot_val(new_prot) & ~(PTE_MASK);
cur = pgprot_val(st->current_prot) & ~(PTE_MASK);
- if ((prot != cur || level != st->level) &&
- st->current_address != st->start_address) {
- char unit = 'K';
+ if (!st->level) {
+ /* First entry */
+ st->current_prot = new_prot;
+ st->level = level;
+ st->marker = address_markers;
+ seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ } else if (prot != cur || level != st->level ||
+ st->current_address >= st->marker[1].start_address) {
+ const char *unit = units;
unsigned long delta;
/*
+ * Now print the actual finished series
+ */
+ seq_printf(m, "0x%p-0x%p ",
+ (void *)st->start_address,
+ (void *)st->current_address);
+
+ delta = (st->current_address - st->start_address) >> 10;
+ while (!(delta & 1023) && unit[1]) {
+ delta >>= 10;
+ unit++;
+ }
+ seq_printf(m, "%9lu%c ", delta, *unit);
+ printk_prot(m, st->current_prot, st->level);
+
+ /*
* We print markers for special areas of address space,
* such as the start of vmalloc space etc.
* This helps in the interpretation.
*/
- if (!st->printed_vmalloc &&
- st->start_address >= VMALLOC_START) {
- seq_printf(m, "---[ VMALLOC SPACE ]---\n");
- st->printed_vmalloc = 1;
- }
- if (!st->printed_modules &&
- st->start_address >= MODULES_VADDR) {
- seq_printf(m, "---[ MODULES SPACE ]---\n");
- st->printed_modules = 1;
- }
- if (st->printed_modules < 2 &&
- st->start_address >= MODULES_END) {
- seq_printf(m, "---[ END MODULES SPACE ]---\n");
- st->printed_modules = 2;
- }
- if (!st->printed_vmemmap &&
- st->start_address >= VMEMMAP_START) {
- seq_printf(m, "---[ VMMEMMAP SPACE ]---\n");
- st->printed_vmemmap = 1;
- }
- if (!st->printed_highmap &&
- st->start_address >= __START_KERNEL_map) {
- seq_printf(m, "---[ HIGH KERNEL MAPPING ]---\n");
- st->printed_highmap = 1;
+ if (st->current_address >= st->marker[1].start_address) {
+ st->marker++;
+ seq_printf(m, "---[ %s ]---\n", st->marker->name);
}
- /*
- * Now print the actual finished series
- */
- seq_printf(m, "[ %016lx - %016lx ",
- st->start_address, st->current_address);
-
- delta = (st->current_address - st->start_address) >> 10;
- if ((delta & 1023) == 0) {
- delta = delta >> 10;
- unit = 'M';
- }
- if (pgprot_val(st->current_prot)) {
- seq_printf(m, "Size %9lu%cb ", delta, unit);
- printk_prot(m, st->current_prot, st->level);
- seq_printf(m, "L%i]\n", st->level);
- } else {
- /* don't print protections on non-present memory */
- seq_printf(m, "%14lu%cb", delta, unit);
- seq_printf(m, " L%i]\n",
- st->level);
- }
st->start_address = st->current_address;
st->current_prot = new_prot;
st->level = level;
- };
+ }
}
-static void walk_level_4(struct seq_file *m, struct pg_state *st, pmd_t addr,
+static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
unsigned long P)
{
int i;
@@ -187,14 +203,15 @@
for (i = 0; i < PTRS_PER_PTE; i++) {
pgprot_t prot = pte_pgprot(*start);
- st->current_address = sign_extend(P + i * LEVEL_4_MULT);
+ st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
note_page(m, st, prot, 4);
start++;
}
}
+#if PTRS_PER_PMD > 1
-static void walk_level_3(struct seq_file *m, struct pg_state *st, pud_t addr,
+static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
unsigned long P)
{
int i;
@@ -202,25 +219,30 @@
start = (pmd_t *) pud_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PMD; i++) {
- st->current_address = sign_extend(P + i * LEVEL_3_MULT);
+ st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
if (!pmd_none(*start)) {
- unsigned long prot;
+ pgprotval_t prot = pmd_val(*start) & ~PTE_MASK;
- prot = pmd_val(*start) & ~(PTE_MASK);
- /* Deal with 2Mb pages */
- if (pmd_large(*start))
+ if (pmd_large(*start) || !pmd_present(*start))
note_page(m, st, __pgprot(prot), 3);
else
- walk_level_4(m, st, *start,
- P + i * LEVEL_3_MULT);
+ walk_pte_level(m, st, *start,
+ P + i * PMD_LEVEL_MULT);
} else
note_page(m, st, __pgprot(0), 3);
start++;
}
}
+#else
+#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p)
+#define pud_large(a) pmd_large(__pmd(pud_val(a)))
+#define pud_none(a) pmd_none(__pmd(pud_val(a)))
+#endif
-static void walk_level_2(struct seq_file *m, struct pg_state *st, pgd_t addr,
+#if PTRS_PER_PUD > 1
+
+static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
unsigned long P)
{
int i;
@@ -229,16 +251,15 @@
start = (pud_t *) pgd_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PUD; i++) {
+ st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
if (!pud_none(*start)) {
- unsigned long prot;
+ pgprotval_t prot = pud_val(*start) & ~PTE_MASK;
- prot = pud_val(*start) & ~(PTE_MASK);
- /* Deal with 1Gb pages */
- if (pud_large(*start))
+ if (pud_large(*start) || !pud_present(*start))
note_page(m, st, __pgprot(prot), 2);
else
- walk_level_3(m, st, *start,
- P + i * LEVEL_2_MULT);
+ walk_pmd_level(m, st, *start,
+ P + i * PUD_LEVEL_MULT);
} else
note_page(m, st, __pgprot(0), 2);
@@ -246,28 +267,48 @@
}
}
-static void walk_level_1(struct seq_file *m)
+#else
+#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p)
+#define pgd_large(a) pud_large(__pud(pgd_val(a)))
+#define pgd_none(a) pud_none(__pud(pgd_val(a)))
+#endif
+
+static void walk_pgd_level(struct seq_file *m)
{
+#ifdef CONFIG_X86_64
pgd_t *start = (pgd_t *) &init_level4_pgt;
+#else
+ pgd_t *start = swapper_pg_dir;
+#endif
int i;
struct pg_state st;
memset(&st, 0, sizeof(st));
- st.level = 1;
for (i = 0; i < PTRS_PER_PGD; i++) {
- if (!pgd_none(*start))
- walk_level_2(m, &st, *start, i * LEVEL_1_MULT);
- else
+ st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
+ if (!pgd_none(*start)) {
+ pgprotval_t prot = pgd_val(*start) & ~PTE_MASK;
+
+ if (pgd_large(*start) || !pgd_present(*start))
+ note_page(m, &st, __pgprot(prot), 1);
+ else
+ walk_pud_level(m, &st, *start,
+ i * PGD_LEVEL_MULT);
+ } else
note_page(m, &st, __pgprot(0), 1);
+
start++;
}
+
+ /* Flush out the last page */
+ st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
+ note_page(m, &st, __pgprot(0), 0);
}
static int ptdump_show(struct seq_file *m, void *v)
{
- seq_puts(m, "Kernel pagetable dump\n");
- walk_level_1(m);
+ walk_pgd_level(m);
return 0;
}
@@ -287,6 +328,18 @@
{
struct dentry *pe;
+#ifdef CONFIG_X86_32
+ /* Not a compile-time constant on x86-32 */
+ address_markers[2].start_address = VMALLOC_START;
+ address_markers[3].start_address = VMALLOC_END;
+# ifdef CONFIG_HIGHMEM
+ address_markers[4].start_address = PKMAP_BASE;
+ address_markers[5].start_address = FIXADDR_START;
+# else
+ address_markers[4].start_address = FIXADDR_START;
+# endif
+#endif
+
pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL,
&ptdump_fops);
if (!pe)