x86: introduce max_low_pfn_mapped for 64-bit
When more than 4 GB of memory is installed, don't map the big hole below 4 GB.
Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index a31a579..9c981c4 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -130,7 +130,7 @@
if (!phys || !size)
return NULL;
- if (phys+size <= (max_pfn_mapped << PAGE_SHIFT))
+ if (phys+size <= (max_low_pfn_mapped << PAGE_SHIFT))
return __va(phys);
offset = phys & (PAGE_SIZE - 1);
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
index 958526d..bd182b76 100644
--- a/arch/x86/kernel/cpu/amd_64.c
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -199,10 +199,14 @@
* Don't do it for gbpages because there seems very little
* benefit in doing so.
*/
- if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
- (tseg >> PMD_SHIFT) <
- (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
+ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
+ if ((tseg>>PMD_SHIFT) <
+ (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
+ ((tseg>>PMD_SHIFT) <
+ (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
+ (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
set_memory_4k((unsigned long)__va(tseg), 1);
+ }
}
}
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 3451e0b..9f5002e 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1056,7 +1056,7 @@
/*
* Find the highest page frame number we have available
*/
-unsigned long __init e820_end(void)
+static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
{
int i;
unsigned long last_pfn = 0;
@@ -1064,12 +1064,21 @@
for (i = 0; i < e820.nr_map; i++) {
struct e820entry *ei = &e820.map[i];
+ unsigned long start_pfn;
unsigned long end_pfn;
- if (ei->type != E820_RAM)
+ if (ei->type != type)
continue;
+ start_pfn = ei->addr >> PAGE_SHIFT;
end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;
+
+ if (start_pfn >= limit_pfn)
+ continue;
+ if (end_pfn > limit_pfn) {
+ last_pfn = limit_pfn;
+ break;
+ }
if (end_pfn > last_pfn)
last_pfn = end_pfn;
}
@@ -1083,7 +1092,15 @@
last_pfn, max_arch_pfn);
return last_pfn;
}
+unsigned long __init e820_end_of_ram_pfn(void)
+{
+ return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
+}
+unsigned long __init e820_end_of_low_ram_pfn(void)
+{
+ return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
+}
/*
* Finds an active region in the address range from start_pfn to last_pfn and
* returns its range in ei_startpfn and ei_endpfn for the e820 entry.
@@ -1206,7 +1223,7 @@
* the real mem size before original memory map is
* reset.
*/
- saved_max_pfn = e820_end();
+ saved_max_pfn = e820_end_of_ram_pfn();
#endif
e820.nr_map = 0;
userdef = 1;
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 94382fa..06cc8d4 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -473,7 +473,7 @@
size = md->num_pages << EFI_PAGE_SHIFT;
end = md->phys_addr + size;
- if (PFN_UP(end) <= max_pfn_mapped)
+ if (PFN_UP(end) <= max_low_pfn_mapped)
va = __va(md->phys_addr);
else
va = efi_ioremap(md->phys_addr, size);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index a7c3471..86fc2d6 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -713,14 +713,14 @@
* partially used pages are not usable - thus
* we are rounding upwards:
*/
- max_pfn = e820_end();
+ max_pfn = e820_end_of_ram_pfn();
/* preallocate 4k for mptable mpc */
early_reserve_e820_mpc_new();
/* update e820 for memory not covered by WB MTRRs */
mtrr_bp_init();
if (mtrr_trim_uncached_memory(max_pfn))
- max_pfn = e820_end();
+ max_pfn = e820_end_of_ram_pfn();
#ifdef CONFIG_X86_32
/* max_low_pfn get updated here */
@@ -732,12 +732,26 @@
/* How many end-of-memory variables you have, grandma! */
/* need this before calling reserve_initrd */
- max_low_pfn = max_pfn;
+ if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
+ max_low_pfn = e820_end_of_low_ram_pfn();
+ else
+ max_low_pfn = max_pfn;
+
high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
#endif
/* max_pfn_mapped is updated here */
- max_pfn_mapped = init_memory_mapping(0, (max_low_pfn << PAGE_SHIFT));
+ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
+ max_pfn_mapped = max_low_pfn_mapped;
+
+#ifdef CONFIG_X86_64
+ if (max_pfn > max_low_pfn) {
+ max_pfn_mapped = init_memory_mapping(1UL<<32,
+ max_pfn<<PAGE_SHIFT);
+ /* can we preserve max_low_pfn? */
+ max_low_pfn = max_pfn;
+ }
+#endif
/*
* NOTE: On x86-32, only from this point on, fixmaps are ready for use.
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index b5a0fd5..029e8cf 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -50,6 +50,7 @@
unsigned int __VMALLOC_RESERVE = 128 << 20;
+unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 48548ef..122bcef 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -53,6 +53,7 @@
* The direct mapping extends to max_pfn_mapped, so that we can directly access
* apertures, ACPI and other tables without having to play with fixmaps.
*/
+unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
static unsigned long dma_reserve __initdata;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index afd4005..0389cb8 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -536,8 +536,14 @@
set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
if (address >= (unsigned long)__va(0) &&
+ address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
+ split_page_count(level);
+
+#ifdef CONFIG_X86_64
+ if (address >= (unsigned long)__va(1UL<<32) &&
address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
split_page_count(level);
+#endif
/*
* Install the new, split up pagetable. Important details here:
@@ -655,12 +661,21 @@
if (cpa->pfn > max_pfn_mapped)
return 0;
+#ifdef CONFIG_X86_64
+ if (cpa->pfn > max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
+ return 0;
+#endif
/*
* No need to redo, when the primary call touched the direct
* mapping already:
*/
- if (!within(cpa->vaddr, PAGE_OFFSET,
- PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
+ if (!(within(cpa->vaddr, PAGE_OFFSET,
+ PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
+#ifdef CONFIG_X86_64
+ || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32),
+ PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
+#endif
+ )) {
alias_cpa = *cpa;
alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index a885a10..749766c3 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -449,7 +449,8 @@
if (retval < 0)
return 0;
- if (pfn <= max_pfn_mapped &&
+ if (((pfn <= max_low_pfn_mapped) ||
+ (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn <= max_pfn_mapped)) &&
ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
free_memtype(offset, offset + size);
printk(KERN_INFO
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 6ccd7a1..5281e34 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -334,7 +334,9 @@
flags = new_flags;
}
- if (vma->vm_pgoff <= max_pfn_mapped &&
+ if (((vma->vm_pgoff <= max_low_pfn_mapped) ||
+ (vma->vm_pgoff >= (1UL<<(32 - PAGE_SHIFT)) &&
+ vma->vm_pgoff <= max_pfn_mapped)) &&
ioremap_change_attr((unsigned long)__va(addr), len, flags)) {
free_memtype(addr, addr + len);
return -EINVAL;