KVM: MMU: Partial swapping of guest memory
This allows guest memory to be swapped. Pages that are currently mapped
via shadow page tables are pinned into memory, but all other pages can
be freely swapped.
The patch makes gfn_to_page() elevate the page's reference count (even on
failure it now returns a referenced bad_page), and introduces
kvm_release_page() to pair with it; every gfn_to_page() caller must
release the page once it is done with it.
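For illustration, this is the shape every gfn_to_page() caller now takes,
mirroring the updated kvm_read_guest_page() below (the function name here
is just an example, not part of the patch):

	static int example_read_gfn(struct kvm *kvm, gfn_t gfn, void *data,
				    int offset, int len)
	{
		void *page_virt;
		struct page *page = gfn_to_page(kvm, gfn); /* takes a reference */

		if (is_error_page(page)) {
			kvm_release_page(page); /* bad_page is refcounted too */
			return -EFAULT;
		}
		page_virt = kmap_atomic(page, KM_USER0);
		memcpy(data, page_virt + offset, len);
		kunmap_atomic(page_virt, KM_USER0);
		kvm_release_page(page); /* unpin; the page may be swapped again */
		return 0;
	}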
Signed-off-by: Izik Eidus <izike@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index 0c17c76..df0711c 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -409,6 +409,7 @@
unsigned long *rmap;
unsigned long *dirty_bitmap;
int user_alloc; /* user allocated memory */
+ unsigned long userspace_addr;
};
struct kvm {
@@ -570,6 +571,7 @@
int is_error_page(struct page *page);
gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
+void kvm_release_page(struct page *page);
int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
int len);
int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 47000be..f86a47c 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -300,19 +300,6 @@
return kvm;
}
-static void kvm_free_userspace_physmem(struct kvm_memory_slot *free)
-{
- int i;
-
- for (i = 0; i < free->npages; ++i) {
- if (free->phys_mem[i]) {
- if (!PageReserved(free->phys_mem[i]))
- SetPageDirty(free->phys_mem[i]);
- page_cache_release(free->phys_mem[i]);
- }
- }
-}
-
static void kvm_free_kernel_physmem(struct kvm_memory_slot *free)
{
int i;
@@ -330,9 +317,7 @@
{
if (!dont || free->phys_mem != dont->phys_mem)
if (free->phys_mem) {
- if (free->user_alloc)
- kvm_free_userspace_physmem(free);
- else
+ if (!free->user_alloc)
kvm_free_kernel_physmem(free);
vfree(free->phys_mem);
}
@@ -361,7 +346,7 @@
for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i)
if (vcpu->pio.guest_pages[i]) {
- __free_page(vcpu->pio.guest_pages[i]);
+ kvm_release_page(vcpu->pio.guest_pages[i]);
vcpu->pio.guest_pages[i] = NULL;
}
}
@@ -752,19 +737,8 @@
memset(new.phys_mem, 0, npages * sizeof(struct page *));
memset(new.rmap, 0, npages * sizeof(*new.rmap));
if (user_alloc) {
- unsigned long pages_num;
-
new.user_alloc = 1;
- down_read(&current->mm->mmap_sem);
-
- pages_num = get_user_pages(current, current->mm,
- mem->userspace_addr,
- npages, 1, 1, new.phys_mem,
- NULL);
-
- up_read(&current->mm->mmap_sem);
- if (pages_num != npages)
- goto out_unlock;
+ new.userspace_addr = mem->userspace_addr;
} else {
for (i = 0; i < npages; ++i) {
new.phys_mem[i] = alloc_page(GFP_HIGHUSER
@@ -1039,12 +1013,39 @@
gfn = unalias_gfn(kvm, gfn);
slot = __gfn_to_memslot(kvm, gfn);
- if (!slot)
+ if (!slot) {
+ get_page(bad_page);
return bad_page;
+ }
+ if (slot->user_alloc) {
+ struct page *page[1];
+ int npages;
+
+ down_read(&current->mm->mmap_sem);
+ npages = get_user_pages(current, current->mm,
+ slot->userspace_addr
+ + (gfn - slot->base_gfn) * PAGE_SIZE, 1,
+ 1, 1, page, NULL);
+ up_read(&current->mm->mmap_sem);
+ if (npages != 1) {
+ get_page(bad_page);
+ return bad_page;
+ }
+ return page[0];
+ }
+ get_page(slot->phys_mem[gfn - slot->base_gfn]);
return slot->phys_mem[gfn - slot->base_gfn];
}
EXPORT_SYMBOL_GPL(gfn_to_page);
+void kvm_release_page(struct page *page)
+{
+ if (!PageReserved(page))
+ SetPageDirty(page);
+ put_page(page);
+}
+EXPORT_SYMBOL_GPL(kvm_release_page);
+
static int next_segment(unsigned long len, int offset)
{
if (len > PAGE_SIZE - offset)
@@ -1060,13 +1061,16 @@
struct page *page;
page = gfn_to_page(kvm, gfn);
- if (is_error_page(page))
+ if (is_error_page(page)) {
+ kvm_release_page(page);
return -EFAULT;
+ }
page_virt = kmap_atomic(page, KM_USER0);
memcpy(data, page_virt + offset, len);
kunmap_atomic(page_virt, KM_USER0);
+ kvm_release_page(page);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page);
@@ -1098,14 +1102,17 @@
struct page *page;
page = gfn_to_page(kvm, gfn);
- if (is_error_page(page))
+ if (is_error_page(page)) {
+ kvm_release_page(page);
return -EFAULT;
+ }
page_virt = kmap_atomic(page, KM_USER0);
memcpy(page_virt + offset, data, len);
kunmap_atomic(page_virt, KM_USER0);
mark_page_dirty(kvm, gfn);
+ kvm_release_page(page);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_guest_page);
@@ -1136,13 +1143,16 @@
struct page *page;
page = gfn_to_page(kvm, gfn);
- if (is_error_page(page))
+ if (is_error_page(page)) {
+ kvm_release_page(page);
return -EFAULT;
+ }
page_virt = kmap_atomic(page, KM_USER0);
memset(page_virt + offset, 0, len);
kunmap_atomic(page_virt, KM_USER0);
+ kvm_release_page(page);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
@@ -2070,8 +2080,6 @@
for (i = 0; i < nr_pages; ++i) {
mutex_lock(&vcpu->kvm->lock);
page = gva_to_page(vcpu, address + i * PAGE_SIZE);
- if (page)
- get_page(page);
vcpu->pio.guest_pages[i] = page;
mutex_unlock(&vcpu->kvm->lock);
if (!page) {
@@ -3074,9 +3082,10 @@
pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
page = gfn_to_page(kvm, pgoff);
- if (is_error_page(page))
+ if (is_error_page(page)) {
+ kvm_release_page(page);
return NOPAGE_SIGBUS;
- get_page(page);
+ }
if (type != NULL)
*type = VM_FAULT_MINOR;
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
index 2ad14fb..5d7af4b 100644
--- a/drivers/kvm/mmu.c
+++ b/drivers/kvm/mmu.c
@@ -425,6 +425,8 @@
if (!is_rmap_pte(*spte))
return;
page = page_header(__pa(spte));
+ kvm_release_page(pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >>
+ PAGE_SHIFT));
rmapp = gfn_to_rmap(kvm, page->gfns[spte - page->spt]);
if (!*rmapp) {
printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
@@ -911,6 +913,8 @@
PT_USER_MASK;
if (!was_rmapped)
rmap_add(vcpu, &table[index], v >> PAGE_SHIFT);
+ else
+ kvm_release_page(pfn_to_page(p >> PAGE_SHIFT));
return 0;
}
@@ -925,6 +929,7 @@
1, 3, &table[index]);
if (!new_table) {
pgprintk("nonpaging_map: ENOMEM\n");
+ kvm_release_page(pfn_to_page(p >> PAGE_SHIFT));
return -ENOMEM;
}
@@ -1039,8 +1044,11 @@
paddr = gpa_to_hpa(vcpu->kvm, addr & PT64_BASE_ADDR_MASK);
- if (is_error_hpa(paddr))
+ if (is_error_hpa(paddr)) {
+ kvm_release_page(pfn_to_page((paddr & PT64_BASE_ADDR_MASK)
+ >> PAGE_SHIFT));
return 1;
+ }
return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
}
@@ -1507,6 +1515,7 @@
} else {
gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
hpa_t hpa = gpa_to_hpa(vcpu, gpa);
+ struct page *page;
if (is_shadow_present_pte(ent)
&& (ent & PT64_BASE_ADDR_MASK) != hpa)
@@ -1519,6 +1528,9 @@
&& !is_error_hpa(hpa))
printk(KERN_ERR "audit: (%s) notrap shadow,"
" valid guest gva %lx\n", audit_msg, va);
+ page = pfn_to_page((gpa & PT64_BASE_ADDR_MASK)
+ >> PAGE_SHIFT);
+ kvm_release_page(page);
}
}
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
index 572e5b6..0f0266a 100644
--- a/drivers/kvm/paging_tmpl.h
+++ b/drivers/kvm/paging_tmpl.h
@@ -72,7 +72,7 @@
struct kvm_vcpu *vcpu, gva_t addr,
int write_fault, int user_fault, int fetch_fault)
{
- struct page *page;
+ struct page *page = NULL;
pt_element_t *table;
pt_element_t pte;
gfn_t table_gfn;
@@ -149,6 +149,7 @@
walker->inherited_ar &= pte;
--walker->level;
+ kvm_release_page(page);
}
if (write_fault && !is_dirty_pte(pte)) {
@@ -162,6 +163,7 @@
kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
}
+ kvm_release_page(page);
walker->pte = pte;
pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)pte);
return 1;
@@ -180,6 +182,8 @@
walker->error_code |= PFERR_USER_MASK;
if (fetch_fault)
walker->error_code |= PFERR_FETCH_MASK;
+ if (page)
+ kvm_release_page(page);
return 0;
}
@@ -223,6 +227,8 @@
if (is_error_hpa(paddr)) {
set_shadow_pte(shadow_pte,
shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
+ kvm_release_page(pfn_to_page((paddr & PT64_BASE_ADDR_MASK)
+ >> PAGE_SHIFT));
return;
}
@@ -260,9 +266,20 @@
pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
set_shadow_pte(shadow_pte, spte);
page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
- if (!was_rmapped)
+ if (!was_rmapped) {
rmap_add(vcpu, shadow_pte, (gaddr & PT64_BASE_ADDR_MASK)
>> PAGE_SHIFT);
+ if (!is_rmap_pte(*shadow_pte)) {
+ struct page *page;
+
+ page = pfn_to_page((paddr & PT64_BASE_ADDR_MASK)
+ >> PAGE_SHIFT);
+ kvm_release_page(page);
+ }
+ }
+ else
+ kvm_release_page(pfn_to_page((paddr & PT64_BASE_ADDR_MASK)
+ >> PAGE_SHIFT));
if (!ptwrite || !*ptwrite)
vcpu->last_pte_updated = shadow_pte;
}
@@ -486,19 +503,22 @@
{
int i;
pt_element_t *gpt;
+ struct page *page;
if (sp->role.metaphysical || PTTYPE == 32) {
nonpaging_prefetch_page(vcpu, sp);
return;
}
- gpt = kmap_atomic(gfn_to_page(vcpu->kvm, sp->gfn), KM_USER0);
+ page = gfn_to_page(vcpu->kvm, sp->gfn);
+ gpt = kmap_atomic(page, KM_USER0);
for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
if (is_present_pte(gpt[i]))
sp->spt[i] = shadow_trap_nonpresent_pte;
else
sp->spt[i] = shadow_notrap_nonpresent_pte;
kunmap_atomic(gpt, KM_USER0);
+ kvm_release_page(page);
}
#undef pt_element_t