Christoffer Dall749cf76c2013-01-20 18:28:06 -05001/*
2 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
3 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License, version 2, as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 */
Christoffer Dall342cd0a2013-01-20 18:28:06 -050018
19#include <linux/mman.h>
20#include <linux/kvm_host.h>
21#include <linux/io.h>
Christoffer Dallad361f02012-11-01 17:14:45 +010022#include <linux/hugetlb.h>
Christoffer Dall45e96ea2013-01-20 18:43:58 -050023#include <trace/events/kvm.h>
Christoffer Dall342cd0a2013-01-20 18:28:06 -050024#include <asm/pgalloc.h>
Christoffer Dall94f8e642013-01-20 18:28:12 -050025#include <asm/cacheflush.h>
Christoffer Dall342cd0a2013-01-20 18:28:06 -050026#include <asm/kvm_arm.h>
27#include <asm/kvm_mmu.h>
Christoffer Dall45e96ea2013-01-20 18:43:58 -050028#include <asm/kvm_mmio.h>
Christoffer Dalld5d81842013-01-20 18:28:07 -050029#include <asm/kvm_asm.h>
Christoffer Dall94f8e642013-01-20 18:28:12 -050030#include <asm/kvm_emulate.h>
Marc Zyngier1e947ba2015-01-29 11:59:54 +000031#include <asm/virt.h>
Christoffer Dalld5d81842013-01-20 18:28:07 -050032
33#include "trace.h"
Christoffer Dall342cd0a2013-01-20 18:28:06 -050034
35extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[];
36
Marc Zyngier5a677ce2013-04-12 19:12:06 +010037static pgd_t *boot_hyp_pgd;
Marc Zyngier2fb41052013-04-12 19:12:03 +010038static pgd_t *hyp_pgd;
Ard Biesheuvele4c5a682015-03-19 16:42:28 +000039static pgd_t *merged_hyp_pgd;
Christoffer Dall342cd0a2013-01-20 18:28:06 -050040static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
41
Marc Zyngier5a677ce2013-04-12 19:12:06 +010042static unsigned long hyp_idmap_start;
43static unsigned long hyp_idmap_end;
44static phys_addr_t hyp_idmap_vector;
45
Christoffer Dall38f791a2014-10-10 12:14:28 +020046#define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
Mark Salter5d4e08c2014-03-28 14:25:19 +000047
Mario Smarduch15a49a42015-01-15 15:58:58 -080048#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0)
49#define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1)
50
51static bool memslot_is_logging(struct kvm_memory_slot *memslot)
52{
Mario Smarduch15a49a42015-01-15 15:58:58 -080053 return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
Mario Smarduch72760302015-01-15 15:59:01 -080054}
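/*
 * Illustrative sketch (not part of this file): how the flags above and
 * memslot_is_logging() are combined by the fault handling code further
 * down (see user_mem_abort()):
 *
 *	unsigned long flags = 0;
 *
 *	if (kvm_is_device_pfn(pfn))
 *		flags |= KVM_S2PTE_FLAG_IS_IOMAP;
 *	else if (memslot_is_logging(memslot))
 *		flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
 *
 *	ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
 *
 * Logging forces PTE-granularity mappings so that dirty pages can be
 * tracked page by page.
 */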
55
56/**
57 * kvm_flush_remote_tlbs() - flush all VM TLB entries for ARMv7/v8
58 * @kvm: pointer to kvm structure.
59 *
60 * Interface to the HYP function that flushes all TLB entries for this VM
61 */
62void kvm_flush_remote_tlbs(struct kvm *kvm)
63{
64 kvm_call_hyp(__kvm_tlb_flush_vmid, kvm);
Mario Smarduch15a49a42015-01-15 15:58:58 -080065}
Christoffer Dallad361f02012-11-01 17:14:45 +010066
Marc Zyngier48762762013-01-28 15:27:00 +000067static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
Christoffer Dalld5d81842013-01-20 18:28:07 -050068{
Marc Zyngierd4cb9df52013-05-14 12:11:34 +010069 /*
70 * This function also gets called when dealing with HYP page
71 * tables. As HYP doesn't have an associated struct kvm (and
72 * the HYP page tables are fairly static), we don't do
73 * anything there.
74 */
75 if (kvm)
76 kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
Christoffer Dalld5d81842013-01-20 18:28:07 -050077}
78
Marc Zyngier363ef892014-12-19 16:48:06 +000079/*
80 * D-Cache management functions. They take the page table entries by
81 * value, as they are flushing the cache using the kernel mapping (or
82 * kmap on 32bit).
83 */
84static void kvm_flush_dcache_pte(pte_t pte)
85{
86 __kvm_flush_dcache_pte(pte);
87}
88
89static void kvm_flush_dcache_pmd(pmd_t pmd)
90{
91 __kvm_flush_dcache_pmd(pmd);
92}
93
94static void kvm_flush_dcache_pud(pud_t pud)
95{
96 __kvm_flush_dcache_pud(pud);
97}
98
Ard Biesheuvele6fab542015-11-10 15:11:20 +010099static bool kvm_is_device_pfn(unsigned long pfn)
100{
101 return !pfn_valid(pfn);
102}
103
Mario Smarduch15a49a42015-01-15 15:58:58 -0800104/**
105 * stage2_dissolve_pmd() - clear and flush huge PMD entry
106 * @kvm: pointer to kvm structure.
107 * @addr: IPA
108 * @pmd: pmd pointer for IPA
109 *
110 * Clears the PMD entry and flushes the stage-1 and stage-2 TLB entries for
111 * @addr, so that pages in the range are subsequently mapped at PTE granularity.
112 */
113static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
114{
Suzuki K Poulosebbb3b6b2016-03-01 12:00:39 +0000115 if (!pmd_thp_or_huge(*pmd))
Mario Smarduch15a49a42015-01-15 15:58:58 -0800116 return;
117
118 pmd_clear(pmd);
119 kvm_tlb_flush_vmid_ipa(kvm, addr);
120 put_page(virt_to_page(pmd));
121}
122
Christoffer Dalld5d81842013-01-20 18:28:07 -0500123static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
124 int min, int max)
125{
126 void *page;
127
128 BUG_ON(max > KVM_NR_MEM_OBJS);
129 if (cache->nobjs >= min)
130 return 0;
131 while (cache->nobjs < max) {
132 page = (void *)__get_free_page(PGALLOC_GFP);
133 if (!page)
134 return -ENOMEM;
135 cache->objects[cache->nobjs++] = page;
136 }
137 return 0;
138}
139
140static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
141{
142 while (mc->nobjs)
143 free_page((unsigned long)mc->objects[--mc->nobjs]);
144}
145
146static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
147{
148 void *p;
149
150 BUG_ON(!mc || !mc->nobjs);
151 p = mc->objects[--mc->nobjs];
152 return p;
153}
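/*
 * Illustrative usage sketch (not part of this file): the memory cache is
 * topped up outside the mmu_lock, where allocation may sleep, and the page
 * table code then consumes pages from it without allocating while the lock
 * is held, as user_mem_abort() and kvm_phys_addr_ioremap() do below:
 *
 *	ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES,
 *				     KVM_NR_MEM_OBJS);
 *	if (ret)
 *		return ret;
 *	spin_lock(&kvm->mmu_lock);
 *	... stage2_set_pte() etc. pull table pages via mmu_memory_cache_alloc() ...
 *	spin_unlock(&kvm->mmu_lock);
 */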
154
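/*
 * The clear_stage2_*_entry() helpers below tear down an emptied lower-level
 * stage-2 table: they clear the parent entry, invalidate the TLB for the
 * covered IPA, free the child table and drop the page reference that was
 * taken on the parent table page when the entry was first populated.
 */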
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000155static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
Marc Zyngier979acd52013-08-06 13:05:48 +0100156{
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000157 pud_t *pud_table __maybe_unused = stage2_pud_offset(pgd, 0UL);
158 stage2_pgd_clear(pgd);
Christoffer Dall4f853a72014-05-09 23:31:31 +0200159 kvm_tlb_flush_vmid_ipa(kvm, addr);
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000160 stage2_pud_free(pud_table);
Christoffer Dall4f853a72014-05-09 23:31:31 +0200161 put_page(virt_to_page(pgd));
Marc Zyngier979acd52013-08-06 13:05:48 +0100162}
163
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000164static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500165{
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000166 pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(pud, 0);
167 VM_BUG_ON(stage2_pud_huge(*pud));
168 stage2_pud_clear(pud);
Christoffer Dall4f853a72014-05-09 23:31:31 +0200169 kvm_tlb_flush_vmid_ipa(kvm, addr);
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000170 stage2_pmd_free(pmd_table);
Marc Zyngier4f728272013-04-12 19:12:05 +0100171 put_page(virt_to_page(pud));
172}
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500173
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000174static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
Marc Zyngier4f728272013-04-12 19:12:05 +0100175{
Christoffer Dall4f853a72014-05-09 23:31:31 +0200176 pte_t *pte_table = pte_offset_kernel(pmd, 0);
Suzuki K Poulosebbb3b6b2016-03-01 12:00:39 +0000177 VM_BUG_ON(pmd_thp_or_huge(*pmd));
Christoffer Dall4f853a72014-05-09 23:31:31 +0200178 pmd_clear(pmd);
179 kvm_tlb_flush_vmid_ipa(kvm, addr);
180 pte_free_kernel(NULL, pte_table);
Marc Zyngier4f728272013-04-12 19:12:05 +0100181 put_page(virt_to_page(pmd));
182}
183
Marc Zyngier363ef892014-12-19 16:48:06 +0000184/*
185 * Unmapping vs dcache management:
186 *
187 * If a guest maps certain memory pages as uncached, all writes will
188 * bypass the data cache and go directly to RAM. However, the CPUs
189 * can still speculate reads (not writes) and fill cache lines with
190 * data.
191 *
192 * Those cache lines will be *clean* cache lines though, so a
193 * clean+invalidate operation is equivalent to an invalidate
194 * operation, because no cache lines are marked dirty.
195 *
196 * Those clean cache lines could be filled prior to an uncached write
197 * by the guest, and the cache coherent IO subsystem would therefore
198 * end up writing old data to disk.
199 *
200 * This is why right after unmapping a page/section and invalidating
201 * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
202 * the IO subsystem will never hit in the cache.
203 */
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000204static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
Christoffer Dall4f853a72014-05-09 23:31:31 +0200205 phys_addr_t addr, phys_addr_t end)
Marc Zyngier4f728272013-04-12 19:12:05 +0100206{
Christoffer Dall4f853a72014-05-09 23:31:31 +0200207 phys_addr_t start_addr = addr;
208 pte_t *pte, *start_pte;
209
210 start_pte = pte = pte_offset_kernel(pmd, addr);
211 do {
212 if (!pte_none(*pte)) {
Marc Zyngier363ef892014-12-19 16:48:06 +0000213 pte_t old_pte = *pte;
214
Christoffer Dall4f853a72014-05-09 23:31:31 +0200215 kvm_set_pte(pte, __pte(0));
Christoffer Dall4f853a72014-05-09 23:31:31 +0200216 kvm_tlb_flush_vmid_ipa(kvm, addr);
Marc Zyngier363ef892014-12-19 16:48:06 +0000217
218 /* No need to invalidate the cache for device mappings */
Ard Biesheuvel0de58f82015-12-03 09:25:22 +0100219 if (!kvm_is_device_pfn(pte_pfn(old_pte)))
Marc Zyngier363ef892014-12-19 16:48:06 +0000220 kvm_flush_dcache_pte(old_pte);
221
222 put_page(virt_to_page(pte));
Christoffer Dall4f853a72014-05-09 23:31:31 +0200223 }
224 } while (pte++, addr += PAGE_SIZE, addr != end);
225
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000226 if (stage2_pte_table_empty(start_pte))
227 clear_stage2_pmd_entry(kvm, pmd, start_addr);
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500228}
229
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000230static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
Christoffer Dall4f853a72014-05-09 23:31:31 +0200231 phys_addr_t addr, phys_addr_t end)
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500232{
Christoffer Dall4f853a72014-05-09 23:31:31 +0200233 phys_addr_t next, start_addr = addr;
234 pmd_t *pmd, *start_pmd;
Marc Zyngier000d3992013-03-05 02:43:17 +0000235
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000236 start_pmd = pmd = stage2_pmd_offset(pud, addr);
Christoffer Dall4f853a72014-05-09 23:31:31 +0200237 do {
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000238 next = stage2_pmd_addr_end(addr, end);
Christoffer Dall4f853a72014-05-09 23:31:31 +0200239 if (!pmd_none(*pmd)) {
Suzuki K Poulosebbb3b6b2016-03-01 12:00:39 +0000240 if (pmd_thp_or_huge(*pmd)) {
Marc Zyngier363ef892014-12-19 16:48:06 +0000241 pmd_t old_pmd = *pmd;
242
Christoffer Dall4f853a72014-05-09 23:31:31 +0200243 pmd_clear(pmd);
244 kvm_tlb_flush_vmid_ipa(kvm, addr);
Marc Zyngier363ef892014-12-19 16:48:06 +0000245
246 kvm_flush_dcache_pmd(old_pmd);
247
Christoffer Dall4f853a72014-05-09 23:31:31 +0200248 put_page(virt_to_page(pmd));
249 } else {
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000250 unmap_stage2_ptes(kvm, pmd, addr, next);
Marc Zyngier4f728272013-04-12 19:12:05 +0100251 }
252 }
Christoffer Dall4f853a72014-05-09 23:31:31 +0200253 } while (pmd++, addr = next, addr != end);
Marc Zyngier4f728272013-04-12 19:12:05 +0100254
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000255 if (stage2_pmd_table_empty(start_pmd))
256 clear_stage2_pud_entry(kvm, pud, start_addr);
Christoffer Dall4f853a72014-05-09 23:31:31 +0200257}
258
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000259static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd,
Christoffer Dall4f853a72014-05-09 23:31:31 +0200260 phys_addr_t addr, phys_addr_t end)
261{
262 phys_addr_t next, start_addr = addr;
263 pud_t *pud, *start_pud;
264
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000265 start_pud = pud = stage2_pud_offset(pgd, addr);
Christoffer Dall4f853a72014-05-09 23:31:31 +0200266 do {
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000267 next = stage2_pud_addr_end(addr, end);
268 if (!stage2_pud_none(*pud)) {
269 if (stage2_pud_huge(*pud)) {
Marc Zyngier363ef892014-12-19 16:48:06 +0000270 pud_t old_pud = *pud;
271
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000272 stage2_pud_clear(pud);
Christoffer Dall4f853a72014-05-09 23:31:31 +0200273 kvm_tlb_flush_vmid_ipa(kvm, addr);
Marc Zyngier363ef892014-12-19 16:48:06 +0000274 kvm_flush_dcache_pud(old_pud);
Christoffer Dall4f853a72014-05-09 23:31:31 +0200275 put_page(virt_to_page(pud));
276 } else {
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000277 unmap_stage2_pmds(kvm, pud, addr, next);
Christoffer Dall4f853a72014-05-09 23:31:31 +0200278 }
279 }
280 } while (pud++, addr = next, addr != end);
281
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000282 if (stage2_pud_table_empty(start_pud))
283 clear_stage2_pgd_entry(kvm, pgd, start_addr);
Christoffer Dall4f853a72014-05-09 23:31:31 +0200284}
285
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000286/**
287 * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
288 * @kvm: The VM pointer
289 * @start: The intermediate physical base address of the range to unmap
290 * @size: The size of the area to unmap
291 *
292 * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
293 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
294 * destroying the VM), otherwise another faulting VCPU may come in and mess
295 * with things behind our backs.
296 */
297static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
Christoffer Dall4f853a72014-05-09 23:31:31 +0200298{
299 pgd_t *pgd;
300 phys_addr_t addr = start, end = start + size;
301 phys_addr_t next;
302
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000303 pgd = kvm->arch.pgd + stage2_pgd_index(addr);
Christoffer Dall4f853a72014-05-09 23:31:31 +0200304 do {
Suzuki K Poulose7a1c8312016-03-23 12:08:02 +0000305 next = stage2_pgd_addr_end(addr, end);
306 if (!stage2_pgd_none(*pgd))
307 unmap_stage2_puds(kvm, pgd, addr, next);
Christoffer Dall4f853a72014-05-09 23:31:31 +0200308 } while (pgd++, addr = next, addr != end);
Marc Zyngier000d3992013-03-05 02:43:17 +0000309}
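/*
 * Illustrative call pattern (not part of this file): callers such as
 * stage2_unmap_memslot() below tear down a guest-physical range with
 *
 *	spin_lock(&kvm->mmu_lock);
 *	unmap_stage2_range(kvm, gpa, size);
 *	spin_unlock(&kvm->mmu_lock);
 *
 * kvm_free_stage2_pgd() is the exception noted above: it runs while the VM
 * is being torn down and therefore does not need the lock.
 */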
310
Marc Zyngier9d218a12014-01-15 12:50:23 +0000311static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
312 phys_addr_t addr, phys_addr_t end)
313{
314 pte_t *pte;
315
316 pte = pte_offset_kernel(pmd, addr);
317 do {
Ard Biesheuvel0de58f82015-12-03 09:25:22 +0100318 if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte)))
Marc Zyngier363ef892014-12-19 16:48:06 +0000319 kvm_flush_dcache_pte(*pte);
Marc Zyngier9d218a12014-01-15 12:50:23 +0000320 } while (pte++, addr += PAGE_SIZE, addr != end);
321}
322
323static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
324 phys_addr_t addr, phys_addr_t end)
325{
326 pmd_t *pmd;
327 phys_addr_t next;
328
Suzuki K Poulose70fd1902016-03-22 18:33:45 +0000329 pmd = stage2_pmd_offset(pud, addr);
Marc Zyngier9d218a12014-01-15 12:50:23 +0000330 do {
Suzuki K Poulose70fd1902016-03-22 18:33:45 +0000331 next = stage2_pmd_addr_end(addr, end);
Marc Zyngier9d218a12014-01-15 12:50:23 +0000332 if (!pmd_none(*pmd)) {
Suzuki K Poulosebbb3b6b2016-03-01 12:00:39 +0000333 if (pmd_thp_or_huge(*pmd))
Marc Zyngier363ef892014-12-19 16:48:06 +0000334 kvm_flush_dcache_pmd(*pmd);
335 else
Marc Zyngier9d218a12014-01-15 12:50:23 +0000336 stage2_flush_ptes(kvm, pmd, addr, next);
Marc Zyngier9d218a12014-01-15 12:50:23 +0000337 }
338 } while (pmd++, addr = next, addr != end);
339}
340
341static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
342 phys_addr_t addr, phys_addr_t end)
343{
344 pud_t *pud;
345 phys_addr_t next;
346
Suzuki K Poulose70fd1902016-03-22 18:33:45 +0000347 pud = stage2_pud_offset(pgd, addr);
Marc Zyngier9d218a12014-01-15 12:50:23 +0000348 do {
Suzuki K Poulose70fd1902016-03-22 18:33:45 +0000349 next = stage2_pud_addr_end(addr, end);
350 if (!stage2_pud_none(*pud)) {
351 if (stage2_pud_huge(*pud))
Marc Zyngier363ef892014-12-19 16:48:06 +0000352 kvm_flush_dcache_pud(*pud);
353 else
Marc Zyngier9d218a12014-01-15 12:50:23 +0000354 stage2_flush_pmds(kvm, pud, addr, next);
Marc Zyngier9d218a12014-01-15 12:50:23 +0000355 }
356 } while (pud++, addr = next, addr != end);
357}
358
359static void stage2_flush_memslot(struct kvm *kvm,
360 struct kvm_memory_slot *memslot)
361{
362 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
363 phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
364 phys_addr_t next;
365 pgd_t *pgd;
366
Suzuki K Poulose70fd1902016-03-22 18:33:45 +0000367 pgd = kvm->arch.pgd + stage2_pgd_index(addr);
Marc Zyngier9d218a12014-01-15 12:50:23 +0000368 do {
Suzuki K Poulose70fd1902016-03-22 18:33:45 +0000369 next = stage2_pgd_addr_end(addr, end);
Marc Zyngier9d218a12014-01-15 12:50:23 +0000370 stage2_flush_puds(kvm, pgd, addr, next);
371 } while (pgd++, addr = next, addr != end);
372}
373
374/**
375 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
376 * @kvm: The struct kvm pointer
377 *
378 * Go through the stage 2 page tables and invalidate any cache lines
379 * backing memory already mapped to the VM.
380 */
Marc Zyngier3c1e7162014-12-19 16:05:31 +0000381static void stage2_flush_vm(struct kvm *kvm)
Marc Zyngier9d218a12014-01-15 12:50:23 +0000382{
383 struct kvm_memslots *slots;
384 struct kvm_memory_slot *memslot;
385 int idx;
386
387 idx = srcu_read_lock(&kvm->srcu);
388 spin_lock(&kvm->mmu_lock);
389
390 slots = kvm_memslots(kvm);
391 kvm_for_each_memslot(memslot, slots)
392 stage2_flush_memslot(kvm, memslot);
393
394 spin_unlock(&kvm->mmu_lock);
395 srcu_read_unlock(&kvm->srcu, idx);
396}
397
Suzuki K Poulose64f32492016-03-22 18:56:21 +0000398static void clear_hyp_pgd_entry(pgd_t *pgd)
399{
400 pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
401 pgd_clear(pgd);
402 pud_free(NULL, pud_table);
403 put_page(virt_to_page(pgd));
404}
405
406static void clear_hyp_pud_entry(pud_t *pud)
407{
408 pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
409 VM_BUG_ON(pud_huge(*pud));
410 pud_clear(pud);
411 pmd_free(NULL, pmd_table);
412 put_page(virt_to_page(pud));
413}
414
415static void clear_hyp_pmd_entry(pmd_t *pmd)
416{
417 pte_t *pte_table = pte_offset_kernel(pmd, 0);
418 VM_BUG_ON(pmd_thp_or_huge(*pmd));
419 pmd_clear(pmd);
420 pte_free_kernel(NULL, pte_table);
421 put_page(virt_to_page(pmd));
422}
423
424static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
425{
426 pte_t *pte, *start_pte;
427
428 start_pte = pte = pte_offset_kernel(pmd, addr);
429 do {
430 if (!pte_none(*pte)) {
431 kvm_set_pte(pte, __pte(0));
432 put_page(virt_to_page(pte));
433 }
434 } while (pte++, addr += PAGE_SIZE, addr != end);
435
436 if (hyp_pte_table_empty(start_pte))
437 clear_hyp_pmd_entry(pmd);
438}
439
440static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
441{
442 phys_addr_t next;
443 pmd_t *pmd, *start_pmd;
444
445 start_pmd = pmd = pmd_offset(pud, addr);
446 do {
447 next = pmd_addr_end(addr, end);
448 /* Hyp doesn't use huge pmds */
449 if (!pmd_none(*pmd))
450 unmap_hyp_ptes(pmd, addr, next);
451 } while (pmd++, addr = next, addr != end);
452
453 if (hyp_pmd_table_empty(start_pmd))
454 clear_hyp_pud_entry(pud);
455}
456
457static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
458{
459 phys_addr_t next;
460 pud_t *pud, *start_pud;
461
462 start_pud = pud = pud_offset(pgd, addr);
463 do {
464 next = pud_addr_end(addr, end);
465 /* Hyp doesn't use huge puds */
466 if (!pud_none(*pud))
467 unmap_hyp_pmds(pud, addr, next);
468 } while (pud++, addr = next, addr != end);
469
470 if (hyp_pud_table_empty(start_pud))
471 clear_hyp_pgd_entry(pgd);
472}
473
474static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
475{
476 pgd_t *pgd;
477 phys_addr_t addr = start, end = start + size;
478 phys_addr_t next;
479
480 /*
481 * We don't unmap anything from HYP, except at the hyp tear down.
482 * Hence, we don't have to invalidate the TLBs here.
483 */
484 pgd = pgdp + pgd_index(addr);
485 do {
486 next = pgd_addr_end(addr, end);
487 if (!pgd_none(*pgd))
488 unmap_hyp_puds(pgd, addr, next);
489 } while (pgd++, addr = next, addr != end);
490}
491
Marc Zyngier000d3992013-03-05 02:43:17 +0000492/**
Marc Zyngierd157f4a2013-04-12 19:12:07 +0100493 * free_boot_hyp_pgd - free HYP boot page tables
494 *
495 * Free the HYP boot page tables and remove the trampoline mapping from the runtime HYP page tables.
496 */
497void free_boot_hyp_pgd(void)
498{
499 mutex_lock(&kvm_hyp_pgd_mutex);
500
501 if (boot_hyp_pgd) {
Suzuki K Poulose64f32492016-03-22 18:56:21 +0000502 unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
503 unmap_hyp_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
Christoffer Dall38f791a2014-10-10 12:14:28 +0200504 free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
Marc Zyngierd157f4a2013-04-12 19:12:07 +0100505 boot_hyp_pgd = NULL;
506 }
507
508 if (hyp_pgd)
Suzuki K Poulose64f32492016-03-22 18:56:21 +0000509 unmap_hyp_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
Marc Zyngierd157f4a2013-04-12 19:12:07 +0100510
Marc Zyngierd157f4a2013-04-12 19:12:07 +0100511 mutex_unlock(&kvm_hyp_pgd_mutex);
512}
513
514/**
Marc Zyngier4f728272013-04-12 19:12:05 +0100515 * free_hyp_pgds - free Hyp-mode page tables
Marc Zyngier000d3992013-03-05 02:43:17 +0000516 *
Marc Zyngier5a677ce2013-04-12 19:12:06 +0100517 * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
518 * therefore contains either mappings in the kernel memory area (above
519 * PAGE_OFFSET), or device mappings in the vmalloc range (from
520 * VMALLOC_START to VMALLOC_END).
521 *
522 * boot_hyp_pgd should only map two pages for the init code.
Marc Zyngier000d3992013-03-05 02:43:17 +0000523 */
Marc Zyngier4f728272013-04-12 19:12:05 +0100524void free_hyp_pgds(void)
Marc Zyngier000d3992013-03-05 02:43:17 +0000525{
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500526 unsigned long addr;
527
Marc Zyngierd157f4a2013-04-12 19:12:07 +0100528 free_boot_hyp_pgd();
Marc Zyngier4f728272013-04-12 19:12:05 +0100529
Marc Zyngierd157f4a2013-04-12 19:12:07 +0100530 mutex_lock(&kvm_hyp_pgd_mutex);
Marc Zyngier5a677ce2013-04-12 19:12:06 +0100531
Marc Zyngier4f728272013-04-12 19:12:05 +0100532 if (hyp_pgd) {
533 for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
Suzuki K Poulose64f32492016-03-22 18:56:21 +0000534 unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
Marc Zyngier4f728272013-04-12 19:12:05 +0100535 for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
Suzuki K Poulose64f32492016-03-22 18:56:21 +0000536 unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
Marc Zyngierd4cb9df52013-05-14 12:11:34 +0100537
Christoffer Dall38f791a2014-10-10 12:14:28 +0200538 free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
Marc Zyngierd157f4a2013-04-12 19:12:07 +0100539 hyp_pgd = NULL;
Marc Zyngier4f728272013-04-12 19:12:05 +0100540 }
Ard Biesheuvele4c5a682015-03-19 16:42:28 +0000541 if (merged_hyp_pgd) {
542 clear_page(merged_hyp_pgd);
543 free_page((unsigned long)merged_hyp_pgd);
544 merged_hyp_pgd = NULL;
545 }
Marc Zyngier4f728272013-04-12 19:12:05 +0100546
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500547 mutex_unlock(&kvm_hyp_pgd_mutex);
548}
549
550static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
Marc Zyngier6060df82013-04-12 19:12:01 +0100551 unsigned long end, unsigned long pfn,
552 pgprot_t prot)
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500553{
554 pte_t *pte;
555 unsigned long addr;
556
Marc Zyngier3562c762013-04-12 19:12:02 +0100557 addr = start;
558 do {
Marc Zyngier6060df82013-04-12 19:12:01 +0100559 pte = pte_offset_kernel(pmd, addr);
560 kvm_set_pte(pte, pfn_pte(pfn, prot));
Marc Zyngier4f728272013-04-12 19:12:05 +0100561 get_page(virt_to_page(pte));
Marc Zyngier5a677ce2013-04-12 19:12:06 +0100562 kvm_flush_dcache_to_poc(pte, sizeof(*pte));
Marc Zyngier6060df82013-04-12 19:12:01 +0100563 pfn++;
Marc Zyngier3562c762013-04-12 19:12:02 +0100564 } while (addr += PAGE_SIZE, addr != end);
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500565}
566
567static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
Marc Zyngier6060df82013-04-12 19:12:01 +0100568 unsigned long end, unsigned long pfn,
569 pgprot_t prot)
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500570{
571 pmd_t *pmd;
572 pte_t *pte;
573 unsigned long addr, next;
574
Marc Zyngier3562c762013-04-12 19:12:02 +0100575 addr = start;
576 do {
Marc Zyngier6060df82013-04-12 19:12:01 +0100577 pmd = pmd_offset(pud, addr);
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500578
579 BUG_ON(pmd_sect(*pmd));
580
581 if (pmd_none(*pmd)) {
Marc Zyngier6060df82013-04-12 19:12:01 +0100582 pte = pte_alloc_one_kernel(NULL, addr);
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500583 if (!pte) {
584 kvm_err("Cannot allocate Hyp pte\n");
585 return -ENOMEM;
586 }
587 pmd_populate_kernel(NULL, pmd, pte);
Marc Zyngier4f728272013-04-12 19:12:05 +0100588 get_page(virt_to_page(pmd));
Marc Zyngier5a677ce2013-04-12 19:12:06 +0100589 kvm_flush_dcache_to_poc(pmd, sizeof(*pmd));
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500590 }
591
592 next = pmd_addr_end(addr, end);
593
Marc Zyngier6060df82013-04-12 19:12:01 +0100594 create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
595 pfn += (next - addr) >> PAGE_SHIFT;
Marc Zyngier3562c762013-04-12 19:12:02 +0100596 } while (addr = next, addr != end);
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500597
598 return 0;
599}
600
Christoffer Dall38f791a2014-10-10 12:14:28 +0200601static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start,
602 unsigned long end, unsigned long pfn,
603 pgprot_t prot)
604{
605 pud_t *pud;
606 pmd_t *pmd;
607 unsigned long addr, next;
608 int ret;
609
610 addr = start;
611 do {
612 pud = pud_offset(pgd, addr);
613
614 if (pud_none_or_clear_bad(pud)) {
615 pmd = pmd_alloc_one(NULL, addr);
616 if (!pmd) {
617 kvm_err("Cannot allocate Hyp pmd\n");
618 return -ENOMEM;
619 }
620 pud_populate(NULL, pud, pmd);
621 get_page(virt_to_page(pud));
622 kvm_flush_dcache_to_poc(pud, sizeof(*pud));
623 }
624
625 next = pud_addr_end(addr, end);
626 ret = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
627 if (ret)
628 return ret;
629 pfn += (next - addr) >> PAGE_SHIFT;
630 } while (addr = next, addr != end);
631
632 return 0;
633}
634
Marc Zyngier6060df82013-04-12 19:12:01 +0100635static int __create_hyp_mappings(pgd_t *pgdp,
636 unsigned long start, unsigned long end,
637 unsigned long pfn, pgprot_t prot)
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500638{
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500639 pgd_t *pgd;
640 pud_t *pud;
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500641 unsigned long addr, next;
642 int err = 0;
643
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500644 mutex_lock(&kvm_hyp_pgd_mutex);
Marc Zyngier3562c762013-04-12 19:12:02 +0100645 addr = start & PAGE_MASK;
646 end = PAGE_ALIGN(end);
647 do {
Marc Zyngier6060df82013-04-12 19:12:01 +0100648 pgd = pgdp + pgd_index(addr);
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500649
Christoffer Dall38f791a2014-10-10 12:14:28 +0200650 if (pgd_none(*pgd)) {
651 pud = pud_alloc_one(NULL, addr);
652 if (!pud) {
653 kvm_err("Cannot allocate Hyp pud\n");
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500654 err = -ENOMEM;
655 goto out;
656 }
Christoffer Dall38f791a2014-10-10 12:14:28 +0200657 pgd_populate(NULL, pgd, pud);
658 get_page(virt_to_page(pgd));
659 kvm_flush_dcache_to_poc(pgd, sizeof(*pgd));
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500660 }
661
662 next = pgd_addr_end(addr, end);
Christoffer Dall38f791a2014-10-10 12:14:28 +0200663 err = create_hyp_pud_mappings(pgd, addr, next, pfn, prot);
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500664 if (err)
665 goto out;
Marc Zyngier6060df82013-04-12 19:12:01 +0100666 pfn += (next - addr) >> PAGE_SHIFT;
Marc Zyngier3562c762013-04-12 19:12:02 +0100667 } while (addr = next, addr != end);
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500668out:
669 mutex_unlock(&kvm_hyp_pgd_mutex);
670 return err;
671}
672
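/*
 * Translate a kernel virtual address to a physical address: linear-map
 * (lowmem) addresses can use __pa() directly, while vmalloc addresses must
 * be resolved page by page through the kernel page tables, hence
 * vmalloc_to_page() plus the offset within the page.
 */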
Christoffer Dall40c27292013-11-15 13:14:12 -0800673static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
674{
675 if (!is_vmalloc_addr(kaddr)) {
676 BUG_ON(!virt_addr_valid(kaddr));
677 return __pa(kaddr);
678 } else {
679 return page_to_phys(vmalloc_to_page(kaddr)) +
680 offset_in_page(kaddr);
681 }
682}
683
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500684/**
Marc Zyngier06e8c3b2012-10-28 01:09:14 +0100685 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500686 * @from: The virtual kernel start address of the range
687 * @to: The virtual kernel end address of the range (exclusive)
688 *
Marc Zyngier06e8c3b2012-10-28 01:09:14 +0100689 * The Hyp-mode mapping uses the same virtual address as the kernel
690 * (modulo HYP_PAGE_OFFSET) and points to the same underlying
691 * physical pages.
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500692 */
693int create_hyp_mappings(void *from, void *to)
694{
Christoffer Dall40c27292013-11-15 13:14:12 -0800695 phys_addr_t phys_addr;
696 unsigned long virt_addr;
Marc Zyngier6060df82013-04-12 19:12:01 +0100697 unsigned long start = KERN_TO_HYP((unsigned long)from);
698 unsigned long end = KERN_TO_HYP((unsigned long)to);
699
Marc Zyngier1e947ba2015-01-29 11:59:54 +0000700 if (is_kernel_in_hyp_mode())
701 return 0;
702
Christoffer Dall40c27292013-11-15 13:14:12 -0800703 start = start & PAGE_MASK;
704 end = PAGE_ALIGN(end);
Marc Zyngier6060df82013-04-12 19:12:01 +0100705
Christoffer Dall40c27292013-11-15 13:14:12 -0800706 for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
707 int err;
708
709 phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
710 err = __create_hyp_mappings(hyp_pgd, virt_addr,
711 virt_addr + PAGE_SIZE,
712 __phys_to_pfn(phys_addr),
713 PAGE_HYP);
714 if (err)
715 return err;
716 }
717
718 return 0;
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500719}
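/*
 * Illustrative usage sketch (not part of this file; the real call sites live
 * in the arch-specific init code): any kernel object that the HYP code will
 * dereference must first be mapped into HYP, e.g.
 *
 *	err = create_hyp_mappings(obj_start, obj_end);
 *	if (err)
 *		goto out_err;
 *
 * where obj_start/obj_end are placeholder pointers bounding the object, and
 * the resulting HYP VA is KERN_TO_HYP() of the kernel VA.
 */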
720
721/**
Marc Zyngier06e8c3b2012-10-28 01:09:14 +0100722 * create_hyp_io_mappings - duplicate a kernel IO mapping into Hyp mode
723 * @from: The kernel start VA of the range
724 * @to: The kernel end VA of the range (exclusive)
Marc Zyngier6060df82013-04-12 19:12:01 +0100725 * @phys_addr: The physical start address which gets mapped
Marc Zyngier06e8c3b2012-10-28 01:09:14 +0100726 *
727 * The resulting HYP VA is the same as the kernel VA, modulo
728 * HYP_PAGE_OFFSET.
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500729 */
Marc Zyngier6060df82013-04-12 19:12:01 +0100730int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500731{
Marc Zyngier6060df82013-04-12 19:12:01 +0100732 unsigned long start = KERN_TO_HYP((unsigned long)from);
733 unsigned long end = KERN_TO_HYP((unsigned long)to);
734
Marc Zyngier1e947ba2015-01-29 11:59:54 +0000735 if (is_kernel_in_hyp_mode())
736 return 0;
737
Marc Zyngier6060df82013-04-12 19:12:01 +0100738 /* Check for a valid kernel IO mapping */
739 if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1))
740 return -EINVAL;
741
742 return __create_hyp_mappings(hyp_pgd, start, end,
743 __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
Christoffer Dall342cd0a2013-01-20 18:28:06 -0500744}
745
Marc Zyngiera9873702015-03-10 19:06:59 +0000746/* Free the HW pgd, one page at a time */
747static void kvm_free_hwpgd(void *hwpgd)
748{
749 free_pages_exact(hwpgd, kvm_get_hwpgd_size());
750}
751
752/* Allocate the HW PGD, making sure that each page gets its own refcount */
753static void *kvm_alloc_hwpgd(void)
754{
755 unsigned int size = kvm_get_hwpgd_size();
756
757 return alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
758}
759
Christoffer Dalld5d81842013-01-20 18:28:07 -0500760/**
761 * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
762 * @kvm: The KVM struct pointer for the VM.
763 *
Vladimir Murzin9d4dc6882015-11-16 11:28:16 +0000764 * Allocates only the stage-2 HW PGD level table(s) (can support either full
765 * 40-bit input addresses or limited to 32-bit input addresses). Clears the
766 * allocated pages.
Christoffer Dalld5d81842013-01-20 18:28:07 -0500767 *
768 * Note we don't need locking here as this is only called when the VM is
769 * created, which can only be done once.
770 */
771int kvm_alloc_stage2_pgd(struct kvm *kvm)
772{
773 pgd_t *pgd;
Marc Zyngiera9873702015-03-10 19:06:59 +0000774 void *hwpgd;
Christoffer Dalld5d81842013-01-20 18:28:07 -0500775
776 if (kvm->arch.pgd != NULL) {
777 kvm_err("kvm_arch already initialized?\n");
778 return -EINVAL;
779 }
780
Marc Zyngiera9873702015-03-10 19:06:59 +0000781 hwpgd = kvm_alloc_hwpgd();
782 if (!hwpgd)
783 return -ENOMEM;
784
Suzuki K Poulose120f0772016-03-01 10:03:06 +0000785 /*
786 * When the kernel uses more levels of page tables than the
Marc Zyngiera9873702015-03-10 19:06:59 +0000787 * guest, we allocate a fake PGD and pre-populate it to point
788 * to the next-level page table, which will be the real
789 * initial page table pointed to by the VTTBR.
Marc Zyngiera9873702015-03-10 19:06:59 +0000790 */
Suzuki K Poulose120f0772016-03-01 10:03:06 +0000791 pgd = kvm_setup_fake_pgd(hwpgd);
792 if (IS_ERR(pgd)) {
793 kvm_free_hwpgd(hwpgd);
794 return PTR_ERR(pgd);
Christoffer Dall38f791a2014-10-10 12:14:28 +0200795 }
796
Marc Zyngierc62ee2b2012-10-15 11:27:37 +0100797 kvm_clean_pgd(pgd);
Christoffer Dalld5d81842013-01-20 18:28:07 -0500798 kvm->arch.pgd = pgd;
Christoffer Dalld5d81842013-01-20 18:28:07 -0500799 return 0;
800}
801
Christoffer Dall957db102014-11-27 10:35:03 +0100802static void stage2_unmap_memslot(struct kvm *kvm,
803 struct kvm_memory_slot *memslot)
804{
805 hva_t hva = memslot->userspace_addr;
806 phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
807 phys_addr_t size = PAGE_SIZE * memslot->npages;
808 hva_t reg_end = hva + size;
809
810 /*
811 * A memory region could potentially cover multiple VMAs, and any holes
812 * between them, so iterate over all of them to find out if we should
813 * unmap any of them.
814 *
815 * +--------------------------------------------+
816 * +---------------+----------------+ +----------------+
817 * | : VMA 1 | VMA 2 | | VMA 3 : |
818 * +---------------+----------------+ +----------------+
819 * | memory region |
820 * +--------------------------------------------+
821 */
822 do {
823 struct vm_area_struct *vma = find_vma(current->mm, hva);
824 hva_t vm_start, vm_end;
825
826 if (!vma || vma->vm_start >= reg_end)
827 break;
828
829 /*
830 * Take the intersection of this VMA with the memory region
831 */
832 vm_start = max(hva, vma->vm_start);
833 vm_end = min(reg_end, vma->vm_end);
834
835 if (!(vma->vm_flags & VM_PFNMAP)) {
836 gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
837 unmap_stage2_range(kvm, gpa, vm_end - vm_start);
838 }
839 hva = vm_end;
840 } while (hva < reg_end);
841}
842
843/**
844 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
845 * @kvm: The struct kvm pointer
846 *
847 * Go through the memory regions and unmap any regular RAM
848 * backing memory already mapped to the VM.
849 */
850void stage2_unmap_vm(struct kvm *kvm)
851{
852 struct kvm_memslots *slots;
853 struct kvm_memory_slot *memslot;
854 int idx;
855
856 idx = srcu_read_lock(&kvm->srcu);
857 spin_lock(&kvm->mmu_lock);
858
859 slots = kvm_memslots(kvm);
860 kvm_for_each_memslot(memslot, slots)
861 stage2_unmap_memslot(kvm, memslot);
862
863 spin_unlock(&kvm->mmu_lock);
864 srcu_read_unlock(&kvm->srcu, idx);
865}
866
Christoffer Dalld5d81842013-01-20 18:28:07 -0500867/**
868 * kvm_free_stage2_pgd - free all stage-2 tables
869 * @kvm: The KVM struct pointer for the VM.
870 *
871 * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all
872 * underlying level-2 and level-3 tables before freeing the actual level-1 table
873 * and setting the struct pointer to NULL.
874 *
875 * Note we don't need locking here as this is only called when the VM is
876 * destroyed, which can only be done once.
877 */
878void kvm_free_stage2_pgd(struct kvm *kvm)
879{
880 if (kvm->arch.pgd == NULL)
881 return;
882
883 unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
Marc Zyngiera9873702015-03-10 19:06:59 +0000884 kvm_free_hwpgd(kvm_get_hwpgd(kvm));
Suzuki K Poulose120f0772016-03-01 10:03:06 +0000885 kvm_free_fake_pgd(kvm->arch.pgd);
Christoffer Dalld5d81842013-01-20 18:28:07 -0500886 kvm->arch.pgd = NULL;
887}
888
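/*
 * stage2_get_pud()/stage2_get_pmd() walk the stage-2 tables for @addr,
 * allocating missing intermediate tables from @cache when one is supplied.
 * Without a cache they return NULL for unallocated ranges, which the
 * kvm_set_spte_hva path (see stage2_set_pte()) relies on to ignore such
 * addresses.
 */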
Christoffer Dall38f791a2014-10-10 12:14:28 +0200889static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
890 phys_addr_t addr)
891{
892 pgd_t *pgd;
893 pud_t *pud;
894
Suzuki K Poulose70fd1902016-03-22 18:33:45 +0000895 pgd = kvm->arch.pgd + stage2_pgd_index(addr);
896 if (WARN_ON(stage2_pgd_none(*pgd))) {
Christoffer Dall38f791a2014-10-10 12:14:28 +0200897 if (!cache)
898 return NULL;
899 pud = mmu_memory_cache_alloc(cache);
Suzuki K Poulose70fd1902016-03-22 18:33:45 +0000900 stage2_pgd_populate(pgd, pud);
Christoffer Dall38f791a2014-10-10 12:14:28 +0200901 get_page(virt_to_page(pgd));
902 }
903
Suzuki K Poulose70fd1902016-03-22 18:33:45 +0000904 return stage2_pud_offset(pgd, addr);
Christoffer Dall38f791a2014-10-10 12:14:28 +0200905}
906
Christoffer Dallad361f02012-11-01 17:14:45 +0100907static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
908 phys_addr_t addr)
Christoffer Dalld5d81842013-01-20 18:28:07 -0500909{
Christoffer Dalld5d81842013-01-20 18:28:07 -0500910 pud_t *pud;
911 pmd_t *pmd;
Christoffer Dalld5d81842013-01-20 18:28:07 -0500912
Christoffer Dall38f791a2014-10-10 12:14:28 +0200913 pud = stage2_get_pud(kvm, cache, addr);
Suzuki K Poulose70fd1902016-03-22 18:33:45 +0000914 if (stage2_pud_none(*pud)) {
Christoffer Dalld5d81842013-01-20 18:28:07 -0500915 if (!cache)
Christoffer Dallad361f02012-11-01 17:14:45 +0100916 return NULL;
Christoffer Dalld5d81842013-01-20 18:28:07 -0500917 pmd = mmu_memory_cache_alloc(cache);
Suzuki K Poulose70fd1902016-03-22 18:33:45 +0000918 stage2_pud_populate(pud, pmd);
Christoffer Dalld5d81842013-01-20 18:28:07 -0500919 get_page(virt_to_page(pud));
Marc Zyngierc62ee2b2012-10-15 11:27:37 +0100920 }
921
Suzuki K Poulose70fd1902016-03-22 18:33:45 +0000922 return stage2_pmd_offset(pud, addr);
Christoffer Dallad361f02012-11-01 17:14:45 +0100923}
Christoffer Dalld5d81842013-01-20 18:28:07 -0500924
Christoffer Dallad361f02012-11-01 17:14:45 +0100925static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
926 *cache, phys_addr_t addr, const pmd_t *new_pmd)
927{
928 pmd_t *pmd, old_pmd;
929
930 pmd = stage2_get_pmd(kvm, cache, addr);
931 VM_BUG_ON(!pmd);
932
933 /*
934 * Mapping in huge pages should only happen through a fault. If a
935 * page is merged into a transparent huge page, the individual
936 * subpages of that huge page should be unmapped through MMU
937 * notifiers before we get here.
938 *
939 * Merging of CompoundPages is not supported; such a page should be
940 * split first, unmapped, merged, and mapped back in on demand.
941 */
942 VM_BUG_ON(pmd_present(*pmd) && pmd_pfn(*pmd) != pmd_pfn(*new_pmd));
943
944 old_pmd = *pmd;
945 kvm_set_pmd(pmd, *new_pmd);
946 if (pmd_present(old_pmd))
947 kvm_tlb_flush_vmid_ipa(kvm, addr);
948 else
949 get_page(virt_to_page(pmd));
950 return 0;
951}
952
953static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
Mario Smarduch15a49a42015-01-15 15:58:58 -0800954 phys_addr_t addr, const pte_t *new_pte,
955 unsigned long flags)
Christoffer Dallad361f02012-11-01 17:14:45 +0100956{
957 pmd_t *pmd;
958 pte_t *pte, old_pte;
Mario Smarduch15a49a42015-01-15 15:58:58 -0800959 bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
960 bool logging_active = flags & KVM_S2_FLAG_LOGGING_ACTIVE;
961
962 VM_BUG_ON(logging_active && !cache);
Christoffer Dallad361f02012-11-01 17:14:45 +0100963
Christoffer Dall38f791a2014-10-10 12:14:28 +0200964 /* Create stage-2 page table mapping - Levels 0 and 1 */
Christoffer Dallad361f02012-11-01 17:14:45 +0100965 pmd = stage2_get_pmd(kvm, cache, addr);
966 if (!pmd) {
967 /*
968 * Ignore calls from kvm_set_spte_hva for unallocated
969 * address ranges.
970 */
971 return 0;
972 }
973
Mario Smarduch15a49a42015-01-15 15:58:58 -0800974 /*
975 * While dirty page logging - dissolve huge PMD, then continue on to
976 * allocate page.
977 */
978 if (logging_active)
979 stage2_dissolve_pmd(kvm, addr, pmd);
980
Christoffer Dallad361f02012-11-01 17:14:45 +0100981 /* Create stage-2 page mappings - Level 2 */
Christoffer Dalld5d81842013-01-20 18:28:07 -0500982 if (pmd_none(*pmd)) {
983 if (!cache)
984 return 0; /* ignore calls from kvm_set_spte_hva */
985 pte = mmu_memory_cache_alloc(cache);
Marc Zyngierc62ee2b2012-10-15 11:27:37 +0100986 kvm_clean_pte(pte);
Christoffer Dalld5d81842013-01-20 18:28:07 -0500987 pmd_populate_kernel(NULL, pmd, pte);
Christoffer Dalld5d81842013-01-20 18:28:07 -0500988 get_page(virt_to_page(pmd));
Marc Zyngierc62ee2b2012-10-15 11:27:37 +0100989 }
990
991 pte = pte_offset_kernel(pmd, addr);
Christoffer Dalld5d81842013-01-20 18:28:07 -0500992
993 if (iomap && pte_present(*pte))
994 return -EFAULT;
995
996 /* Create 2nd stage page table mapping - Level 3 */
997 old_pte = *pte;
998 kvm_set_pte(pte, *new_pte);
999 if (pte_present(old_pte))
Marc Zyngier48762762013-01-28 15:27:00 +00001000 kvm_tlb_flush_vmid_ipa(kvm, addr);
Christoffer Dalld5d81842013-01-20 18:28:07 -05001001 else
1002 get_page(virt_to_page(pte));
1003
1004 return 0;
1005}
1006
1007/**
1008 * kvm_phys_addr_ioremap - map a device range to guest IPA
1009 *
1010 * @kvm: The KVM pointer
1011 * @guest_ipa: The IPA at which to insert the mapping
1012 * @pa: The physical address of the device
1013 * @size: The size of the mapping
1014 */
1015int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
Ard Biesheuvelc40f2f82014-09-17 14:56:18 -07001016 phys_addr_t pa, unsigned long size, bool writable)
Christoffer Dalld5d81842013-01-20 18:28:07 -05001017{
1018 phys_addr_t addr, end;
1019 int ret = 0;
1020 unsigned long pfn;
1021 struct kvm_mmu_memory_cache cache = { 0, };
1022
1023 end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK;
1024 pfn = __phys_to_pfn(pa);
1025
1026 for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
Marc Zyngierc62ee2b2012-10-15 11:27:37 +01001027 pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);
Christoffer Dalld5d81842013-01-20 18:28:07 -05001028
Ard Biesheuvelc40f2f82014-09-17 14:56:18 -07001029 if (writable)
1030 kvm_set_s2pte_writable(&pte);
1031
Christoffer Dall38f791a2014-10-10 12:14:28 +02001032 ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES,
1033 KVM_NR_MEM_OBJS);
Christoffer Dalld5d81842013-01-20 18:28:07 -05001034 if (ret)
1035 goto out;
1036 spin_lock(&kvm->mmu_lock);
Mario Smarduch15a49a42015-01-15 15:58:58 -08001037 ret = stage2_set_pte(kvm, &cache, addr, &pte,
1038 KVM_S2PTE_FLAG_IS_IOMAP);
Christoffer Dalld5d81842013-01-20 18:28:07 -05001039 spin_unlock(&kvm->mmu_lock);
1040 if (ret)
1041 goto out;
1042
1043 pfn++;
1044 }
1045
1046out:
1047 mmu_free_memory_cache(&cache);
1048 return ret;
1049}
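/*
 * Illustrative usage sketch (not part of this file): the VGIC code uses this
 * helper to map the GIC virtual CPU interface into the guest's IPA space,
 * along the lines of
 *
 *	ret = kvm_phys_addr_ioremap(kvm, guest_cpu_if_ipa,
 *				    cpu_if_phys_addr, cpu_if_size, true);
 *
 * where the identifiers above are placeholders rather than the actual VGIC
 * field names.
 */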
1050
Dan Williamsba049e92016-01-15 16:56:11 -08001051static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
Christoffer Dall9b5fdb92013-10-02 15:32:01 -07001052{
Dan Williamsba049e92016-01-15 16:56:11 -08001053 kvm_pfn_t pfn = *pfnp;
Christoffer Dall9b5fdb92013-10-02 15:32:01 -07001054 gfn_t gfn = *ipap >> PAGE_SHIFT;
1055
1056 if (PageTransCompound(pfn_to_page(pfn))) {
1057 unsigned long mask;
1058 /*
1059 * The address we faulted on is backed by a transparent huge
1060 * page. However, because we map the compound huge page and
1061 * not the individual tail page, we need to transfer the
1062 * refcount to the head page. We have to be careful that the
1063 * THP doesn't start to split while we are adjusting the
1064 * refcounts.
1065 *
1066 * We are sure this doesn't happen, because mmu_notifier_retry
1067 * was successful and we are holding the mmu_lock, so if this
1068 * THP is trying to split, it will be blocked in the mmu
1069 * notifier before touching any of the pages, specifically
1070 * before being able to call __split_huge_page_refcount().
1071 *
1072 * We can therefore safely transfer the refcount from PG_tail
1073 * to PG_head and switch the pfn from a tail page to the head
1074 * page accordingly.
1075 */
1076 mask = PTRS_PER_PMD - 1;
1077 VM_BUG_ON((gfn & mask) != (pfn & mask));
1078 if (pfn & mask) {
1079 *ipap &= PMD_MASK;
1080 kvm_release_pfn_clean(pfn);
1081 pfn &= ~mask;
1082 kvm_get_pfn(pfn);
1083 *pfnp = pfn;
1084 }
1085
1086 return true;
1087 }
1088
1089 return false;
1090}
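/*
 * Worked example (not part of this file), assuming 4K pages and
 * PTRS_PER_PMD == 512 (mask == 0x1ff): a fault at IPA 0x40023000 backed by
 * tail pfn 0x89a23 of a THP is adjusted so that *ipap becomes 0x40000000
 * (the 2MB-aligned PMD base) and *pfnp becomes the head pfn 0x89a00, with
 * the page reference transferred from the tail to the head page.
 */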
1091
Ard Biesheuvela7d079c2014-09-09 11:27:09 +01001092static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
1093{
1094 if (kvm_vcpu_trap_is_iabt(vcpu))
1095 return false;
1096
1097 return kvm_vcpu_dabt_iswrite(vcpu);
1098}
1099
Mario Smarduchc6473552015-01-15 15:58:56 -08001100/**
1101 * stage2_wp_ptes - write protect PMD range
1102 * @pmd: pointer to pmd entry
1103 * @addr: range start address
1104 * @end: range end address
1105 */
1106static void stage2_wp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
1107{
1108 pte_t *pte;
1109
1110 pte = pte_offset_kernel(pmd, addr);
1111 do {
1112 if (!pte_none(*pte)) {
1113 if (!kvm_s2pte_readonly(pte))
1114 kvm_set_s2pte_readonly(pte);
1115 }
1116 } while (pte++, addr += PAGE_SIZE, addr != end);
1117}
1118
1119/**
1120 * stage2_wp_pmds - write protect PUD range
1121 * @pud: pointer to pud entry
1122 * @addr: range start address
1123 * @end: range end address
1124 */
1125static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
1126{
1127 pmd_t *pmd;
1128 phys_addr_t next;
1129
Suzuki K Poulose70fd1902016-03-22 18:33:45 +00001130 pmd = stage2_pmd_offset(pud, addr);
Mario Smarduchc6473552015-01-15 15:58:56 -08001131
1132 do {
Suzuki K Poulose70fd1902016-03-22 18:33:45 +00001133 next = stage2_pmd_addr_end(addr, end);
Mario Smarduchc6473552015-01-15 15:58:56 -08001134 if (!pmd_none(*pmd)) {
Suzuki K Poulosebbb3b6b2016-03-01 12:00:39 +00001135 if (pmd_thp_or_huge(*pmd)) {
Mario Smarduchc6473552015-01-15 15:58:56 -08001136 if (!kvm_s2pmd_readonly(pmd))
1137 kvm_set_s2pmd_readonly(pmd);
1138 } else {
1139 stage2_wp_ptes(pmd, addr, next);
1140 }
1141 }
1142 } while (pmd++, addr = next, addr != end);
1143}
1144
1145/**
1146 * stage2_wp_puds - write protect PGD range
1147 * @pgd: pointer to pgd entry
1148 * @addr: range start address
1149 * @end: range end address
1150 *
1151 * Process PUD entries. Huge PUDs are not supported, so hitting one is a BUG().
1152 */
1153static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
1154{
1155 pud_t *pud;
1156 phys_addr_t next;
1157
Suzuki K Poulose70fd1902016-03-22 18:33:45 +00001158 pud = stage2_pud_offset(pgd, addr);
Mario Smarduchc6473552015-01-15 15:58:56 -08001159 do {
Suzuki K Poulose70fd1902016-03-22 18:33:45 +00001160 next = stage2_pud_addr_end(addr, end);
1161 if (!stage2_pud_none(*pud)) {
Mario Smarduchc6473552015-01-15 15:58:56 -08001162 /* TODO:PUD not supported, revisit later if supported */
Suzuki K Poulose70fd1902016-03-22 18:33:45 +00001163 BUG_ON(stage2_pud_huge(*pud));
Mario Smarduchc6473552015-01-15 15:58:56 -08001164 stage2_wp_pmds(pud, addr, next);
1165 }
1166 } while (pud++, addr = next, addr != end);
1167}
1168
1169/**
1170 * stage2_wp_range() - write protect stage2 memory region range
1171 * @kvm: The KVM pointer
1172 * @addr: Start address of range
1173 * @end: End address of range
1174 */
1175static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
1176{
1177 pgd_t *pgd;
1178 phys_addr_t next;
1179
Suzuki K Poulose70fd1902016-03-22 18:33:45 +00001180 pgd = kvm->arch.pgd + stage2_pgd_index(addr);
Mario Smarduchc6473552015-01-15 15:58:56 -08001181 do {
1182 /*
1183 * Release kvm_mmu_lock periodically if the memory region is
1184 * large. Otherwise, we may see kernel panics with
Christoffer Dall227ea812015-01-23 10:49:31 +01001185 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR,
1186 * CONFIG_LOCKDEP. Additionally, holding the lock too long
Mario Smarduchc6473552015-01-15 15:58:56 -08001187 * will also starve other vCPUs.
1188 */
1189 if (need_resched() || spin_needbreak(&kvm->mmu_lock))
1190 cond_resched_lock(&kvm->mmu_lock);
1191
Suzuki K Poulose70fd1902016-03-22 18:33:45 +00001192 next = stage2_pgd_addr_end(addr, end);
1193 if (stage2_pgd_present(*pgd))
Mario Smarduchc6473552015-01-15 15:58:56 -08001194 stage2_wp_puds(pgd, addr, next);
1195 } while (pgd++, addr = next, addr != end);
1196}
1197
1198/**
1199 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
1200 * @kvm: The KVM pointer
1201 * @slot: The memory slot to write protect
1202 *
1203 * Called to start logging dirty pages after the KVM_MEM_LOG_DIRTY_PAGES
1204 * flag is set on the memory region. After this function returns, all
1205 * present PMDs and PTEs in the memory region are write protected.
1206 * Afterwards, the dirty page log can be read.
1207 *
1208 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
1209 * serializing operations for VM memory regions.
1210 */
1211void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
1212{
Paolo Bonzini9f6b8022015-05-17 16:20:07 +02001213 struct kvm_memslots *slots = kvm_memslots(kvm);
1214 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
Mario Smarduchc6473552015-01-15 15:58:56 -08001215 phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
1216 phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
1217
1218 spin_lock(&kvm->mmu_lock);
1219 stage2_wp_range(kvm, start, end);
1220 spin_unlock(&kvm->mmu_lock);
1221 kvm_flush_remote_tlbs(kvm);
1222}
Mario Smarduch53c810c2015-01-15 15:58:57 -08001223
1224/**
Kai Huang3b0f1d02015-01-28 10:54:23 +08001225 * kvm_mmu_write_protect_pt_masked() - write protect dirty pages
Mario Smarduch53c810c2015-01-15 15:58:57 -08001226 * @kvm: The KVM pointer
1227 * @slot: The memory slot associated with mask
1228 * @gfn_offset: The gfn offset in memory slot
1229 * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory
1230 * slot to be write protected
1231 *
1232 * Walks the bits set in @mask and write protects the associated PTEs. The
1233 * caller must hold kvm_mmu_lock.
1234 */
Kai Huang3b0f1d02015-01-28 10:54:23 +08001235static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
Mario Smarduch53c810c2015-01-15 15:58:57 -08001236 struct kvm_memory_slot *slot,
1237 gfn_t gfn_offset, unsigned long mask)
1238{
1239 phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
1240 phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT;
1241 phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
1242
1243 stage2_wp_range(kvm, start, end);
1244}
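/*
 * Worked example (not part of this file): with gfn_offset == 64 and
 * mask == 0x0f, __ffs(mask) == 0 and __fls(mask) == 3, so guest frames
 * slot->base_gfn + 64 up to and including slot->base_gfn + 67 are write
 * protected, i.e. exactly the four pages whose bits were set in @mask.
 */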
Mario Smarduchc6473552015-01-15 15:58:56 -08001245
Kai Huang3b0f1d02015-01-28 10:54:23 +08001246/*
1247 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1248 * dirty pages.
1249 *
1250 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1251 * enable dirty logging for them.
1252 */
1253void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1254 struct kvm_memory_slot *slot,
1255 gfn_t gfn_offset, unsigned long mask)
1256{
1257 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1258}
1259
Dan Williamsba049e92016-01-15 16:56:11 -08001260static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn,
Marc Zyngier0d3e4d42015-01-05 21:13:24 +00001261 unsigned long size, bool uncached)
1262{
1263 __coherent_cache_guest_page(vcpu, pfn, size, uncached);
1264}
1265
Christoffer Dall94f8e642013-01-20 18:28:12 -05001266static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
Christoffer Dall98047882014-08-19 12:18:04 +02001267 struct kvm_memory_slot *memslot, unsigned long hva,
Christoffer Dall94f8e642013-01-20 18:28:12 -05001268 unsigned long fault_status)
1269{
Christoffer Dall94f8e642013-01-20 18:28:12 -05001270 int ret;
Christoffer Dall9b5fdb92013-10-02 15:32:01 -07001271 bool write_fault, writable, hugetlb = false, force_pte = false;
Christoffer Dall94f8e642013-01-20 18:28:12 -05001272 unsigned long mmu_seq;
Christoffer Dallad361f02012-11-01 17:14:45 +01001273 gfn_t gfn = fault_ipa >> PAGE_SHIFT;
Christoffer Dallad361f02012-11-01 17:14:45 +01001274 struct kvm *kvm = vcpu->kvm;
Christoffer Dall94f8e642013-01-20 18:28:12 -05001275 struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
Christoffer Dallad361f02012-11-01 17:14:45 +01001276 struct vm_area_struct *vma;
Dan Williamsba049e92016-01-15 16:56:11 -08001277 kvm_pfn_t pfn;
Kim Phillipsb8865762014-06-26 01:45:51 +01001278 pgprot_t mem_type = PAGE_S2;
Laszlo Ersek840f4bf2014-11-17 14:58:52 +00001279 bool fault_ipa_uncached;
Mario Smarduch15a49a42015-01-15 15:58:58 -08001280 bool logging_active = memslot_is_logging(memslot);
1281 unsigned long flags = 0;
Christoffer Dall94f8e642013-01-20 18:28:12 -05001282
Ard Biesheuvela7d079c2014-09-09 11:27:09 +01001283 write_fault = kvm_is_write_fault(vcpu);
Christoffer Dall94f8e642013-01-20 18:28:12 -05001284 if (fault_status == FSC_PERM && !write_fault) {
1285 kvm_err("Unexpected L2 read permission error\n");
1286 return -EFAULT;
1287 }
1288
Christoffer Dallad361f02012-11-01 17:14:45 +01001289 /* Let's check if we will get back a huge page backed by hugetlbfs */
1290 down_read(&current->mm->mmap_sem);
1291 vma = find_vma_intersection(current->mm, hva, hva + 1);
Ard Biesheuvel37b54402014-09-17 14:56:17 -07001292 if (unlikely(!vma)) {
1293 kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
1294 up_read(&current->mm->mmap_sem);
1295 return -EFAULT;
1296 }
1297
Mario Smarduch15a49a42015-01-15 15:58:58 -08001298 if (is_vm_hugetlb_page(vma) && !logging_active) {
Christoffer Dallad361f02012-11-01 17:14:45 +01001299 hugetlb = true;
1300 gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
Christoffer Dall9b5fdb92013-10-02 15:32:01 -07001301 } else {
1302 /*
Marc Zyngier136d7372013-12-13 16:56:06 +00001303 * Pages belonging to memslots that don't have the same
1304 * alignment for userspace and IPA cannot be mapped using
1305 * block descriptors even if the pages belong to a THP for
1306 * the process, because the stage-2 block descriptor will
1307 * cover more than a single THP and we lose atomicity for
1308 * unmapping, updates, and splits of the THP or other pages
1309 * in the stage-2 block range.
Christoffer Dall9b5fdb92013-10-02 15:32:01 -07001310 */
Marc Zyngier136d7372013-12-13 16:56:06 +00001311 if ((memslot->userspace_addr & ~PMD_MASK) !=
1312 ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK))
Christoffer Dall9b5fdb92013-10-02 15:32:01 -07001313 force_pte = true;
Christoffer Dallad361f02012-11-01 17:14:45 +01001314 }
1315 up_read(&current->mm->mmap_sem);
1316
Christoffer Dall94f8e642013-01-20 18:28:12 -05001317 /* We need minimum second+third level pages */
Christoffer Dall38f791a2014-10-10 12:14:28 +02001318 ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES,
1319 KVM_NR_MEM_OBJS);
Christoffer Dall94f8e642013-01-20 18:28:12 -05001320 if (ret)
1321 return ret;
1322
1323 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1324 /*
1325 * Ensure the read of mmu_notifier_seq happens before we call
1326 * gfn_to_pfn_prot (which calls get_user_pages), so that we don't risk
1327 * the page we just got a reference to gets unmapped before we have a
1328 * chance to grab the mmu_lock, which ensure that if the page gets
1329 * unmapped afterwards, the call to kvm_unmap_hva will take it away
1330 * from us again properly. This smp_rmb() interacts with the smp_wmb()
1331 * in kvm_mmu_notifier_invalidate_<page|range_end>.
1332 */
1333 smp_rmb();
1334
Christoffer Dallad361f02012-11-01 17:14:45 +01001335 pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable);
Christoffer Dall94f8e642013-01-20 18:28:12 -05001336 if (is_error_pfn(pfn))
1337 return -EFAULT;
1338
Mario Smarduch15a49a42015-01-15 15:58:58 -08001339 if (kvm_is_device_pfn(pfn)) {
Kim Phillipsb8865762014-06-26 01:45:51 +01001340 mem_type = PAGE_S2_DEVICE;
Mario Smarduch15a49a42015-01-15 15:58:58 -08001341 flags |= KVM_S2PTE_FLAG_IS_IOMAP;
1342 } else if (logging_active) {
1343 /*
1344 * Faults on pages in a memslot with logging enabled
1345 * should not be mapped with huge pages (it introduces churn
1346 * and performance degradation), so force a pte mapping.
1347 */
1348 force_pte = true;
1349 flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
1350
1351 /*
1352 * Only actually map the page as writable if this was a write
1353 * fault.
1354 */
1355 if (!write_fault)
1356 writable = false;
1357 }
Kim Phillipsb8865762014-06-26 01:45:51 +01001358
Christoffer Dallad361f02012-11-01 17:14:45 +01001359 spin_lock(&kvm->mmu_lock);
1360 if (mmu_notifier_retry(kvm, mmu_seq))
Christoffer Dall94f8e642013-01-20 18:28:12 -05001361 goto out_unlock;
Mario Smarduch15a49a42015-01-15 15:58:58 -08001362
Christoffer Dall9b5fdb92013-10-02 15:32:01 -07001363 if (!hugetlb && !force_pte)
1364 hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
Christoffer Dallad361f02012-11-01 17:14:45 +01001365
Ard Biesheuvel849260c2014-11-17 14:58:53 +00001366 fault_ipa_uncached = memslot->flags & KVM_MEMSLOT_INCOHERENT;
Laszlo Ersek840f4bf2014-11-17 14:58:52 +00001367
Christoffer Dallad361f02012-11-01 17:14:45 +01001368 if (hugetlb) {
Kim Phillipsb8865762014-06-26 01:45:51 +01001369 pmd_t new_pmd = pfn_pmd(pfn, mem_type);
Christoffer Dallad361f02012-11-01 17:14:45 +01001370 new_pmd = pmd_mkhuge(new_pmd);
1371 if (writable) {
1372 kvm_set_s2pmd_writable(&new_pmd);
1373 kvm_set_pfn_dirty(pfn);
1374 }
Marc Zyngier0d3e4d42015-01-05 21:13:24 +00001375 coherent_cache_guest_page(vcpu, pfn, PMD_SIZE, fault_ipa_uncached);
Christoffer Dallad361f02012-11-01 17:14:45 +01001376 ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
1377 } else {
Kim Phillipsb8865762014-06-26 01:45:51 +01001378 pte_t new_pte = pfn_pte(pfn, mem_type);
Mario Smarduch15a49a42015-01-15 15:58:58 -08001379
Christoffer Dallad361f02012-11-01 17:14:45 +01001380 if (writable) {
1381 kvm_set_s2pte_writable(&new_pte);
1382 kvm_set_pfn_dirty(pfn);
Mario Smarduch15a49a42015-01-15 15:58:58 -08001383 mark_page_dirty(kvm, gfn);
Christoffer Dallad361f02012-11-01 17:14:45 +01001384 }
Marc Zyngier0d3e4d42015-01-05 21:13:24 +00001385 coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE, fault_ipa_uncached);
Mario Smarduch15a49a42015-01-15 15:58:58 -08001386 ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
Christoffer Dall94f8e642013-01-20 18:28:12 -05001387 }
Christoffer Dallad361f02012-11-01 17:14:45 +01001388
Christoffer Dall94f8e642013-01-20 18:28:12 -05001389out_unlock:
Christoffer Dallad361f02012-11-01 17:14:45 +01001390 spin_unlock(&kvm->mmu_lock);
Marc Zyngier35307b92015-03-12 18:16:51 +00001391 kvm_set_pfn_accessed(pfn);
Christoffer Dall94f8e642013-01-20 18:28:12 -05001392 kvm_release_pfn_clean(pfn);
Christoffer Dallad361f02012-11-01 17:14:45 +01001393 return ret;
Christoffer Dall94f8e642013-01-20 18:28:12 -05001394}
1395
Marc Zyngieraeda9132015-03-12 18:16:52 +00001396/*
1397 * Resolve the access fault by making the page young again.
1398 * Note that because the faulting entry is guaranteed not to be
1399 * cached in the TLB, we don't need to invalidate anything.
1400 */
1401static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
1402{
1403 pmd_t *pmd;
1404 pte_t *pte;
Dan Williamsba049e92016-01-15 16:56:11 -08001405 kvm_pfn_t pfn;
Marc Zyngieraeda9132015-03-12 18:16:52 +00001406 bool pfn_valid = false;
1407
1408 trace_kvm_access_fault(fault_ipa);
1409
1410 spin_lock(&vcpu->kvm->mmu_lock);
1411
1412 pmd = stage2_get_pmd(vcpu->kvm, NULL, fault_ipa);
1413 if (!pmd || pmd_none(*pmd)) /* Nothing there */
1414 goto out;
1415
Suzuki K Poulosebbb3b6b2016-03-01 12:00:39 +00001416 if (pmd_thp_or_huge(*pmd)) { /* THP, HugeTLB */
Marc Zyngieraeda9132015-03-12 18:16:52 +00001417 *pmd = pmd_mkyoung(*pmd);
1418 pfn = pmd_pfn(*pmd);
1419 pfn_valid = true;
1420 goto out;
1421 }
1422
1423 pte = pte_offset_kernel(pmd, fault_ipa);
1424 if (pte_none(*pte)) /* Nothing there either */
1425 goto out;
1426
1427 *pte = pte_mkyoung(*pte); /* Just a page... */
1428 pfn = pte_pfn(*pte);
1429 pfn_valid = true;
1430out:
1431 spin_unlock(&vcpu->kvm->mmu_lock);
1432 if (pfn_valid)
1433 kvm_set_pfn_accessed(pfn);
1434}
1435
Christoffer Dall94f8e642013-01-20 18:28:12 -05001436/**
1437 * kvm_handle_guest_abort - handles all 2nd stage aborts
1438 * @vcpu: the VCPU pointer
1439 * @run: the kvm_run structure
1440 *
1441 * Any abort that gets to the host is almost guaranteed to be caused by a
1442 * missing second stage translation table entry, which means that either the
1443 * guest simply needs more memory and we must allocate an appropriate page, or
1444 * the guest tried to access I/O memory, which is emulated by user
1445 * space. The distinction is based on the IPA causing the fault and whether this
1446 * memory region has been registered as standard RAM by user space.
1447 */
Christoffer Dall342cd0a2013-01-20 18:28:06 -05001448int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
1449{
Christoffer Dall94f8e642013-01-20 18:28:12 -05001450 unsigned long fault_status;
1451 phys_addr_t fault_ipa;
1452 struct kvm_memory_slot *memslot;
Christoffer Dall98047882014-08-19 12:18:04 +02001453 unsigned long hva;
1454 bool is_iabt, write_fault, writable;
Christoffer Dall94f8e642013-01-20 18:28:12 -05001455 gfn_t gfn;
1456 int ret, idx;
1457
Marc Zyngier52d1dba2012-10-15 10:33:38 +01001458 is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
Marc Zyngier7393b592012-09-17 19:27:09 +01001459 fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
Christoffer Dall94f8e642013-01-20 18:28:12 -05001460
Marc Zyngier7393b592012-09-17 19:27:09 +01001461 trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_hsr(vcpu),
1462 kvm_vcpu_get_hfar(vcpu), fault_ipa);
Christoffer Dall94f8e642013-01-20 18:28:12 -05001463
1464	/* Check that the stage-2 fault is a translation, permission or access fault */
Christoffer Dall0496daa52014-09-26 12:29:34 +02001465 fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
Marc Zyngier35307b92015-03-12 18:16:51 +00001466 if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
1467 fault_status != FSC_ACCESS) {
Christoffer Dall0496daa52014-09-26 12:29:34 +02001468 kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
1469 kvm_vcpu_trap_get_class(vcpu),
1470 (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
1471 (unsigned long)kvm_vcpu_get_hsr(vcpu));
Christoffer Dall94f8e642013-01-20 18:28:12 -05001472 return -EFAULT;
1473 }
1474
1475 idx = srcu_read_lock(&vcpu->kvm->srcu);
1476
1477 gfn = fault_ipa >> PAGE_SHIFT;
Christoffer Dall98047882014-08-19 12:18:04 +02001478 memslot = gfn_to_memslot(vcpu->kvm, gfn);
1479 hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
Ard Biesheuvela7d079c2014-09-09 11:27:09 +01001480 write_fault = kvm_is_write_fault(vcpu);
Christoffer Dall98047882014-08-19 12:18:04 +02001481 if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
Christoffer Dall94f8e642013-01-20 18:28:12 -05001482 if (is_iabt) {
1483 /* Prefetch Abort on I/O address */
Marc Zyngier7393b592012-09-17 19:27:09 +01001484 kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
Christoffer Dall94f8e642013-01-20 18:28:12 -05001485 ret = 1;
1486 goto out_unlock;
1487 }
1488
Marc Zyngiercfe39502012-12-12 14:42:09 +00001489 /*
Marc Zyngier57c841f2016-01-29 15:01:28 +00001490 * Check for a cache maintenance operation. Since we
1491	 * ended up here, we know it is outside of any memory
1492 * slot. But we can't find out if that is for a device,
1493 * or if the guest is just being stupid. The only thing
1494 * we know for sure is that this range cannot be cached.
1495 *
1496 * So let's assume that the guest is just being
1497 * cautious, and skip the instruction.
1498 */
1499 if (kvm_vcpu_dabt_is_cm(vcpu)) {
1500 kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
1501 ret = 1;
1502 goto out_unlock;
1503 }
1504
1505 /*
Marc Zyngiercfe39502012-12-12 14:42:09 +00001506 * The IPA is reported as [MAX:12], so we need to
1507 * complement it with the bottom 12 bits from the
1508 * faulting VA. This is always 12 bits, irrespective
1509 * of the page size.
1510 */
1511 fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
Christoffer Dall45e96ea2013-01-20 18:43:58 -05001512 ret = io_mem_abort(vcpu, run, fault_ipa);
Christoffer Dall94f8e642013-01-20 18:28:12 -05001513 goto out_unlock;
1514 }
1515
Christoffer Dallc3058d52014-10-10 12:14:29 +02001516 /* Userspace should not be able to register out-of-bounds IPAs */
1517 VM_BUG_ON(fault_ipa >= KVM_PHYS_SIZE);
1518
Marc Zyngieraeda9132015-03-12 18:16:52 +00001519 if (fault_status == FSC_ACCESS) {
1520 handle_access_fault(vcpu, fault_ipa);
1521 ret = 1;
1522 goto out_unlock;
1523 }
1524
Christoffer Dall98047882014-08-19 12:18:04 +02001525 ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
Christoffer Dall94f8e642013-01-20 18:28:12 -05001526 if (ret == 0)
1527 ret = 1;
1528out_unlock:
1529 srcu_read_unlock(&vcpu->kvm->srcu, idx);
1530 return ret;
Christoffer Dall342cd0a2013-01-20 18:28:06 -05001531}
1532
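/*
 * Walk every memslot that intersects the HVA range [start, end) and invoke
 * @handler on each guest physical page in the intersection. The return
 * values of the individual handler calls are OR'ed together.
 */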
Marc Zyngier1d2ebac2015-03-12 18:16:50 +00001533static int handle_hva_to_gpa(struct kvm *kvm,
1534 unsigned long start,
1535 unsigned long end,
1536 int (*handler)(struct kvm *kvm,
1537 gpa_t gpa, void *data),
1538 void *data)
Christoffer Dalld5d81842013-01-20 18:28:07 -05001539{
1540 struct kvm_memslots *slots;
1541 struct kvm_memory_slot *memslot;
Marc Zyngier1d2ebac2015-03-12 18:16:50 +00001542 int ret = 0;
Christoffer Dalld5d81842013-01-20 18:28:07 -05001543
1544 slots = kvm_memslots(kvm);
1545
1546 /* we only care about the pages that the guest sees */
1547 kvm_for_each_memslot(memslot, slots) {
1548 unsigned long hva_start, hva_end;
1549 gfn_t gfn, gfn_end;
1550
1551 hva_start = max(start, memslot->userspace_addr);
1552 hva_end = min(end, memslot->userspace_addr +
1553 (memslot->npages << PAGE_SHIFT));
1554 if (hva_start >= hva_end)
1555 continue;
1556
1557 /*
1558 * {gfn(page) | page intersects with [hva_start, hva_end)} =
1559 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
1560 */
1561 gfn = hva_to_gfn_memslot(hva_start, memslot);
1562 gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
1563
1564 for (; gfn < gfn_end; ++gfn) {
1565 gpa_t gpa = gfn << PAGE_SHIFT;
Marc Zyngier1d2ebac2015-03-12 18:16:50 +00001566 ret |= handler(kvm, gpa, data);
Christoffer Dalld5d81842013-01-20 18:28:07 -05001567 }
1568 }
Marc Zyngier1d2ebac2015-03-12 18:16:50 +00001569
1570 return ret;
Christoffer Dalld5d81842013-01-20 18:28:07 -05001571}
1572
Marc Zyngier1d2ebac2015-03-12 18:16:50 +00001573static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
Christoffer Dalld5d81842013-01-20 18:28:07 -05001574{
1575 unmap_stage2_range(kvm, gpa, PAGE_SIZE);
Marc Zyngier1d2ebac2015-03-12 18:16:50 +00001576 return 0;
Christoffer Dalld5d81842013-01-20 18:28:07 -05001577}
1578
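/*
 * MMU notifier hook: the host page backing @hva is going away, so tear down
 * the corresponding stage-2 mapping (if a stage-2 page table exists at all).
 */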
1579int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
1580{
1581 unsigned long end = hva + PAGE_SIZE;
1582
1583 if (!kvm->arch.pgd)
1584 return 0;
1585
1586 trace_kvm_unmap_hva(hva);
1587 handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL);
1588 return 0;
1589}
1590
1591int kvm_unmap_hva_range(struct kvm *kvm,
1592 unsigned long start, unsigned long end)
1593{
1594 if (!kvm->arch.pgd)
1595 return 0;
1596
1597 trace_kvm_unmap_hva_range(start, end);
1598 handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
1599 return 0;
1600}
1601
Marc Zyngier1d2ebac2015-03-12 18:16:50 +00001602static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data)
Christoffer Dalld5d81842013-01-20 18:28:07 -05001603{
1604 pte_t *pte = (pte_t *)data;
1605
Mario Smarduch15a49a42015-01-15 15:58:58 -08001606 /*
1607	 * We can always call stage2_set_pte with the KVM_S2_FLAG_LOGGING_ACTIVE
1608 * flag clear because MMU notifiers will have unmapped a huge PMD before
1609 * calling ->change_pte() (which in turn calls kvm_set_spte_hva()) and
1610 * therefore stage2_set_pte() never needs to clear out a huge PMD
1611 * through this calling path.
1612 */
1613 stage2_set_pte(kvm, NULL, gpa, pte, 0);
Marc Zyngier1d2ebac2015-03-12 18:16:50 +00001614 return 0;
Christoffer Dalld5d81842013-01-20 18:28:07 -05001615}
1616
1617
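/*
 * MMU notifier hook: the host PTE backing @hva has changed (e.g. after COW),
 * so install a stage-2 PTE pointing at the new page.
 */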
1618void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1619{
1620 unsigned long end = hva + PAGE_SIZE;
1621 pte_t stage2_pte;
1622
1623 if (!kvm->arch.pgd)
1624 return;
1625
1626 trace_kvm_set_spte_hva(hva);
1627 stage2_pte = pfn_pte(pte_pfn(pte), PAGE_S2);
1628 handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
1629}
1630
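/*
 * Clear the accessed ("young") state of the stage-2 entry mapping @gpa and
 * return 1 if it was young, 0 otherwise.
 */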
Marc Zyngier35307b92015-03-12 18:16:51 +00001631static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
1632{
1633 pmd_t *pmd;
1634 pte_t *pte;
1635
1636 pmd = stage2_get_pmd(kvm, NULL, gpa);
1637 if (!pmd || pmd_none(*pmd)) /* Nothing there */
1638 return 0;
1639
Suzuki K Poulosebbb3b6b2016-03-01 12:00:39 +00001640 if (pmd_thp_or_huge(*pmd)) { /* THP, HugeTLB */
Marc Zyngier35307b92015-03-12 18:16:51 +00001641 if (pmd_young(*pmd)) {
1642 *pmd = pmd_mkold(*pmd);
1643 return 1;
1644 }
1645
1646 return 0;
1647 }
1648
1649 pte = pte_offset_kernel(pmd, gpa);
1650 if (pte_none(*pte))
1651 return 0;
1652
1653 if (pte_young(*pte)) {
1654 *pte = pte_mkold(*pte); /* Just a page... */
1655 return 1;
1656 }
1657
1658 return 0;
1659}
1660
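/* As kvm_age_hva_handler(), but only test the young state, don't clear it. */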
1661static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
1662{
1663 pmd_t *pmd;
1664 pte_t *pte;
1665
1666 pmd = stage2_get_pmd(kvm, NULL, gpa);
1667 if (!pmd || pmd_none(*pmd)) /* Nothing there */
1668 return 0;
1669
Suzuki K Poulosebbb3b6b2016-03-01 12:00:39 +00001670 if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */
Marc Zyngier35307b92015-03-12 18:16:51 +00001671 return pmd_young(*pmd);
1672
1673 pte = pte_offset_kernel(pmd, gpa);
1674 if (!pte_none(*pte)) /* Just a page... */
1675 return pte_young(*pte);
1676
1677 return 0;
1678}
1679
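/*
 * MMU notifier age hooks: kvm_age_hva() tests and clears the young state
 * over an HVA range, kvm_test_age_hva() only tests it for a single page.
 */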
1680int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
1681{
1682 trace_kvm_age_hva(start, end);
1683 return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
1684}
1685
1686int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
1687{
1688 trace_kvm_test_age_hva(hva);
1689	return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE, kvm_test_age_hva_handler, NULL);
1690}
1691
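/* Free any pages still held in the vcpu's stage-2 page table cache. */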
Christoffer Dalld5d81842013-01-20 18:28:07 -05001692void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
1693{
1694 mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
1695}
1696
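/*
 * Return the physical address of the page tables HTTBR should point to:
 * the merged idmap/runtime PGD when the extended idmap is in use, the
 * regular HYP PGD otherwise.
 */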
Christoffer Dall342cd0a2013-01-20 18:28:06 -05001697phys_addr_t kvm_mmu_get_httbr(void)
1698{
Ard Biesheuvele4c5a682015-03-19 16:42:28 +00001699 if (__kvm_cpu_uses_extended_idmap())
1700 return virt_to_phys(merged_hyp_pgd);
1701 else
1702 return virt_to_phys(hyp_pgd);
Christoffer Dall342cd0a2013-01-20 18:28:06 -05001703}
1704
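/*
 * As kvm_mmu_get_httbr(), but return the boot page tables used while the
 * HYP trampoline is being installed.
 */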
Marc Zyngier5a677ce2013-04-12 19:12:06 +01001705phys_addr_t kvm_mmu_get_boot_httbr(void)
1706{
Ard Biesheuvele4c5a682015-03-19 16:42:28 +00001707 if (__kvm_cpu_uses_extended_idmap())
1708 return virt_to_phys(merged_hyp_pgd);
1709 else
1710 return virt_to_phys(boot_hyp_pgd);
Marc Zyngier5a677ce2013-04-12 19:12:06 +01001711}
1712
1713phys_addr_t kvm_get_idmap_vector(void)
1714{
1715 return hyp_idmap_vector;
1716}
1717
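/**
 * kvm_mmu_init - allocate and initialise the HYP page tables
 *
 * Creates the idmap of the HYP init code in the boot page tables and maps
 * the trampoline page into both the boot and runtime tables (or builds a
 * merged PGD when the extended idmap is required). Frees everything on
 * failure.
 */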
Christoffer Dall342cd0a2013-01-20 18:28:06 -05001718int kvm_mmu_init(void)
1719{
Marc Zyngier2fb41052013-04-12 19:12:03 +01001720 int err;
1721
Santosh Shilimkar4fda3422013-11-19 14:59:12 -05001722 hyp_idmap_start = kvm_virt_to_phys(__hyp_idmap_text_start);
1723 hyp_idmap_end = kvm_virt_to_phys(__hyp_idmap_text_end);
1724 hyp_idmap_vector = kvm_virt_to_phys(__kvm_hyp_init);
Marc Zyngier5a677ce2013-04-12 19:12:06 +01001725
Ard Biesheuvel06f75a12015-03-19 16:42:26 +00001726 /*
1727 * We rely on the linker script to ensure at build time that the HYP
1728 * init code does not cross a page boundary.
1729 */
1730 BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
Marc Zyngier5a677ce2013-04-12 19:12:06 +01001731
Christoffer Dall38f791a2014-10-10 12:14:28 +02001732 hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
1733 boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
Mark Salter5d4e08c2014-03-28 14:25:19 +00001734
Marc Zyngier5a677ce2013-04-12 19:12:06 +01001735 if (!hyp_pgd || !boot_hyp_pgd) {
Christoffer Dalld5d81842013-01-20 18:28:07 -05001736 kvm_err("Hyp mode PGD not allocated\n");
Marc Zyngier2fb41052013-04-12 19:12:03 +01001737 err = -ENOMEM;
1738 goto out;
1739 }
1740
1741 /* Create the idmap in the boot page tables */
1742 err = __create_hyp_mappings(boot_hyp_pgd,
1743 hyp_idmap_start, hyp_idmap_end,
1744 __phys_to_pfn(hyp_idmap_start),
1745 PAGE_HYP);
1746
1747 if (err) {
1748 kvm_err("Failed to idmap %lx-%lx\n",
1749 hyp_idmap_start, hyp_idmap_end);
1750 goto out;
Christoffer Dalld5d81842013-01-20 18:28:07 -05001751 }
1752
Ard Biesheuvele4c5a682015-03-19 16:42:28 +00001753 if (__kvm_cpu_uses_extended_idmap()) {
1754 merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1755 if (!merged_hyp_pgd) {
1756			kvm_err("Failed to allocate extra HYP pgd\n");
			err = -ENOMEM;
1757			goto out;
1758 }
1759 __kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
1760 hyp_idmap_start);
1761 return 0;
1762 }
1763
Marc Zyngier5a677ce2013-04-12 19:12:06 +01001764 /* Map the very same page at the trampoline VA */
1765 err = __create_hyp_mappings(boot_hyp_pgd,
1766 TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
1767 __phys_to_pfn(hyp_idmap_start),
1768 PAGE_HYP);
1769 if (err) {
1770 kvm_err("Failed to map trampoline @%lx into boot HYP pgd\n",
1771 TRAMPOLINE_VA);
1772 goto out;
1773 }
1774
1775 /* Map the same page again into the runtime page tables */
1776 err = __create_hyp_mappings(hyp_pgd,
1777 TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
1778 __phys_to_pfn(hyp_idmap_start),
1779 PAGE_HYP);
1780 if (err) {
1781 kvm_err("Failed to map trampoline @%lx into runtime HYP pgd\n",
1782 TRAMPOLINE_VA);
1783 goto out;
1784 }
1785
Christoffer Dalld5d81842013-01-20 18:28:07 -05001786 return 0;
Marc Zyngier2fb41052013-04-12 19:12:03 +01001787out:
Marc Zyngier4f728272013-04-12 19:12:05 +01001788 free_hyp_pgds();
Marc Zyngier2fb41052013-04-12 19:12:03 +01001789 return err;
Christoffer Dall342cd0a2013-01-20 18:28:06 -05001790}
Eric Augerdf6ce242014-06-06 11:10:23 +02001791
1792void kvm_arch_commit_memory_region(struct kvm *kvm,
Paolo Bonzini09170a42015-05-18 13:59:39 +02001793 const struct kvm_userspace_memory_region *mem,
Eric Augerdf6ce242014-06-06 11:10:23 +02001794 const struct kvm_memory_slot *old,
Paolo Bonzinif36f3f22015-05-18 13:20:23 +02001795 const struct kvm_memory_slot *new,
Eric Augerdf6ce242014-06-06 11:10:23 +02001796 enum kvm_mr_change change)
1797{
Mario Smarduchc6473552015-01-15 15:58:56 -08001798 /*
1799 * At this point memslot has been committed and there is an
1800	 * allocated dirty_bitmap[]; dirty pages will be tracked while the
1801 * memory slot is write protected.
1802 */
1803 if (change != KVM_MR_DELETE && mem->flags & KVM_MEM_LOG_DIRTY_PAGES)
1804 kvm_mmu_wp_memory_region(kvm, mem->slot);
Eric Augerdf6ce242014-06-06 11:10:23 +02001805}
1806
1807int kvm_arch_prepare_memory_region(struct kvm *kvm,
1808 struct kvm_memory_slot *memslot,
Paolo Bonzini09170a42015-05-18 13:59:39 +02001809 const struct kvm_userspace_memory_region *mem,
Eric Augerdf6ce242014-06-06 11:10:23 +02001810 enum kvm_mr_change change)
1811{
Ard Biesheuvel8eef9122014-10-10 17:00:32 +02001812 hva_t hva = mem->userspace_addr;
1813 hva_t reg_end = hva + mem->memory_size;
1814 bool writable = !(mem->flags & KVM_MEM_READONLY);
1815 int ret = 0;
1816
Mario Smarduch15a49a42015-01-15 15:58:58 -08001817 if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
1818 change != KVM_MR_FLAGS_ONLY)
Ard Biesheuvel8eef9122014-10-10 17:00:32 +02001819 return 0;
1820
1821 /*
Christoffer Dallc3058d52014-10-10 12:14:29 +02001822 * Prevent userspace from creating a memory region outside of the IPA
1823	 * space addressable by the KVM guest.
1824 */
1825 if (memslot->base_gfn + memslot->npages >=
1826 (KVM_PHYS_SIZE >> PAGE_SHIFT))
1827 return -EFAULT;
1828
1829 /*
Ard Biesheuvel8eef9122014-10-10 17:00:32 +02001830 * A memory region could potentially cover multiple VMAs, and any holes
1831 * between them, so iterate over all of them to find out if we can map
1832 * any of them right now.
1833 *
1834 * +--------------------------------------------+
1835 * +---------------+----------------+ +----------------+
1836 * | : VMA 1 | VMA 2 | | VMA 3 : |
1837 * +---------------+----------------+ +----------------+
1838 * | memory region |
1839 * +--------------------------------------------+
1840 */
1841 do {
1842 struct vm_area_struct *vma = find_vma(current->mm, hva);
1843 hva_t vm_start, vm_end;
1844
1845 if (!vma || vma->vm_start >= reg_end)
1846 break;
1847
1848 /*
1849 * Mapping a read-only VMA is only allowed if the
1850 * memory region is configured as read-only.
1851 */
1852 if (writable && !(vma->vm_flags & VM_WRITE)) {
1853 ret = -EPERM;
1854 break;
1855 }
1856
1857 /*
1858 * Take the intersection of this VMA with the memory region
1859 */
1860 vm_start = max(hva, vma->vm_start);
1861 vm_end = min(reg_end, vma->vm_end);
1862
1863 if (vma->vm_flags & VM_PFNMAP) {
1864 gpa_t gpa = mem->guest_phys_addr +
1865 (vm_start - mem->userspace_addr);
Marek Majtykaca09f022015-09-16 12:04:55 +02001866 phys_addr_t pa;
1867
1868 pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
1869 pa += vm_start - vma->vm_start;
Ard Biesheuvel8eef9122014-10-10 17:00:32 +02001870
Mario Smarduch15a49a42015-01-15 15:58:58 -08001871 /* IO region dirty page logging not allowed */
1872 if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES)
1873 return -EINVAL;
1874
Ard Biesheuvel8eef9122014-10-10 17:00:32 +02001875 ret = kvm_phys_addr_ioremap(kvm, gpa, pa,
1876 vm_end - vm_start,
1877 writable);
1878 if (ret)
1879 break;
1880 }
1881 hva = vm_end;
1882 } while (hva < reg_end);
1883
Mario Smarduch15a49a42015-01-15 15:58:58 -08001884 if (change == KVM_MR_FLAGS_ONLY)
1885 return ret;
1886
Ard Biesheuvel849260c2014-11-17 14:58:53 +00001887 spin_lock(&kvm->mmu_lock);
1888 if (ret)
Ard Biesheuvel8eef9122014-10-10 17:00:32 +02001889 unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
Ard Biesheuvel849260c2014-11-17 14:58:53 +00001890 else
1891 stage2_flush_memslot(kvm, memslot);
1892 spin_unlock(&kvm->mmu_lock);
Ard Biesheuvel8eef9122014-10-10 17:00:32 +02001893 return ret;
Eric Augerdf6ce242014-06-06 11:10:23 +02001894}
1895
1896void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
1897 struct kvm_memory_slot *dont)
1898{
1899}
1900
1901int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
1902 unsigned long npages)
1903{
Ard Biesheuvel849260c2014-11-17 14:58:53 +00001904 /*
1905 * Readonly memslots are not incoherent with the caches by definition,
1906 * but in practice, they are used mostly to emulate ROMs or NOR flashes
1907 * that the guest may consider devices and hence map as uncached.
1908 * To prevent incoherency issues in these cases, tag all readonly
1909 * regions as incoherent.
1910 */
1911 if (slot->flags & KVM_MEM_READONLY)
1912 slot->flags |= KVM_MEMSLOT_INCOHERENT;
Eric Augerdf6ce242014-06-06 11:10:23 +02001913 return 0;
1914}
1915
Paolo Bonzini15f46012015-05-17 21:26:08 +02001916void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
Eric Augerdf6ce242014-06-06 11:10:23 +02001917{
1918}
1919
1920void kvm_arch_flush_shadow_all(struct kvm *kvm)
1921{
1922}
1923
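/* Tear down all stage-2 mappings covering a memslot that is being invalidated. */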
1924void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
1925 struct kvm_memory_slot *slot)
1926{
Ard Biesheuvel8eef9122014-10-10 17:00:32 +02001927 gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
1928 phys_addr_t size = slot->npages << PAGE_SHIFT;
1929
1930 spin_lock(&kvm->mmu_lock);
1931 unmap_stage2_range(kvm, gpa, size);
1932 spin_unlock(&kvm->mmu_lock);
Eric Augerdf6ce242014-06-06 11:10:23 +02001933}
Marc Zyngier3c1e7162014-12-19 16:05:31 +00001934
1935/*
1936 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
1937 *
1938 * Main problems:
1939 * - S/W ops are local to a CPU (not broadcast)
1940 * - We have line migration behind our back (speculation)
1941 * - System caches don't support S/W at all (damn!)
1942 *
1943 * In the face of the above, the best we can do is to try and convert
1944 * S/W ops to VA ops. Because the guest is not allowed to infer the
1945 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
1946 * which is a rather good thing for us.
1947 *
1948 * Also, it is only used when turning caches on/off ("The expected
1949 * usage of the cache maintenance instructions that operate by set/way
1950 * is associated with the cache maintenance instructions associated
1951 * with the powerdown and powerup of caches, if this is required by
1952 * the implementation.").
1953 *
1954 * We use the following policy:
1955 *
1956 * - If we trap a S/W operation, we enable VM trapping to detect
1957 * caches being turned on/off, and do a full clean.
1958 *
1959 * - We flush the caches on both caches being turned on and off.
1960 *
1961 * - Once the caches are enabled, we stop trapping VM ops.
1962 */
1963void kvm_set_way_flush(struct kvm_vcpu *vcpu)
1964{
1965 unsigned long hcr = vcpu_get_hcr(vcpu);
1966
1967 /*
1968 * If this is the first time we do a S/W operation
1969	 * (i.e. HCR_TVM not set) flush the whole of guest memory and enable
1970	 * VM trapping.
1971 *
1972 * Otherwise, rely on the VM trapping to wait for the MMU +
1973 * Caches to be turned off. At that point, we'll be able to
1974 * clean the caches again.
1975 */
1976 if (!(hcr & HCR_TVM)) {
1977 trace_kvm_set_way_flush(*vcpu_pc(vcpu),
1978 vcpu_has_cache_enabled(vcpu));
1979 stage2_flush_vm(vcpu->kvm);
1980 vcpu_set_hcr(vcpu, hcr | HCR_TVM);
1981 }
1982}
1983
1984void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
1985{
1986 bool now_enabled = vcpu_has_cache_enabled(vcpu);
1987
1988 /*
1989 * If switching the MMU+caches on, need to invalidate the caches.
1990 * If switching it off, need to clean the caches.
1991 * Clean + invalidate does the trick always.
1992	 * Clean + invalidate always does the trick.
1993 if (now_enabled != was_enabled)
1994 stage2_flush_vm(vcpu->kvm);
1995
1996 /* Caches are now on, stop trapping VM ops (until a S/W op) */
1997 if (now_enabled)
1998 vcpu_set_hcr(vcpu, vcpu_get_hcr(vcpu) & ~HCR_TVM);
1999
2000 trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
2001}