Blame - drivers/vfio/vfio_iommu_type1.c - kernel/msm-4.9

blob: 6654a7eb42d361437bed3e4b363751fc8a7353bf [file] [log] [blame]

Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	1	/*
				2	* VFIO: IOMMU DMA mapping support for Type1 IOMMU
				3	*
				4	* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
				5	* Author: Alex Williamson <alex.williamson@redhat.com>
				6	*
				7	* This program is free software; you can redistribute it and/or modify
				8	* it under the terms of the GNU General Public License version 2 as
				9	* published by the Free Software Foundation.
				10	*
				11	* Derived from original vfio:
				12	* Copyright 2010 Cisco Systems, Inc. All rights reserved.
				13	* Author: Tom Lyon, pugs@cisco.com
				14	*
				15	* We arbitrarily define a Type1 IOMMU as one matching the below code.
				16	* It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
				17	* VT-d, but that makes it harder to re-use as theoretically anyone
				18	* implementing a similar IOMMU could make use of this. We expect the
				19	* IOMMU to support the IOMMU API and have few to no restrictions around
				20	* the IOVA range that can be mapped. The Type1 IOMMU is currently
				21	* optimized for relatively static mappings of a userspace process with
				22	* userspace pages pinned into memory. We also assume devices and IOMMU
				23	* domains are PCI based as the IOMMU API is still centered around a
				24	* device/bus interface rather than a group interface.
				25	*/
				26
				27	#include <linux/compat.h>
				28	#include <linux/device.h>
				29	#include <linux/fs.h>
				30	#include <linux/iommu.h>
				31	#include <linux/module.h>
				32	#include <linux/mm.h>
				33	#include <linux/pci.h> /* pci_bus_type */
Alex Williamson	cd9b226	2013-06-21 09:37:50 -0600	[diff] [blame]	34	#include <linux/rbtree.h>
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	35	#include <linux/sched.h>
				36	#include <linux/slab.h>
				37	#include <linux/uaccess.h>
				38	#include <linux/vfio.h>
				39	#include <linux/workqueue.h>
				40
				41	#define DRIVER_VERSION "0.2"
				42	#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
				43	#define DRIVER_DESC "Type1 IOMMU driver for VFIO"
				44
				45	static bool allow_unsafe_interrupts;
				46	module_param_named(allow_unsafe_interrupts,
				47	allow_unsafe_interrupts, bool, S_IRUGO \| S_IWUSR);
				48	MODULE_PARM_DESC(allow_unsafe_interrupts,
				49	"Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
				50
				51	struct vfio_iommu {
				52	struct iommu_domain *domain;
				53	struct mutex lock;
Alex Williamson	cd9b226	2013-06-21 09:37:50 -0600	[diff] [blame]	54	struct rb_root dma_list;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	55	struct list_head group_list;
				56	bool cache;
				57	};
				58
				59	struct vfio_dma {
Alex Williamson	cd9b226	2013-06-21 09:37:50 -0600	[diff] [blame]	60	struct rb_node node;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	61	dma_addr_t iova; /* Device address */
				62	unsigned long vaddr; /* Process virtual addr */
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	63	size_t size; /* Map size (bytes) */
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	64	int prot; /* IOMMU_READ/WRITE */
				65	};
				66
				67	struct vfio_group {
				68	struct iommu_group *iommu_group;
				69	struct list_head next;
				70	};
				71
				72	/*
				73	* This code handles mapping and unmapping of user data buffers
				74	* into DMA'ble space using the IOMMU
				75	*/
				76
Alex Williamson	cd9b226	2013-06-21 09:37:50 -0600	[diff] [blame]	77	static struct vfio_dma vfio_find_dma(struct vfio_iommu iommu,
				78	dma_addr_t start, size_t size)
				79	{
				80	struct rb_node *node = iommu->dma_list.rb_node;
				81
				82	while (node) {
				83	struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
				84
				85	if (start + size <= dma->iova)
				86	node = node->rb_left;
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	87	else if (start >= dma->iova + dma->size)
Alex Williamson	cd9b226	2013-06-21 09:37:50 -0600	[diff] [blame]	88	node = node->rb_right;
				89	else
				90	return dma;
				91	}
				92
				93	return NULL;
				94	}
				95
				96	static void vfio_insert_dma(struct vfio_iommu iommu, struct vfio_dma new)
				97	{
				98	struct rb_node *link = &iommu->dma_list.rb_node, parent = NULL;
				99	struct vfio_dma *dma;
				100
				101	while (*link) {
				102	parent = *link;
				103	dma = rb_entry(parent, struct vfio_dma, node);
				104
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	105	if (new->iova + new->size <= dma->iova)
Alex Williamson	cd9b226	2013-06-21 09:37:50 -0600	[diff] [blame]	106	link = &(*link)->rb_left;
				107	else
				108	link = &(*link)->rb_right;
				109	}
				110
				111	rb_link_node(&new->node, parent, link);
				112	rb_insert_color(&new->node, &iommu->dma_list);
				113	}
				114
				115	static void vfio_remove_dma(struct vfio_iommu iommu, struct vfio_dma old)
				116	{
				117	rb_erase(&old->node, &iommu->dma_list);
				118	}
				119
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	120	struct vwork {
				121	struct mm_struct *mm;
				122	long npage;
				123	struct work_struct work;
				124	};
				125
				126	/* delayed decrement/increment for locked_vm */
				127	static void vfio_lock_acct_bg(struct work_struct *work)
				128	{
				129	struct vwork *vwork = container_of(work, struct vwork, work);
				130	struct mm_struct *mm;
				131
				132	mm = vwork->mm;
				133	down_write(&mm->mmap_sem);
				134	mm->locked_vm += vwork->npage;
				135	up_write(&mm->mmap_sem);
				136	mmput(mm);
				137	kfree(vwork);
				138	}
				139
				140	static void vfio_lock_acct(long npage)
				141	{
				142	struct vwork *vwork;
				143	struct mm_struct *mm;
				144
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	145	if (!current->mm \|\| !npage)
				146	return; /* process exited or nothing to do */
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	147
				148	if (down_write_trylock(&current->mm->mmap_sem)) {
				149	current->mm->locked_vm += npage;
				150	up_write(&current->mm->mmap_sem);
				151	return;
				152	}
				153
				154	/*
				155	* Couldn't get mmap_sem lock, so must setup to update
				156	* mm->locked_vm later. If locked_vm were atomic, we
				157	* wouldn't need this silliness
				158	*/
				159	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
				160	if (!vwork)
				161	return;
				162	mm = get_task_mm(current);
				163	if (!mm) {
				164	kfree(vwork);
				165	return;
				166	}
				167	INIT_WORK(&vwork->work, vfio_lock_acct_bg);
				168	vwork->mm = mm;
				169	vwork->npage = npage;
				170	schedule_work(&vwork->work);
				171	}
				172
				173	/*
				174	* Some mappings aren't backed by a struct page, for example an mmap'd
				175	* MMIO range for our own or another device. These use a different
				176	* pfn conversion and shouldn't be tracked as locked pages.
				177	*/
				178	static bool is_invalid_reserved_pfn(unsigned long pfn)
				179	{
				180	if (pfn_valid(pfn)) {
				181	bool reserved;
				182	struct page *tail = pfn_to_page(pfn);
				183	struct page *head = compound_trans_head(tail);
				184	reserved = !!(PageReserved(head));
				185	if (head != tail) {
				186	/*
				187	* "head" is not a dangling pointer
				188	* (compound_trans_head takes care of that)
				189	* but the hugepage may have been split
				190	* from under us (and we may not hold a
				191	* reference count on the head page so it can
				192	* be reused before we run PageReferenced), so
				193	* we've to check PageTail before returning
				194	* what we just read.
				195	*/
				196	smp_rmb();
				197	if (PageTail(tail))
				198	return reserved;
				199	}
				200	return PageReserved(tail);
				201	}
				202
				203	return true;
				204	}
				205
				206	static int put_pfn(unsigned long pfn, int prot)
				207	{
				208	if (!is_invalid_reserved_pfn(pfn)) {
				209	struct page *page = pfn_to_page(pfn);
				210	if (prot & IOMMU_WRITE)
				211	SetPageDirty(page);
				212	put_page(page);
				213	return 1;
				214	}
				215	return 0;
				216	}
				217
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	218	static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
				219	{
				220	struct page *page[1];
				221	struct vm_area_struct *vma;
				222	int ret = -EFAULT;
				223
				224	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
				225	*pfn = page_to_pfn(page[0]);
				226	return 0;
				227	}
				228
				229	down_read(&current->mm->mmap_sem);
				230
				231	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
				232
				233	if (vma && vma->vm_flags & VM_PFNMAP) {
				234	*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
				235	if (is_invalid_reserved_pfn(*pfn))
				236	ret = 0;
				237	}
				238
				239	up_read(&current->mm->mmap_sem);
				240
				241	return ret;
				242	}
				243
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	244	/*
				245	* Attempt to pin pages. We really don't want to track all the pfns and
				246	* the iommu can only map chunks of consecutive pfns anyway, so get the
				247	* first page and all consecutive pages with the same locking.
				248	*/
				249	static long vfio_pin_pages(unsigned long vaddr, long npage,
				250	int prot, unsigned long *pfn_base)
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	251	{
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	252	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
				253	bool lock_cap = capable(CAP_IPC_LOCK);
				254	long ret, i;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	255
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	256	if (!current->mm)
				257	return -ENODEV;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	258
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	259	ret = vaddr_get_pfn(vaddr, prot, pfn_base);
				260	if (ret)
				261	return ret;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	262
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	263	if (is_invalid_reserved_pfn(*pfn_base))
				264	return 1;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	265
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	266	if (!lock_cap && current->mm->locked_vm + 1 > limit) {
				267	put_pfn(*pfn_base, prot);
				268	pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
				269	limit << PAGE_SHIFT);
				270	return -ENOMEM;
				271	}
				272
				273	/* Lock all the consecutive pages from pfn_base */
				274	for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	275	unsigned long pfn = 0;
				276
				277	ret = vaddr_get_pfn(vaddr, prot, &pfn);
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	278	if (ret)
				279	break;
				280
				281	if (pfn != *pfn_base + i \|\| is_invalid_reserved_pfn(pfn)) {
				282	put_pfn(pfn, prot);
				283	break;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	284	}
				285
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	286	if (!lock_cap && current->mm->locked_vm + i + 1 > limit) {
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	287	put_pfn(pfn, prot);
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	288	pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
				289	__func__, limit << PAGE_SHIFT);
				290	break;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	291	}
				292	}
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	293
				294	vfio_lock_acct(i);
				295
				296	return i;
				297	}
				298
				299	static long vfio_unpin_pages(unsigned long pfn, long npage,
				300	int prot, bool do_accounting)
				301	{
				302	unsigned long unlocked = 0;
				303	long i;
				304
				305	for (i = 0; i < npage; i++)
				306	unlocked += put_pfn(pfn++, prot);
				307
				308	if (do_accounting)
				309	vfio_lock_acct(-unlocked);
				310
				311	return unlocked;
				312	}
				313
				314	static int vfio_unmap_unpin(struct vfio_iommu iommu, struct vfio_dma dma,
				315	dma_addr_t iova, size_t *size)
				316	{
				317	dma_addr_t start = iova, end = iova + *size;
				318	long unlocked = 0;
				319
				320	while (iova < end) {
				321	size_t unmapped;
				322	phys_addr_t phys;
				323
				324	/*
				325	* We use the IOMMU to track the physical address. This
				326	* saves us from having a lot more entries in our mapping
				327	* tree. The downside is that we don't track the size
				328	* used to do the mapping. We request unmap of a single
				329	* page, but expect IOMMUs that support large pages to
				330	* unmap a larger chunk.
				331	*/
				332	phys = iommu_iova_to_phys(iommu->domain, iova);
				333	if (WARN_ON(!phys)) {
				334	iova += PAGE_SIZE;
				335	continue;
				336	}
				337
				338	unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE);
				339	if (!unmapped)
				340	break;
				341
				342	unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
				343	unmapped >> PAGE_SHIFT,
				344	dma->prot, false);
				345	iova += unmapped;
				346	}
				347
				348	vfio_lock_acct(-unlocked);
				349
				350	*size = iova - start;
				351
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	352	return 0;
				353	}
				354
Alex Williamson	cd9b226	2013-06-21 09:37:50 -0600	[diff] [blame]	355	static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	356	size_t size, struct vfio_dma dma)
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	357	{
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	358	size_t offset, overlap, tmp;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	359	struct vfio_dma *split;
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	360	int ret;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	361
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	362	/*
				363	* Existing dma region is completely covered, unmap all. This is
				364	* the likely case since userspace tends to map and unmap buffers
				365	* in one shot rather than multiple mappings within a buffer.
				366	*/
				367	if (likely(start <= dma->iova &&
				368	start + *size >= dma->iova + dma->size)) {
				369	*size = dma->size;
				370	ret = vfio_unmap_unpin(iommu, dma, dma->iova, size);
				371	if (ret)
				372	return ret;
				373
				374	/*
				375	* Did we remove more than we have? Should never happen
				376	* since a vfio_dma is contiguous in iova and vaddr.
				377	*/
				378	WARN_ON(*size != dma->size);
				379
Alex Williamson	cd9b226	2013-06-21 09:37:50 -0600	[diff] [blame]	380	vfio_remove_dma(iommu, dma);
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	381	kfree(dma);
Alex Williamson	cd9b226	2013-06-21 09:37:50 -0600	[diff] [blame]	382	return 0;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	383	}
				384
				385	/* Overlap low address of existing range */
				386	if (start <= dma->iova) {
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	387	overlap = start + *size - dma->iova;
				388	ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap);
				389	if (ret)
				390	return ret;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	391
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	392	vfio_remove_dma(iommu, dma);
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	393
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	394	/*
				395	* Check, we may have removed to whole vfio_dma. If not
				396	* fixup and re-insert.
				397	*/
				398	if (overlap < dma->size) {
				399	dma->iova += overlap;
				400	dma->vaddr += overlap;
				401	dma->size -= overlap;
				402	vfio_insert_dma(iommu, dma);
				403	}
				404	*size = overlap;
Alex Williamson	cd9b226	2013-06-21 09:37:50 -0600	[diff] [blame]	405	return 0;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	406	}
				407
				408	/* Overlap high address of existing range */
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	409	if (start + *size >= dma->iova + dma->size) {
				410	offset = start - dma->iova;
				411	overlap = dma->size - offset;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	412
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	413	ret = vfio_unmap_unpin(iommu, dma, start, &overlap);
				414	if (ret)
				415	return ret;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	416
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	417	/*
				418	* We may have unmapped the entire vfio_dma if the user is
				419	* trying to unmap a sub-region of what was originally
				420	* mapped. If anything left, we can resize in place since
				421	* iova is unchanged.
				422	*/
				423	if (overlap < dma->size)
				424	dma->size -= overlap;
				425	else
				426	vfio_remove_dma(iommu, dma);
				427
				428	*size = overlap;
Alex Williamson	cd9b226	2013-06-21 09:37:50 -0600	[diff] [blame]	429	return 0;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	430	}
				431
				432	/* Split existing */
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	433	offset = start - dma->iova;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	434
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	435	ret = vfio_unmap_unpin(iommu, dma, start, size);
				436	if (ret)
				437	return ret;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	438
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	439	WARN_ON(!*size);
				440	tmp = dma->size;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	441
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	442	/*
				443	* Resize the lower vfio_dma in place, insert new for remaining
				444	* upper segment.
				445	*/
				446	dma->size = offset;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	447
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	448	if (offset + *size < tmp) {
				449	split = kzalloc(sizeof(*split), GFP_KERNEL);
				450	if (!split)
				451	return -ENOMEM;
				452
				453	split->size = tmp - offset - *size;
				454	split->iova = dma->iova + offset + *size;
				455	split->vaddr = dma->vaddr + offset + *size;
				456	split->prot = dma->prot;
				457	vfio_insert_dma(iommu, split);
				458	}
				459
Alex Williamson	cd9b226	2013-06-21 09:37:50 -0600	[diff] [blame]	460	return 0;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	461	}
				462
				463	static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
				464	struct vfio_iommu_type1_dma_unmap *unmap)
				465	{
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	466	uint64_t mask;
Alex Williamson	cd9b226	2013-06-21 09:37:50 -0600	[diff] [blame]	467	struct vfio_dma *dma;
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	468	size_t unmapped = 0, size;
Alex Williamson	cd9b226	2013-06-21 09:37:50 -0600	[diff] [blame]	469	int ret = 0;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	470
				471	mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
				472
				473	if (unmap->iova & mask)
				474	return -EINVAL;
				475	if (unmap->size & mask)
				476	return -EINVAL;
				477
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	478	WARN_ON(mask & PAGE_MASK);
				479
				480	mutex_lock(&iommu->lock);
				481
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	482	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
				483	size = unmap->size;
				484	ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma);
				485	if (ret)
				486	break;
				487	unmapped += size;
				488	}
Alex Williamson	cd9b226	2013-06-21 09:37:50 -0600	[diff] [blame]	489
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	490	mutex_unlock(&iommu->lock);
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	491
				492	/*
				493	* We may unmap more than requested, update the unmap struct so
				494	* userspace can know.
				495	*/
				496	unmap->size = unmapped;
				497
				498	return ret;
				499	}
				500
				501	/*
				502	* Turns out AMD IOMMU has a page table bug where it won't map large pages
				503	* to a region that previously mapped smaller pages. This should be fixed
				504	* soon, so this is just a temporary workaround to break mappings down into
				505	* PAGE_SIZE. Better to map smaller pages than nothing.
				506	*/
				507	static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova,
				508	unsigned long pfn, long npage, int prot)
				509	{
				510	long i;
				511	int ret;
				512
				513	for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
				514	ret = iommu_map(iommu->domain, iova,
				515	(phys_addr_t)pfn << PAGE_SHIFT,
				516	PAGE_SIZE, prot);
				517	if (ret)
				518	break;
				519	}
				520
				521	for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
				522	iommu_unmap(iommu->domain, iova, PAGE_SIZE);
				523
Alex Williamson	cd9b226	2013-06-21 09:37:50 -0600	[diff] [blame]	524	return ret;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	525	}
				526
				527	static int vfio_dma_do_map(struct vfio_iommu *iommu,
				528	struct vfio_iommu_type1_dma_map *map)
				529	{
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	530	dma_addr_t end, iova;
				531	unsigned long vaddr = map->vaddr;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	532	size_t size = map->size;
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	533	long npage;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	534	int ret = 0, prot = 0;
				535	uint64_t mask;
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	536
				537	end = map->iova + map->size;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	538
				539	mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
				540
				541	/* READ/WRITE from device perspective */
				542	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
				543	prot \|= IOMMU_WRITE;
				544	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
				545	prot \|= IOMMU_READ;
				546
				547	if (!prot)
				548	return -EINVAL; /* No READ/WRITE? */
				549
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	550	if (iommu->cache)
				551	prot \|= IOMMU_CACHE;
				552
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	553	if (vaddr & mask)
				554	return -EINVAL;
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	555	if (map->iova & mask)
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	556	return -EINVAL;
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	557	if (!map->size \|\| map->size & mask)
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	558	return -EINVAL;
				559
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	560	WARN_ON(mask & PAGE_MASK);
				561
				562	/* Don't allow IOVA wrap */
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	563	if (end && end < map->iova)
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	564	return -EINVAL;
				565
				566	/* Don't allow virtual address wrap */
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	567	if (vaddr + map->size && vaddr + map->size < vaddr)
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	568	return -EINVAL;
				569
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	570	mutex_lock(&iommu->lock);
				571
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	572	if (vfio_find_dma(iommu, map->iova, map->size)) {
				573	mutex_unlock(&iommu->lock);
				574	return -EEXIST;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	575	}
				576
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	577	for (iova = map->iova; iova < end; iova += size, vaddr += size) {
				578	struct vfio_dma *dma = NULL;
				579	unsigned long pfn;
				580	long i;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	581
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	582	/* Pin a contiguous chunk of memory */
				583	npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT,
				584	prot, &pfn);
				585	if (npage <= 0) {
				586	WARN_ON(!npage);
				587	ret = (int)npage;
				588	break;
				589	}
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	590
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	591	/* Verify pages are not already mapped */
				592	for (i = 0; i < npage; i++) {
				593	if (iommu_iova_to_phys(iommu->domain,
				594	iova + (i << PAGE_SHIFT))) {
				595	vfio_unpin_pages(pfn, npage, prot, true);
				596	ret = -EBUSY;
				597	break;
				598	}
				599	}
Alex Williamson	cd9b226	2013-06-21 09:37:50 -0600	[diff] [blame]	600
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	601	ret = iommu_map(iommu->domain, iova,
				602	(phys_addr_t)pfn << PAGE_SHIFT,
				603	npage << PAGE_SHIFT, prot);
				604	if (ret) {
				605	if (ret != -EBUSY \|\|
				606	map_try_harder(iommu, iova, pfn, npage, prot)) {
				607	vfio_unpin_pages(pfn, npage, prot, true);
				608	break;
				609	}
				610	}
				611
				612	size = npage << PAGE_SHIFT;
				613
				614	/*
				615	* Check if we abut a region below - nothing below 0.
				616	* This is the most likely case when mapping chunks of
				617	* physically contiguous regions within a virtual address
				618	* range. Update the abutting entry in place since iova
				619	* doesn't change.
				620	*/
				621	if (likely(iova)) {
				622	struct vfio_dma *tmp;
				623	tmp = vfio_find_dma(iommu, iova - 1, 1);
				624	if (tmp && tmp->prot == prot &&
				625	tmp->vaddr + tmp->size == vaddr) {
				626	tmp->size += size;
				627
				628	iova = tmp->iova;
				629	size = tmp->size;
				630	vaddr = tmp->vaddr;
				631	dma = tmp;
				632	}
				633	}
				634
				635	/* Check if we abut a region above - nothing above ~0 + 1 */
				636	if (likely(iova + size)) {
				637	struct vfio_dma *tmp;
				638
				639	tmp = vfio_find_dma(iommu, iova + size, 1);
				640	if (tmp && tmp->prot == prot &&
				641	tmp->vaddr == vaddr + size) {
				642	vfio_remove_dma(iommu, tmp);
				643	if (dma)
				644	dma->size += tmp->size;
				645	else
				646	size += tmp->size;
				647	kfree(tmp);
				648	}
				649	}
				650
				651	if (!dma) {
				652	dma = kzalloc(sizeof(*dma), GFP_KERNEL);
				653	if (!dma) {
				654	iommu_unmap(iommu->domain, iova, size);
				655	vfio_unpin_pages(pfn, npage, prot, true);
				656	ret = -ENOMEM;
				657	break;
				658	}
				659
				660	dma->size = size;
				661	dma->iova = iova;
				662	dma->vaddr = vaddr;
				663	dma->prot = prot;
				664	vfio_insert_dma(iommu, dma);
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	665	}
				666	}
				667
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	668	if (ret) {
				669	struct vfio_dma *tmp;
				670	iova = map->iova;
				671	size = map->size;
				672	while ((tmp = vfio_find_dma(iommu, iova, size))) {
				673	if (vfio_remove_dma_overlap(iommu, iova, &size, tmp)) {
				674	pr_warn("%s: Error rolling back failed map\n",
				675	__func__);
				676	break;
				677	}
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	678	}
				679	}
				680
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	681	mutex_unlock(&iommu->lock);
				682	return ret;
				683	}
				684
				685	static int vfio_iommu_type1_attach_group(void *iommu_data,
				686	struct iommu_group *iommu_group)
				687	{
				688	struct vfio_iommu *iommu = iommu_data;
				689	struct vfio_group group, tmp;
				690	int ret;
				691
				692	group = kzalloc(sizeof(*group), GFP_KERNEL);
				693	if (!group)
				694	return -ENOMEM;
				695
				696	mutex_lock(&iommu->lock);
				697
				698	list_for_each_entry(tmp, &iommu->group_list, next) {
				699	if (tmp->iommu_group == iommu_group) {
				700	mutex_unlock(&iommu->lock);
				701	kfree(group);
				702	return -EINVAL;
				703	}
				704	}
				705
				706	/*
				707	* TODO: Domain have capabilities that might change as we add
				708	* groups (see iommu->cache, currently never set). Check for
				709	* them and potentially disallow groups to be attached when it
				710	* would change capabilities (ugh).
				711	*/
				712	ret = iommu_attach_group(iommu->domain, iommu_group);
				713	if (ret) {
				714	mutex_unlock(&iommu->lock);
				715	kfree(group);
				716	return ret;
				717	}
				718
				719	group->iommu_group = iommu_group;
				720	list_add(&group->next, &iommu->group_list);
				721
				722	mutex_unlock(&iommu->lock);
				723
				724	return 0;
				725	}
				726
				727	static void vfio_iommu_type1_detach_group(void *iommu_data,
				728	struct iommu_group *iommu_group)
				729	{
				730	struct vfio_iommu *iommu = iommu_data;
				731	struct vfio_group *group;
				732
				733	mutex_lock(&iommu->lock);
				734
				735	list_for_each_entry(group, &iommu->group_list, next) {
				736	if (group->iommu_group == iommu_group) {
				737	iommu_detach_group(iommu->domain, iommu_group);
				738	list_del(&group->next);
				739	kfree(group);
				740	break;
				741	}
				742	}
				743
				744	mutex_unlock(&iommu->lock);
				745	}
				746
				747	static void *vfio_iommu_type1_open(unsigned long arg)
				748	{
				749	struct vfio_iommu *iommu;
				750
				751	if (arg != VFIO_TYPE1_IOMMU)
				752	return ERR_PTR(-EINVAL);
				753
				754	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
				755	if (!iommu)
				756	return ERR_PTR(-ENOMEM);
				757
				758	INIT_LIST_HEAD(&iommu->group_list);
Alex Williamson	cd9b226	2013-06-21 09:37:50 -0600	[diff] [blame]	759	iommu->dma_list = RB_ROOT;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	760	mutex_init(&iommu->lock);
				761
				762	/*
				763	* Wish we didn't have to know about bus_type here.
				764	*/
				765	iommu->domain = iommu_domain_alloc(&pci_bus_type);
				766	if (!iommu->domain) {
				767	kfree(iommu);
				768	return ERR_PTR(-EIO);
				769	}
				770
				771	/*
				772	* Wish we could specify required capabilities rather than create
				773	* a domain, see what comes out and hope it doesn't change along
				774	* the way. Fortunately we know interrupt remapping is global for
				775	* our iommus.
				776	*/
				777	if (!allow_unsafe_interrupts &&
				778	!iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) {
				779	pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
				780	__func__);
				781	iommu_domain_free(iommu->domain);
				782	kfree(iommu);
				783	return ERR_PTR(-EPERM);
				784	}
				785
				786	return iommu;
				787	}
				788
				789	static void vfio_iommu_type1_release(void *iommu_data)
				790	{
				791	struct vfio_iommu *iommu = iommu_data;
				792	struct vfio_group group, group_tmp;
Alex Williamson	cd9b226	2013-06-21 09:37:50 -0600	[diff] [blame]	793	struct rb_node *node;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	794
				795	list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) {
				796	iommu_detach_group(iommu->domain, group->iommu_group);
				797	list_del(&group->next);
				798	kfree(group);
				799	}
				800
Alex Williamson	cd9b226	2013-06-21 09:37:50 -0600	[diff] [blame]	801	while ((node = rb_first(&iommu->dma_list))) {
				802	struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	803	size_t size = dma->size;
				804	vfio_remove_dma_overlap(iommu, dma->iova, &size, dma);
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	805	}
				806
				807	iommu_domain_free(iommu->domain);
				808	iommu->domain = NULL;
				809	kfree(iommu);
				810	}
				811
				812	static long vfio_iommu_type1_ioctl(void *iommu_data,
				813	unsigned int cmd, unsigned long arg)
				814	{
				815	struct vfio_iommu *iommu = iommu_data;
				816	unsigned long minsz;
				817
				818	if (cmd == VFIO_CHECK_EXTENSION) {
				819	switch (arg) {
				820	case VFIO_TYPE1_IOMMU:
				821	return 1;
				822	default:
				823	return 0;
				824	}
				825	} else if (cmd == VFIO_IOMMU_GET_INFO) {
				826	struct vfio_iommu_type1_info info;
				827
				828	minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
				829
				830	if (copy_from_user(&info, (void __user *)arg, minsz))
				831	return -EFAULT;
				832
				833	if (info.argsz < minsz)
				834	return -EINVAL;
				835
				836	info.flags = 0;
				837
				838	info.iova_pgsizes = iommu->domain->ops->pgsize_bitmap;
				839
				840	return copy_to_user((void __user *)arg, &info, minsz);
				841
				842	} else if (cmd == VFIO_IOMMU_MAP_DMA) {
				843	struct vfio_iommu_type1_dma_map map;
				844	uint32_t mask = VFIO_DMA_MAP_FLAG_READ \|
				845	VFIO_DMA_MAP_FLAG_WRITE;
				846
				847	minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
				848
				849	if (copy_from_user(&map, (void __user *)arg, minsz))
				850	return -EFAULT;
				851
				852	if (map.argsz < minsz \|\| map.flags & ~mask)
				853	return -EINVAL;
				854
				855	return vfio_dma_do_map(iommu, &map);
				856
				857	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
				858	struct vfio_iommu_type1_dma_unmap unmap;
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	859	long ret;
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	860
				861	minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
				862
				863	if (copy_from_user(&unmap, (void __user *)arg, minsz))
				864	return -EFAULT;
				865
				866	if (unmap.argsz < minsz \|\| unmap.flags)
				867	return -EINVAL;
				868
Alex Williamson	166fd7d	2013-06-21 09:38:02 -0600	[diff] [blame^]	869	ret = vfio_dma_do_unmap(iommu, &unmap);
				870	if (ret)
				871	return ret;
				872
				873	return copy_to_user((void __user *)arg, &unmap, minsz);
Alex Williamson	73fa0d1	2012-07-31 08:16:23 -0600	[diff] [blame]	874	}
				875
				876	return -ENOTTY;
				877	}
				878
				879	static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
				880	.name = "vfio-iommu-type1",
				881	.owner = THIS_MODULE,
				882	.open = vfio_iommu_type1_open,
				883	.release = vfio_iommu_type1_release,
				884	.ioctl = vfio_iommu_type1_ioctl,
				885	.attach_group = vfio_iommu_type1_attach_group,
				886	.detach_group = vfio_iommu_type1_detach_group,
				887	};
				888
				889	static int __init vfio_iommu_type1_init(void)
				890	{
				891	if (!iommu_present(&pci_bus_type))
				892	return -ENODEV;
				893
				894	return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
				895	}
				896
				897	static void __exit vfio_iommu_type1_cleanup(void)
				898	{
				899	vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
				900	}
				901
				902	module_init(vfio_iommu_type1_init);
				903	module_exit(vfio_iommu_type1_cleanup);
				904
				905	MODULE_VERSION(DRIVER_VERSION);
				906	MODULE_LICENSE("GPL v2");
				907	MODULE_AUTHOR(DRIVER_AUTHOR);
				908	MODULE_DESCRIPTION(DRIVER_DESC);