/*
 * VFIO: IOMMU DMA mapping support for Type1 IOMMU
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 *
 * We arbitrarily define a Type1 IOMMU as one matching the below code.
 * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
 * VT-d, but that makes it harder to re-use as theoretically anyone
 * implementing a similar IOMMU could make use of this.  We expect the
 * IOMMU to support the IOMMU API and have few to no restrictions around
 * the IOVA range that can be mapped.  The Type1 IOMMU is currently
 * optimized for relatively static mappings of a userspace process with
 * userspace pages pinned into memory.  We also assume devices and IOMMU
 * domains are PCI based as the IOMMU API is still centered around a
 * device/bus interface rather than a group interface.
 */

#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/pci.h>		/* pci_bus_type */
#include <linux/rbtree.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/workqueue.h>

#define DRIVER_VERSION  "0.2"
#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC     "Type1 IOMMU driver for VFIO"

static bool allow_unsafe_interrupts;
module_param_named(allow_unsafe_interrupts,
		   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(allow_unsafe_interrupts,
		 "Enable VFIO IOMMU support on platforms without interrupt remapping support.");

struct vfio_iommu {
	struct iommu_domain	*domain;
	struct mutex		lock;
	struct rb_root		dma_list;
	struct list_head	group_list;
	bool			cache;
};

struct vfio_dma {
	struct rb_node		node;
	dma_addr_t		iova;	/* Device address */
	unsigned long		vaddr;	/* Process virtual addr */
	long			npage;	/* Number of pages */
	int			prot;	/* IOMMU_READ/WRITE */
};

struct vfio_group {
	struct iommu_group	*iommu_group;
	struct list_head	next;
};

/*
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */

#define NPAGE_TO_SIZE(npage)	((size_t)(npage) << PAGE_SHIFT)

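/*
 * Look up the first vfio_dma whose IOVA range overlaps
 * [start, start + size).  Tracked ranges never overlap one another,
 * so a standard rbtree walk suffices: descend left when the search
 * range ends at or before a node, right when it starts at or after it.
 */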
static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
				      dma_addr_t start, size_t size)
{
	struct rb_node *node = iommu->dma_list.rb_node;

	while (node) {
		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);

		if (start + size <= dma->iova)
			node = node->rb_left;
		else if (start >= dma->iova + NPAGE_TO_SIZE(dma->npage))
			node = node->rb_right;
		else
			return dma;
	}

	return NULL;
}

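/*
 * Insert a new vfio_dma into the rbtree.  Callers are expected to have
 * checked, via vfio_find_dma(), that the new range does not overlap an
 * existing one, so comparing against each node's low end is enough to
 * find the link point.
 */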
static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
{
	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
	struct vfio_dma *dma;

	while (*link) {
		parent = *link;
		dma = rb_entry(parent, struct vfio_dma, node);

		if (new->iova + NPAGE_TO_SIZE(new->npage) <= dma->iova)
			link = &(*link)->rb_left;
		else
			link = &(*link)->rb_right;
	}

	rb_link_node(&new->node, parent, link);
	rb_insert_color(&new->node, &iommu->dma_list);
}

static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
{
	rb_erase(&old->node, &iommu->dma_list);
}

struct vwork {
	struct mm_struct	*mm;
	long			npage;
	struct work_struct	work;
};

/* delayed decrement/increment for locked_vm */
static void vfio_lock_acct_bg(struct work_struct *work)
{
	struct vwork *vwork = container_of(work, struct vwork, work);
	struct mm_struct *mm;

	mm = vwork->mm;
	down_write(&mm->mmap_sem);
	mm->locked_vm += vwork->npage;
	up_write(&mm->mmap_sem);
	mmput(mm);
	kfree(vwork);
}

static void vfio_lock_acct(long npage)
{
	struct vwork *vwork;
	struct mm_struct *mm;

	if (!current->mm)
		return; /* process exited */

	if (down_write_trylock(&current->mm->mmap_sem)) {
		current->mm->locked_vm += npage;
		up_write(&current->mm->mmap_sem);
		return;
	}

	/*
	 * Couldn't get mmap_sem lock, so we must defer the mm->locked_vm
	 * update until later.  If locked_vm were atomic, we wouldn't
	 * need this silliness.
	 */
	vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
	if (!vwork)
		return;
	mm = get_task_mm(current);
	if (!mm) {
		kfree(vwork);
		return;
	}
	INIT_WORK(&vwork->work, vfio_lock_acct_bg);
	vwork->mm = mm;
	vwork->npage = npage;
	schedule_work(&vwork->work);
}

/*
 * Some mappings aren't backed by a struct page, for example an mmap'd
 * MMIO range for our own or another device.  These use a different
 * pfn conversion and shouldn't be tracked as locked pages.
 */
static bool is_invalid_reserved_pfn(unsigned long pfn)
{
	if (pfn_valid(pfn)) {
		bool reserved;
		struct page *tail = pfn_to_page(pfn);
		struct page *head = compound_trans_head(tail);
		reserved = !!(PageReserved(head));
		if (head != tail) {
			/*
			 * "head" is not a dangling pointer
			 * (compound_trans_head takes care of that),
			 * but the hugepage may have been split
			 * from under us (and we may not hold a
			 * reference count on the head page so it can
			 * be reused before we run PageReferenced), so
			 * we have to check PageTail before returning
			 * what we just read.
			 */
			smp_rmb();
			if (PageTail(tail))
				return reserved;
		}
		return PageReserved(tail);
	}

	return true;
}

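/*
 * Unpin a page previously pinned by get_user_pages_fast().  Pages that
 * came from a PFNMAP were never pinned, so they are skipped.  Returns
 * the number of locked pages released (0 or 1) for accounting.
 */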
static int put_pfn(unsigned long pfn, int prot)
{
	if (!is_invalid_reserved_pfn(pfn)) {
		struct page *page = pfn_to_page(pfn);
		if (prot & IOMMU_WRITE)
			SetPageDirty(page);
		put_page(page);
		return 1;
	}
	return 0;
}

/* Unmap DMA region */
static long __vfio_dma_do_unmap(struct vfio_iommu *iommu, dma_addr_t iova,
				long npage, int prot)
{
	long i, unlocked = 0;

	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
		unsigned long pfn;

		pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT;
		if (pfn) {
			iommu_unmap(iommu->domain, iova, PAGE_SIZE);
			unlocked += put_pfn(pfn, prot);
		}
	}
	return unlocked;
}

static void vfio_dma_unmap(struct vfio_iommu *iommu, dma_addr_t iova,
			   long npage, int prot)
{
	long unlocked;

	unlocked = __vfio_dma_do_unmap(iommu, iova, npage, prot);
	vfio_lock_acct(-unlocked);
}

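/*
 * Translate a user virtual address into a pfn, pinning the backing
 * page when there is one.  The fast path pins via
 * get_user_pages_fast(); the fallback handles VM_PFNMAP vmas (e.g.
 * mmap'd MMIO), where the pfn is derived from the vma and no page is
 * pinned.
 */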
static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
{
	struct page *page[1];
	struct vm_area_struct *vma;
	int ret = -EFAULT;

	if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
		*pfn = page_to_pfn(page[0]);
		return 0;
	}

	down_read(&current->mm->mmap_sem);

	vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);

	if (vma && vma->vm_flags & VM_PFNMAP) {
		*pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
		if (is_invalid_reserved_pfn(*pfn))
			ret = 0;
	}

	up_read(&current->mm->mmap_sem);

	return ret;
}

/* Map DMA region */
static int __vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova,
			  unsigned long vaddr, long npage, int prot)
{
	dma_addr_t start = iova;
	long i, locked = 0;
	int ret;

	/* Verify that pages are not already mapped */
	for (i = 0; i < npage; i++, iova += PAGE_SIZE)
		if (iommu_iova_to_phys(iommu->domain, iova))
			return -EBUSY;

	iova = start;

	if (iommu->cache)
		prot |= IOMMU_CACHE;

	/*
	 * XXX We break mappings into pages and use get_user_pages_fast to
	 * pin the pages in memory.  It's been suggested that mlock might
	 * provide a more efficient mechanism, but nothing prevents the
	 * user from munlocking the pages, which could then allow the user
	 * access to random host memory.  We also have no guarantee from the
	 * IOMMU API that the iommu driver can unmap sub-pages of previous
	 * mappings.  This means we might lose an entire range if a single
	 * page within it is unmapped.  Single page mappings are inefficient,
	 * but provide the most flexibility for now.
	 */
	for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) {
		unsigned long pfn = 0;

		ret = vaddr_get_pfn(vaddr, prot, &pfn);
		if (ret) {
			__vfio_dma_do_unmap(iommu, start, i, prot);
			return ret;
		}

		/*
		 * Only add actual locked pages to accounting
		 * XXX We're effectively marking a page locked for every
		 * IOVA page even though it's possible the user could be
		 * backing multiple IOVAs with the same vaddr.  This over-
		 * penalizes the user process, but we currently have no
		 * easy way to do this properly.
		 */
		if (!is_invalid_reserved_pfn(pfn))
			locked++;

		ret = iommu_map(iommu->domain, iova,
				(phys_addr_t)pfn << PAGE_SHIFT,
				PAGE_SIZE, prot);
		if (ret) {
			/* Back out mappings on error */
			put_pfn(pfn, prot);
			__vfio_dma_do_unmap(iommu, start, i, prot);
			return ret;
		}
	}
	vfio_lock_acct(locked);
	return 0;
}

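/*
 * Unmap the portion of an existing vfio_dma overlapping
 * [start, start + size).  Four cases: the range covers the whole entry
 * (remove it), overlaps only its low end (advance iova/vaddr), overlaps
 * only its high end (shrink npage), or punches a hole in the middle
 * (split the entry in two).
 */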
static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
				   size_t size, struct vfio_dma *dma)
{
	struct vfio_dma *split;
	long npage_lo, npage_hi;

	/* Existing dma region is completely covered, unmap all */
	if (start <= dma->iova &&
	    start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) {
		vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot);
		vfio_remove_dma(iommu, dma);
		kfree(dma);
		return 0;
	}

	/* Overlap low address of existing range */
	if (start <= dma->iova) {
		size_t overlap;

		overlap = start + size - dma->iova;
		npage_lo = overlap >> PAGE_SHIFT;

		vfio_dma_unmap(iommu, dma->iova, npage_lo, dma->prot);
		dma->iova += overlap;
		dma->vaddr += overlap;
		dma->npage -= npage_lo;
		return 0;
	}

	/* Overlap high address of existing range */
	if (start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) {
		size_t overlap;

		overlap = dma->iova + NPAGE_TO_SIZE(dma->npage) - start;
		npage_hi = overlap >> PAGE_SHIFT;

		vfio_dma_unmap(iommu, start, npage_hi, dma->prot);
		dma->npage -= npage_hi;
		return 0;
	}

	/* Split existing */
	npage_lo = (start - dma->iova) >> PAGE_SHIFT;
	npage_hi = dma->npage - (size >> PAGE_SHIFT) - npage_lo;

	split = kzalloc(sizeof *split, GFP_KERNEL);
	if (!split)
		return -ENOMEM;

	vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, dma->prot);

	dma->npage = npage_lo;

	split->npage = npage_hi;
	split->iova = start + size;
	split->vaddr = dma->vaddr + NPAGE_TO_SIZE(npage_lo) + size;
	split->prot = dma->prot;
	vfio_insert_dma(iommu, split);
	return 0;
}

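/*
 * Handle VFIO_IOMMU_UNMAP_DMA: after validating alignment against the
 * minimum IOMMU page size, repeatedly look up and remove any tracked
 * range overlapping the request until none remain.
 */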
static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
			     struct vfio_iommu_type1_dma_unmap *unmap)
{
	uint64_t mask;
	struct vfio_dma *dma;
	int ret = 0;

	mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;

	if (unmap->iova & mask)
		return -EINVAL;
	if (unmap->size & mask)
		return -EINVAL;

	/* XXX We still break these down into PAGE_SIZE */
	WARN_ON(mask & PAGE_MASK);

	mutex_lock(&iommu->lock);

	while (!ret && (dma = vfio_find_dma(iommu,
					    unmap->iova, unmap->size)))
		ret = vfio_remove_dma_overlap(iommu, unmap->iova,
					      unmap->size, dma);

	mutex_unlock(&iommu->lock);
	return ret;
}

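/*
 * Handle VFIO_IOMMU_MAP_DMA: validate the request, enforce the
 * RLIMIT_MEMLOCK budget, pin and map the user range, then record it in
 * the rbtree, merging with any adjacent entry that has the same
 * protection and a contiguous vaddr so the tree stays compact.
 */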
static int vfio_dma_do_map(struct vfio_iommu *iommu,
			   struct vfio_iommu_type1_dma_map *map)
{
	struct vfio_dma *dma;
	dma_addr_t iova = map->iova;
	unsigned long locked, lock_limit, vaddr = map->vaddr;
	size_t size = map->size;
	int ret = 0, prot = 0;
	uint64_t mask;
	long npage;

	mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;

	/* READ/WRITE from device perspective */
	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
		prot |= IOMMU_WRITE;
	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
		prot |= IOMMU_READ;

	if (!prot)
		return -EINVAL; /* No READ/WRITE? */

	if (vaddr & mask)
		return -EINVAL;
	if (iova & mask)
		return -EINVAL;
	if (size & mask)
		return -EINVAL;

	/* XXX We still break these down into PAGE_SIZE */
	WARN_ON(mask & PAGE_MASK);

	/* Don't allow IOVA wrap */
	if (iova + size && iova + size < iova)
		return -EINVAL;

	/* Don't allow virtual address wrap */
	if (vaddr + size && vaddr + size < vaddr)
		return -EINVAL;

	npage = size >> PAGE_SHIFT;
	if (!npage)
		return -EINVAL;

	dma = kzalloc(sizeof *dma, GFP_KERNEL);
	if (!dma)
		return -ENOMEM;

	mutex_lock(&iommu->lock);

	if (vfio_find_dma(iommu, iova, size)) {
		ret = -EBUSY;
		goto out_lock;
	}

	/* account for locked pages */
	locked = current->mm->locked_vm + npage;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
		pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
			__func__, rlimit(RLIMIT_MEMLOCK));
		ret = -ENOMEM;
		goto out_lock;
	}

	ret = __vfio_dma_map(iommu, iova, vaddr, npage, prot);
	if (ret)
		goto out_lock;

	dma->npage = npage;
	dma->iova = iova;
	dma->vaddr = vaddr;
	dma->prot = prot;

	/* Check if we abut a region below - nothing below 0 */
	if (iova) {
		struct vfio_dma *tmp = vfio_find_dma(iommu, iova - 1, 1);
		if (tmp && tmp->prot == prot &&
		    tmp->vaddr + NPAGE_TO_SIZE(tmp->npage) == vaddr) {
			vfio_remove_dma(iommu, tmp);
			dma->npage += tmp->npage;
			dma->iova = iova = tmp->iova;
			dma->vaddr = vaddr = tmp->vaddr;
			kfree(tmp);
			npage = dma->npage;
			size = NPAGE_TO_SIZE(npage);
		}
	}

	/* Check if we abut a region above - nothing above ~0 + 1 */
	if (iova + size) {
		struct vfio_dma *tmp = vfio_find_dma(iommu, iova + size, 1);
		if (tmp && tmp->prot == prot &&
		    tmp->vaddr == vaddr + size) {
			vfio_remove_dma(iommu, tmp);
			dma->npage += tmp->npage;
			kfree(tmp);
			npage = dma->npage;
			size = NPAGE_TO_SIZE(npage);
		}
	}

	vfio_insert_dma(iommu, dma);

out_lock:
	mutex_unlock(&iommu->lock);
	if (ret)
		kfree(dma);
	return ret;
}

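/*
 * Attach an iommu_group to this container.  A group may be attached at
 * most once; beyond that check we simply hand the group to the IOMMU
 * API and track it on group_list for later detach and release.
 */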
static int vfio_iommu_type1_attach_group(void *iommu_data,
					 struct iommu_group *iommu_group)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_group *group, *tmp;
	int ret;

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return -ENOMEM;

	mutex_lock(&iommu->lock);

	list_for_each_entry(tmp, &iommu->group_list, next) {
		if (tmp->iommu_group == iommu_group) {
			mutex_unlock(&iommu->lock);
			kfree(group);
			return -EINVAL;
		}
	}

	/*
	 * TODO: Domains have capabilities that might change as we add
	 * groups (see iommu->cache, currently never set).  Check for
	 * them and potentially disallow groups to be attached when it
	 * would change capabilities (ugh).
	 */
	ret = iommu_attach_group(iommu->domain, iommu_group);
	if (ret) {
		mutex_unlock(&iommu->lock);
		kfree(group);
		return ret;
	}

	group->iommu_group = iommu_group;
	list_add(&group->next, &iommu->group_list);

	mutex_unlock(&iommu->lock);

	return 0;
}

static void vfio_iommu_type1_detach_group(void *iommu_data,
					  struct iommu_group *iommu_group)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_group *group;

	mutex_lock(&iommu->lock);

	list_for_each_entry(group, &iommu->group_list, next) {
		if (group->iommu_group == iommu_group) {
			iommu_detach_group(iommu->domain, iommu_group);
			list_del(&group->next);
			kfree(group);
			break;
		}
	}

	mutex_unlock(&iommu->lock);
}

static void *vfio_iommu_type1_open(unsigned long arg)
{
	struct vfio_iommu *iommu;

	if (arg != VFIO_TYPE1_IOMMU)
		return ERR_PTR(-EINVAL);

	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
	if (!iommu)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&iommu->group_list);
	iommu->dma_list = RB_ROOT;
	mutex_init(&iommu->lock);

	/*
	 * Wish we didn't have to know about bus_type here.
	 */
	iommu->domain = iommu_domain_alloc(&pci_bus_type);
	if (!iommu->domain) {
		kfree(iommu);
		return ERR_PTR(-EIO);
	}

	/*
	 * Wish we could specify required capabilities rather than create
	 * a domain, see what comes out and hope it doesn't change along
	 * the way.  Fortunately we know interrupt remapping is global for
	 * our iommus.
	 */
	if (!allow_unsafe_interrupts &&
	    !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) {
		pr_warn("%s: No interrupt remapping support.  Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
			__func__);
		iommu_domain_free(iommu->domain);
		kfree(iommu);
		return ERR_PTR(-EPERM);
	}

	return iommu;
}

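/*
 * Tear down a container: detach every group, then walk the rbtree and
 * unmap, unpin, and free every tracked range before freeing the domain.
 */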
static void vfio_iommu_type1_release(void *iommu_data)
{
	struct vfio_iommu *iommu = iommu_data;
	struct vfio_group *group, *group_tmp;
	struct rb_node *node;

	list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) {
		iommu_detach_group(iommu->domain, group->iommu_group);
		list_del(&group->next);
		kfree(group);
	}

	while ((node = rb_first(&iommu->dma_list))) {
		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
		vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot);
		vfio_remove_dma(iommu, dma);
		kfree(dma);
	}

	iommu_domain_free(iommu->domain);
	iommu->domain = NULL;
	kfree(iommu);
}

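/*
 * For reference, a minimal userspace sketch of exercising the map and
 * unmap ioctls handled below (not part of this driver; assumes a
 * hypothetical container fd obtained from open("/dev/vfio/vfio") with
 * a group already attached, and a page-aligned buffer "buf" of "size"
 * bytes):
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(uintptr_t)buf,
 *		.iova  = 0,
 *		.size  = size,
 *	};
 *	ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
 *
 *	struct vfio_iommu_type1_dma_unmap unmap = {
 *		.argsz = sizeof(unmap),
 *		.iova  = 0,
 *		.size  = size,
 *	};
 *	ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
 */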
static long vfio_iommu_type1_ioctl(void *iommu_data,
				   unsigned int cmd, unsigned long arg)
{
	struct vfio_iommu *iommu = iommu_data;
	unsigned long minsz;

	if (cmd == VFIO_CHECK_EXTENSION) {
		switch (arg) {
		case VFIO_TYPE1_IOMMU:
			return 1;
		default:
			return 0;
		}
	} else if (cmd == VFIO_IOMMU_GET_INFO) {
		struct vfio_iommu_type1_info info;

		minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.flags = 0;

		info.iova_pgsizes = iommu->domain->ops->pgsize_bitmap;

		/* copy_to_user() returns bytes not copied, not an errno */
		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_IOMMU_MAP_DMA) {
		struct vfio_iommu_type1_dma_map map;
		uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&map, (void __user *)arg, minsz))
			return -EFAULT;

		if (map.argsz < minsz || map.flags & ~mask)
			return -EINVAL;

		return vfio_dma_do_map(iommu, &map);

	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
		struct vfio_iommu_type1_dma_unmap unmap;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);

		if (copy_from_user(&unmap, (void __user *)arg, minsz))
			return -EFAULT;

		if (unmap.argsz < minsz || unmap.flags)
			return -EINVAL;

		return vfio_dma_do_unmap(iommu, &unmap);
	}

	return -ENOTTY;
}

static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
	.name		= "vfio-iommu-type1",
	.owner		= THIS_MODULE,
	.open		= vfio_iommu_type1_open,
	.release	= vfio_iommu_type1_release,
	.ioctl		= vfio_iommu_type1_ioctl,
	.attach_group	= vfio_iommu_type1_attach_group,
	.detach_group	= vfio_iommu_type1_detach_group,
};

static int __init vfio_iommu_type1_init(void)
{
	if (!iommu_present(&pci_bus_type))
		return -ENODEV;

	return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
}

static void __exit vfio_iommu_type1_cleanup(void)
{
	vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
}

module_init(vfio_iommu_type1_init);
module_exit(vfio_iommu_type1_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);