Merge tag 'vfio-v4.5-rc1' of git://github.com/awilliam/linux-vfio
Pull VFIO updates from Alex Williamson:
- Fixes in AMD xgbe reset, spapr structure padding, type 1 flags (Dan
Carpenter, Alexey Kardashevskiy, Pierre Morel)
- Re-introduce no-iommu mode, with a user this time (Alex Williamson)
* tag 'vfio-v4.5-rc1' of git://github.com/awilliam/linux-vfio:
vfio/iommu_type1: make use of info.flags
vfio: Include No-IOMMU mode
vfio: Add explicit alignments in vfio_iommu_spapr_tce_create
VFIO: platform: reset: fix a warning message condition
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 850d86c..da6e2ce 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -31,6 +31,21 @@
If you don't know what to do here, say N.
+menuconfig VFIO_NOIOMMU
+ bool "VFIO No-IOMMU support"
+ depends on VFIO
+ help
+ VFIO is built on the ability to isolate devices using the IOMMU.
+ Only with an IOMMU can userspace access to DMA capable devices be
+ considered secure. VFIO No-IOMMU mode enables IOMMU groups for
+ devices without IOMMU backing for the purpose of re-using the VFIO
+ infrastructure in a non-secure mode. Use of this mode will result
+ in an unsupportable kernel and will therefore taint the kernel.
+ Device assignment to virtual machines is also not possible with
+ this mode since there is no IOMMU to provide DMA translation.
+
+ If you don't know what to do here, say N.
+
source "drivers/vfio/pci/Kconfig"
source "drivers/vfio/platform/Kconfig"
source "virt/lib/Kconfig"
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 56bf6db..2760a7b 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -940,13 +940,13 @@
if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
return -EINVAL;
- group = iommu_group_get(&pdev->dev);
+ group = vfio_iommu_group_get(&pdev->dev);
if (!group)
return -EINVAL;
vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
if (!vdev) {
- iommu_group_put(group);
+ vfio_iommu_group_put(group, &pdev->dev);
return -ENOMEM;
}
@@ -957,7 +957,7 @@
ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
if (ret) {
- iommu_group_put(group);
+ vfio_iommu_group_put(group, &pdev->dev);
kfree(vdev);
return ret;
}
@@ -993,7 +993,7 @@
if (!vdev)
return;
- iommu_group_put(pdev->dev.iommu_group);
+ vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
kfree(vdev);
if (vfio_pci_is_vga(pdev)) {
diff --git a/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c b/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c
index da5356f..d4030d0 100644
--- a/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c
+++ b/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c
@@ -110,7 +110,7 @@
usleep_range(10, 15);
count = 2000;
- while (count-- && (ioread32(xgmac_regs->ioaddr + DMA_MR) & 1))
+ while (--count && (ioread32(xgmac_regs->ioaddr + DMA_MR) & 1))
usleep_range(500, 600);
if (!count)
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 6070b79..82f25cc 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -62,6 +62,7 @@
struct rw_semaphore group_lock;
struct vfio_iommu_driver *iommu_driver;
void *iommu_data;
+ bool noiommu;
};
struct vfio_unbound_dev {
@@ -84,6 +85,7 @@
struct list_head unbound_list;
struct mutex unbound_lock;
atomic_t opened;
+ bool noiommu;
};
struct vfio_device {
@@ -95,6 +97,128 @@
void *device_data;
};
+#ifdef CONFIG_VFIO_NOIOMMU
+static bool noiommu __read_mostly;
+module_param_named(enable_unsafe_noiommu_mode,
+ noiommu, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
+#endif
+
+/*
+ * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
+ * and remove functions, any use cases other than acquiring the first
+ * reference for the purpose of calling vfio_add_group_dev() or removing
+ * that symmetric reference after vfio_del_group_dev() should use the raw
+ * iommu_group_{get,put} functions. In particular, vfio_iommu_group_put()
+ * removes the device from the dummy group and cannot be nested.
+ */
+struct iommu_group *vfio_iommu_group_get(struct device *dev)
+{
+ struct iommu_group *group;
+ int __maybe_unused ret;
+
+ group = iommu_group_get(dev);
+
+#ifdef CONFIG_VFIO_NOIOMMU
+ /*
+ * With noiommu enabled, an IOMMU group will be created for a device
+ * that doesn't already have one and doesn't have an iommu_ops on their
+ * bus. We use iommu_present() again in the main code to detect these
+ * fake groups.
+ */
+ if (group || !noiommu || iommu_present(dev->bus))
+ return group;
+
+ group = iommu_group_alloc();
+ if (IS_ERR(group))
+ return NULL;
+
+ iommu_group_set_name(group, "vfio-noiommu");
+ ret = iommu_group_add_device(group, dev);
+ iommu_group_put(group);
+ if (ret)
+ return NULL;
+
+ /*
+ * Where to taint? At this point we've added an IOMMU group for a
+ * device that is not backed by iommu_ops, therefore any iommu_
+ * callback using iommu_ops can legitimately Oops. So, while we may
+ * be about to give a DMA capable device to a user without IOMMU
+ * protection, which is clearly taint-worthy, let's go ahead and do
+ * it here.
+ */
+ add_taint(TAINT_USER, LOCKDEP_STILL_OK);
+ dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
+#endif
+
+ return group;
+}
+EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
+
+void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
+{
+#ifdef CONFIG_VFIO_NOIOMMU
+ if (!iommu_present(dev->bus))
+ iommu_group_remove_device(dev);
+#endif
+
+ iommu_group_put(group);
+}
+EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
+
+#ifdef CONFIG_VFIO_NOIOMMU
+static void *vfio_noiommu_open(unsigned long arg)
+{
+ if (arg != VFIO_NOIOMMU_IOMMU)
+ return ERR_PTR(-EINVAL);
+ if (!capable(CAP_SYS_RAWIO))
+ return ERR_PTR(-EPERM);
+
+ return NULL;
+}
+
+static void vfio_noiommu_release(void *iommu_data)
+{
+}
+
+static long vfio_noiommu_ioctl(void *iommu_data,
+ unsigned int cmd, unsigned long arg)
+{
+ if (cmd == VFIO_CHECK_EXTENSION)
+ return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
+
+ return -ENOTTY;
+}
+
+static int vfio_iommu_present(struct device *dev, void *unused)
+{
+ return iommu_present(dev->bus) ? 1 : 0;
+}
+
+static int vfio_noiommu_attach_group(void *iommu_data,
+ struct iommu_group *iommu_group)
+{
+ return iommu_group_for_each_dev(iommu_group, NULL,
+ vfio_iommu_present) ? -EINVAL : 0;
+}
+
+static void vfio_noiommu_detach_group(void *iommu_data,
+ struct iommu_group *iommu_group)
+{
+}
+
+static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
+ .name = "vfio-noiommu",
+ .owner = THIS_MODULE,
+ .open = vfio_noiommu_open,
+ .release = vfio_noiommu_release,
+ .ioctl = vfio_noiommu_ioctl,
+ .attach_group = vfio_noiommu_attach_group,
+ .detach_group = vfio_noiommu_detach_group,
+};
+#endif
+
+
/**
* IOMMU driver registration
*/
@@ -199,7 +323,8 @@
/**
* Group objects - create, release, get, put, search
*/
-static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
+static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
+ bool iommu_present)
{
struct vfio_group *group, *tmp;
struct device *dev;
@@ -217,6 +342,7 @@
atomic_set(&group->container_users, 0);
atomic_set(&group->opened, 0);
group->iommu_group = iommu_group;
+ group->noiommu = !iommu_present;
group->nb.notifier_call = vfio_iommu_group_notifier;
@@ -252,7 +378,8 @@
dev = device_create(vfio.class, NULL,
MKDEV(MAJOR(vfio.group_devt), minor),
- group, "%d", iommu_group_id(iommu_group));
+ group, "%s%d", group->noiommu ? "noiommu-" : "",
+ iommu_group_id(iommu_group));
if (IS_ERR(dev)) {
vfio_free_group_minor(minor);
vfio_group_unlock_and_free(group);
@@ -640,7 +767,7 @@
group = vfio_group_get_from_iommu(iommu_group);
if (!group) {
- group = vfio_create_group(iommu_group);
+ group = vfio_create_group(iommu_group, iommu_present(dev->bus));
if (IS_ERR(group)) {
iommu_group_put(iommu_group);
return PTR_ERR(group);
@@ -854,6 +981,14 @@
mutex_lock(&vfio.iommu_drivers_lock);
list_for_each_entry(driver, &vfio.iommu_drivers_list,
vfio_next) {
+
+#ifdef CONFIG_VFIO_NOIOMMU
+ if (!list_empty(&container->group_list) &&
+ (container->noiommu !=
+ (driver->ops == &vfio_noiommu_ops)))
+ continue;
+#endif
+
if (!try_module_get(driver->ops->owner))
continue;
@@ -925,6 +1060,15 @@
list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
void *data;
+#ifdef CONFIG_VFIO_NOIOMMU
+ /*
+ * Only noiommu containers can use vfio-noiommu and noiommu
+ * containers can only use vfio-noiommu.
+ */
+ if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
+ continue;
+#endif
+
if (!try_module_get(driver->ops->owner))
continue;
@@ -1187,6 +1331,9 @@
if (atomic_read(&group->container_users))
return -EINVAL;
+ if (group->noiommu && !capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
f = fdget(container_fd);
if (!f.file)
return -EBADF;
@@ -1202,6 +1349,13 @@
down_write(&container->group_lock);
+ /* Real groups and fake groups cannot mix */
+ if (!list_empty(&container->group_list) &&
+ container->noiommu != group->noiommu) {
+ ret = -EPERM;
+ goto unlock_out;
+ }
+
driver = container->iommu_driver;
if (driver) {
ret = driver->ops->attach_group(container->iommu_data,
@@ -1211,6 +1365,7 @@
}
group->container = container;
+ container->noiommu = group->noiommu;
list_add(&group->container_next, &container->group_list);
/* Get a reference on the container and mark a user within the group */
@@ -1241,6 +1396,9 @@
!group->container->iommu_driver || !vfio_group_viable(group))
return -EINVAL;
+ if (group->noiommu && !capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
device = vfio_device_get_from_name(group, buf);
if (!device)
return -ENODEV;
@@ -1283,6 +1441,10 @@
fd_install(ret, filep);
+ if (group->noiommu)
+ dev_warn(device->dev, "vfio-noiommu device opened by user "
+ "(%s:%d)\n", current->comm, task_pid_nr(current));
+
return ret;
}
@@ -1371,6 +1533,11 @@
if (!group)
return -ENODEV;
+ if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
+ vfio_group_put(group);
+ return -EPERM;
+ }
+
/* Do we need multiple instances of the group open? Seems not. */
opened = atomic_cmpxchg(&group->opened, 0, 1);
if (opened) {
@@ -1533,6 +1700,11 @@
if (!atomic_inc_not_zero(&group->container_users))
return ERR_PTR(-EINVAL);
+ if (group->noiommu) {
+ atomic_dec(&group->container_users);
+ return ERR_PTR(-EPERM);
+ }
+
if (!group->container->iommu_driver ||
!vfio_group_viable(group)) {
atomic_dec(&group->container_users);
@@ -1625,6 +1797,9 @@
request_module_nowait("vfio_iommu_type1");
request_module_nowait("vfio_iommu_spapr_tce");
+#ifdef CONFIG_VFIO_NOIOMMU
+ vfio_register_iommu_driver(&vfio_noiommu_ops);
+#endif
return 0;
err_cdev_add:
@@ -1641,6 +1816,9 @@
{
WARN_ON(!list_empty(&vfio.group_list));
+#ifdef CONFIG_VFIO_NOIOMMU
+ vfio_unregister_iommu_driver(&vfio_noiommu_ops);
+#endif
idr_destroy(&vfio.group_idr);
cdev_del(&vfio.group_cdev);
unregister_chrdev_region(vfio.group_devt, MINORMASK);
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 59d47cb..6f1ea3d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -995,7 +995,7 @@
if (info.argsz < minsz)
return -EINVAL;
- info.flags = 0;
+ info.flags = VFIO_IOMMU_INFO_PGSIZES;
info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index ddb4409..610a86a 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -44,6 +44,9 @@
void (*request)(void *device_data, unsigned int count);
};
+extern struct iommu_group *vfio_iommu_group_get(struct device *dev);
+extern void vfio_iommu_group_put(struct iommu_group *group, struct device *dev);
+
extern int vfio_add_group_dev(struct device *dev,
const struct vfio_device_ops *ops,
void *device_data);
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 9fd7b5d..7d7a4c6 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -39,6 +39,13 @@
#define VFIO_SPAPR_TCE_v2_IOMMU 7
/*
+ * The No-IOMMU IOMMU offers no translation or isolation for devices and
+ * supports no ioctls outside of VFIO_CHECK_EXTENSION. Use of VFIO's No-IOMMU
+ * code will taint the host kernel and should be used with extreme caution.
+ */
+#define VFIO_NOIOMMU_IOMMU 8
+
+/*
* The IOCTL interface is designed for extensibility by embedding the
* structure length (argsz) and flags into structures passed between
* kernel and userspace. We therefore use the _IO() macro for these
@@ -568,8 +575,10 @@
__u32 flags;
/* in */
__u32 page_shift;
+ __u32 __resv1;
__u64 window_size;
__u32 levels;
+ __u32 __resv2;
/* out */
__u64 start_addr;
};