blob: 1651c0769b728905ba759a87f0d1d3efcc2adcf7 [file] [log] [blame]
Alex Williamson89e1f7d2012-07-31 08:16:24 -06001/*
2 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
3 * Author: Alex Williamson <alex.williamson@redhat.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation.
8 *
9 * Derived from original vfio:
10 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
11 * Author: Tom Lyon, pugs@cisco.com
12 */
13
14#include <linux/device.h>
15#include <linux/eventfd.h>
Alex Williamson8b27ee62013-09-04 11:28:04 -060016#include <linux/file.h>
Alex Williamson89e1f7d2012-07-31 08:16:24 -060017#include <linux/interrupt.h>
18#include <linux/iommu.h>
19#include <linux/module.h>
20#include <linux/mutex.h>
21#include <linux/notifier.h>
22#include <linux/pci.h>
23#include <linux/pm_runtime.h>
24#include <linux/slab.h>
25#include <linux/types.h>
26#include <linux/uaccess.h>
27#include <linux/vfio.h>
28
29#include "vfio_pci_private.h"
30
#define DRIVER_VERSION "0.2"
#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC "VFIO PCI - User Level meta-driver"

/*
 * Module option: disable PCI 2.3 COMMAND-register INTx masking for all
 * devices (writable at runtime; some devices advertise but break it).
 */
static bool nointxmask;
module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(nointxmask,
		  "Disable support for PCI 2.3 style INTx masking. If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");
39
/*
 * Serializes open/release/remove across all vfio-pci devices so the
 * bus/slot reset scan never races a device being opened or unbound.
 */
static DEFINE_MUTEX(driver_lock);

static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
43
/*
 * Bring the physical device up for user access: reset to a clean state,
 * snapshot config space for restore at release, and cache the MSI-X
 * table geometry needed to fence it off from mmap.
 * Returns 0 on success or a negative errno.
 */
static int vfio_pci_enable(struct vfio_pci_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	int ret;
	u16 cmd;
	u8 msix_pos;

	/* Don't allow our initial saved state to include busmaster */
	pci_clear_master(pdev);

	ret = pci_enable_device(pdev);
	if (ret)
		return ret;

	/* Remember whether a function-local reset works; used at release */
	vdev->reset_works = (pci_reset_function(pdev) == 0);
	pci_save_state(pdev);
	vdev->pci_saved_state = pci_store_saved_state(pdev);
	if (!vdev->pci_saved_state)
		pr_debug("%s: Couldn't store %s saved state\n",
			 __func__, dev_name(&pdev->dev));

	ret = vfio_config_init(vdev);
	if (ret) {
		kfree(vdev->pci_saved_state);
		vdev->pci_saved_state = NULL;
		pci_disable_device(pdev);
		return ret;
	}

	if (likely(!nointxmask))
		vdev->pci_2_3 = pci_intx_mask_supported(pdev);

	/* If INTx masking is supported, make sure INTx isn't left masked */
	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
	if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
		cmd &= ~PCI_COMMAND_INTX_DISABLE;
		pci_write_config_word(pdev, PCI_COMMAND, cmd);
	}

	/* Cache MSI-X table BAR/offset/size so mmap can exclude the table */
	msix_pos = pdev->msix_cap;
	if (msix_pos) {
		u16 flags;
		u32 table;

		pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
		pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);

		vdev->msix_bar = table & PCI_MSIX_TABLE_BIR;
		vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET;
		vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
	} else
		vdev->msix_bar = 0xFF;	/* sentinel: no MSI-X BAR */

#ifdef CONFIG_VFIO_PCI_VGA
	if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
		vdev->has_vga = true;
#endif

	return 0;
}
103
/*
 * Undo vfio_pci_enable(): quiesce DMA and interrupts, free user config
 * state, release BAR mappings, then restore/reset the device so the
 * host gets it back in a sane state.
 */
static void vfio_pci_disable(struct vfio_pci_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	int bar;

	/* Stop the device from further DMA */
	pci_clear_master(pdev);

	/* Tear down whatever interrupt mode the user left configured */
	vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
				VFIO_IRQ_SET_ACTION_TRIGGER,
				vdev->irq_type, 0, 0, NULL);

	vdev->virq_disabled = false;

	vfio_config_free(vdev);

	/* Unmap and release any BARs claimed via mmap/rw */
	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
		if (!vdev->barmap[bar])
			continue;
		pci_iounmap(pdev, vdev->barmap[bar]);
		pci_release_selected_regions(pdev, 1 << bar);
		vdev->barmap[bar] = NULL;
	}

	/* Assume the worst until a successful reset below proves otherwise */
	vdev->needs_reset = true;

	/*
	 * If we have saved state, restore it.  If we can reset the device,
	 * even better.  Resetting with current state seems better than
	 * nothing, but saving and restoring current state without reset
	 * is just busy work.
	 */
	if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
		pr_info("%s: Couldn't reload %s saved state\n",
			__func__, dev_name(&pdev->dev));

		if (!vdev->reset_works)
			goto out;

		pci_save_state(pdev);
	}

	/*
	 * Disable INTx and MSI, presumably to avoid spurious interrupts
	 * during reset.  Stolen from pci_reset_function()
	 */
	pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);

	/*
	 * Try to reset the device.  The success of this is dependent on
	 * being able to lock the device, which is not always possible.
	 */
	if (vdev->reset_works) {
		int ret = pci_try_reset_function(pdev);
		if (ret)
			pr_warn("%s: Failed to reset device %s (%d)\n",
				__func__, dev_name(&pdev->dev), ret);
		else
			vdev->needs_reset = false;
	}

	pci_restore_state(pdev);
out:
	pci_disable_device(pdev);

	/* May perform a bus/slot reset if this or peers still need one */
	vfio_pci_try_bus_reset(vdev);
}
171
172static void vfio_pci_release(void *device_data)
173{
174 struct vfio_pci_device *vdev = device_data;
175
Alex Williamson61d79252014-08-07 11:12:04 -0600176 mutex_lock(&driver_lock);
177
178 if (!(--vdev->refcnt)) {
Gavin Shan1b69be52014-06-10 11:41:57 +1000179 vfio_spapr_pci_eeh_release(vdev->pdev);
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600180 vfio_pci_disable(vdev);
Gavin Shan1b69be52014-06-10 11:41:57 +1000181 }
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600182
Alex Williamson61d79252014-08-07 11:12:04 -0600183 mutex_unlock(&driver_lock);
184
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600185 module_put(THIS_MODULE);
186}
187
188static int vfio_pci_open(void *device_data)
189{
190 struct vfio_pci_device *vdev = device_data;
Alex Williamson61d79252014-08-07 11:12:04 -0600191 int ret = 0;
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600192
193 if (!try_module_get(THIS_MODULE))
194 return -ENODEV;
195
Alex Williamson61d79252014-08-07 11:12:04 -0600196 mutex_lock(&driver_lock);
197
198 if (!vdev->refcnt) {
Gavin Shan1b69be52014-06-10 11:41:57 +1000199 ret = vfio_pci_enable(vdev);
200 if (ret)
201 goto error;
202
203 ret = vfio_spapr_pci_eeh_open(vdev->pdev);
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600204 if (ret) {
Gavin Shan1b69be52014-06-10 11:41:57 +1000205 vfio_pci_disable(vdev);
206 goto error;
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600207 }
208 }
Alex Williamson61d79252014-08-07 11:12:04 -0600209 vdev->refcnt++;
Gavin Shan1b69be52014-06-10 11:41:57 +1000210error:
Alex Williamson61d79252014-08-07 11:12:04 -0600211 mutex_unlock(&driver_lock);
212 if (ret)
213 module_put(THIS_MODULE);
Gavin Shan1b69be52014-06-10 11:41:57 +1000214 return ret;
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600215}
216
217static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
218{
219 if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
220 u8 pin;
221 pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
222 if (pin)
223 return 1;
224
225 } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
226 u8 pos;
227 u16 flags;
228
Bjorn Helgaasa9047f22013-04-18 15:12:58 -0600229 pos = vdev->pdev->msi_cap;
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600230 if (pos) {
231 pci_read_config_word(vdev->pdev,
232 pos + PCI_MSI_FLAGS, &flags);
Gavin Shanfd49c812014-05-30 11:35:54 -0600233 return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600234 }
235 } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
236 u8 pos;
237 u16 flags;
238
Bjorn Helgaasa9047f22013-04-18 15:12:58 -0600239 pos = vdev->pdev->msix_cap;
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600240 if (pos) {
241 pci_read_config_word(vdev->pdev,
242 pos + PCI_MSIX_FLAGS, &flags);
243
244 return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
245 }
Vijay Mohan Pandarathildad9f892013-03-11 09:31:22 -0600246 } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX)
247 if (pci_is_pcie(vdev->pdev))
248 return 1;
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600249
250 return 0;
251}
252
/* pci_walk_bus() callback: tally every device visited into *data. */
static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
{
	int *count = data;

	(*count)++;
	return 0;
}
258
/* State for vfio_pci_fill_devs(): a caller-sized array being populated */
struct vfio_pci_fill_info {
	int max;	/* capacity of @devices */
	int cur;	/* next free slot */
	struct vfio_pci_dependent_device *devices;
};
264
/*
 * pci_walk_bus() callback: record IOMMU group id and PCI address of
 * each affected device into the fill array.  Returns -EAGAIN if the
 * device count grew since the array was sized, -EPERM for devices
 * without an IOMMU group.
 */
static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_fill_info *fill = data;
	struct iommu_group *iommu_group;

	if (fill->cur == fill->max)
		return -EAGAIN; /* Something changed, try again */

	iommu_group = iommu_group_get(&pdev->dev);
	if (!iommu_group)
		return -EPERM; /* Cannot reset non-isolated devices */

	fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
	fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
	fill->devices[fill->cur].bus = pdev->bus->number;
	fill->devices[fill->cur].devfn = pdev->devfn;
	fill->cur++;
	iommu_group_put(iommu_group);
	return 0;
}
285
/* One user-supplied group (held across reset) and its IOMMU id */
struct vfio_pci_group_entry {
	struct vfio_group *group;
	int id;
};

/* The set of groups the user claims covers all affected devices */
struct vfio_pci_group_info {
	int count;
	struct vfio_pci_group_entry *groups;
};
295
296static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data)
297{
298 struct vfio_pci_group_info *info = data;
299 struct iommu_group *group;
300 int id, i;
301
302 group = iommu_group_get(&pdev->dev);
303 if (!group)
304 return -EPERM;
305
306 id = iommu_group_id(group);
307
308 for (i = 0; i < info->count; i++)
309 if (info->groups[i].id == id)
310 break;
311
312 iommu_group_put(group);
313
314 return (i == info->count) ? -EINVAL : 0;
315}
316
317static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
318{
319 for (; pdev; pdev = pdev->bus->self)
320 if (pdev->bus == slot->bus)
321 return (pdev->slot == slot);
322 return false;
323}
324
/* Closure threaded through pci_walk_bus() by vfio_pci_for_each_slot_or_bus() */
struct vfio_pci_walk_info {
	int (*fn)(struct pci_dev *, void *data);	/* per-device callback */
	void *data;
	struct pci_dev *pdev;	/* device anchoring the slot/bus */
	bool slot;		/* restrict the walk to pdev's slot */
	int ret;		/* callback result; non-zero aborts the walk */
};
332
333static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
334{
335 struct vfio_pci_walk_info *walk = data;
336
337 if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot))
338 walk->ret = walk->fn(pdev, walk->data);
339
340 return walk->ret;
341}
342
343static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
344 int (*fn)(struct pci_dev *,
345 void *data), void *data,
346 bool slot)
347{
348 struct vfio_pci_walk_info walk = {
349 .fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,
350 };
351
352 pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);
353
354 return walk.ret;
355}
356
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600357static long vfio_pci_ioctl(void *device_data,
358 unsigned int cmd, unsigned long arg)
359{
360 struct vfio_pci_device *vdev = device_data;
361 unsigned long minsz;
362
363 if (cmd == VFIO_DEVICE_GET_INFO) {
364 struct vfio_device_info info;
365
366 minsz = offsetofend(struct vfio_device_info, num_irqs);
367
368 if (copy_from_user(&info, (void __user *)arg, minsz))
369 return -EFAULT;
370
371 if (info.argsz < minsz)
372 return -EINVAL;
373
374 info.flags = VFIO_DEVICE_FLAGS_PCI;
375
376 if (vdev->reset_works)
377 info.flags |= VFIO_DEVICE_FLAGS_RESET;
378
379 info.num_regions = VFIO_PCI_NUM_REGIONS;
380 info.num_irqs = VFIO_PCI_NUM_IRQS;
381
382 return copy_to_user((void __user *)arg, &info, minsz);
383
384 } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
385 struct pci_dev *pdev = vdev->pdev;
386 struct vfio_region_info info;
387
388 minsz = offsetofend(struct vfio_region_info, offset);
389
390 if (copy_from_user(&info, (void __user *)arg, minsz))
391 return -EFAULT;
392
393 if (info.argsz < minsz)
394 return -EINVAL;
395
396 switch (info.index) {
397 case VFIO_PCI_CONFIG_REGION_INDEX:
398 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
399 info.size = pdev->cfg_size;
400 info.flags = VFIO_REGION_INFO_FLAG_READ |
401 VFIO_REGION_INFO_FLAG_WRITE;
402 break;
403 case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
404 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
405 info.size = pci_resource_len(pdev, info.index);
406 if (!info.size) {
407 info.flags = 0;
408 break;
409 }
410
411 info.flags = VFIO_REGION_INFO_FLAG_READ |
412 VFIO_REGION_INFO_FLAG_WRITE;
413 if (pci_resource_flags(pdev, info.index) &
414 IORESOURCE_MEM && info.size >= PAGE_SIZE)
415 info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
416 break;
417 case VFIO_PCI_ROM_REGION_INDEX:
418 {
419 void __iomem *io;
420 size_t size;
421
422 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
423 info.flags = 0;
424
425 /* Report the BAR size, not the ROM size */
426 info.size = pci_resource_len(pdev, info.index);
427 if (!info.size)
428 break;
429
430 /* Is it really there? */
431 io = pci_map_rom(pdev, &size);
432 if (!io || !size) {
433 info.size = 0;
434 break;
435 }
436 pci_unmap_rom(pdev, io);
437
438 info.flags = VFIO_REGION_INFO_FLAG_READ;
439 break;
440 }
Alex Williamson84237a82013-02-18 10:11:13 -0700441 case VFIO_PCI_VGA_REGION_INDEX:
442 if (!vdev->has_vga)
443 return -EINVAL;
444
445 info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
446 info.size = 0xc0000;
447 info.flags = VFIO_REGION_INFO_FLAG_READ |
448 VFIO_REGION_INFO_FLAG_WRITE;
449
450 break;
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600451 default:
452 return -EINVAL;
453 }
454
455 return copy_to_user((void __user *)arg, &info, minsz);
456
457 } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
458 struct vfio_irq_info info;
459
460 minsz = offsetofend(struct vfio_irq_info, count);
461
462 if (copy_from_user(&info, (void __user *)arg, minsz))
463 return -EFAULT;
464
465 if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
466 return -EINVAL;
467
Vijay Mohan Pandarathildad9f892013-03-11 09:31:22 -0600468 switch (info.index) {
469 case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
470 break;
471 case VFIO_PCI_ERR_IRQ_INDEX:
472 if (pci_is_pcie(vdev->pdev))
473 break;
474 /* pass thru to return error */
475 default:
476 return -EINVAL;
477 }
478
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600479 info.flags = VFIO_IRQ_INFO_EVENTFD;
480
481 info.count = vfio_pci_get_irq_count(vdev, info.index);
482
483 if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
484 info.flags |= (VFIO_IRQ_INFO_MASKABLE |
485 VFIO_IRQ_INFO_AUTOMASKED);
486 else
487 info.flags |= VFIO_IRQ_INFO_NORESIZE;
488
489 return copy_to_user((void __user *)arg, &info, minsz);
490
491 } else if (cmd == VFIO_DEVICE_SET_IRQS) {
492 struct vfio_irq_set hdr;
493 u8 *data = NULL;
494 int ret = 0;
495
496 minsz = offsetofend(struct vfio_irq_set, count);
497
498 if (copy_from_user(&hdr, (void __user *)arg, minsz))
499 return -EFAULT;
500
501 if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
502 hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
503 VFIO_IRQ_SET_ACTION_TYPE_MASK))
504 return -EINVAL;
505
506 if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
507 size_t size;
Alex Williamson904c6802013-03-26 11:33:16 -0600508 int max = vfio_pci_get_irq_count(vdev, hdr.index);
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600509
510 if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
511 size = sizeof(uint8_t);
512 else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
513 size = sizeof(int32_t);
514 else
515 return -EINVAL;
516
517 if (hdr.argsz - minsz < hdr.count * size ||
Alex Williamson904c6802013-03-26 11:33:16 -0600518 hdr.start >= max || hdr.start + hdr.count > max)
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600519 return -EINVAL;
520
Fengguang Wu3a1f7042012-12-07 13:43:49 -0700521 data = memdup_user((void __user *)(arg + minsz),
522 hdr.count * size);
523 if (IS_ERR(data))
524 return PTR_ERR(data);
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600525 }
526
527 mutex_lock(&vdev->igate);
528
529 ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
530 hdr.start, hdr.count, data);
531
532 mutex_unlock(&vdev->igate);
533 kfree(data);
534
535 return ret;
536
Alex Williamson8b27ee62013-09-04 11:28:04 -0600537 } else if (cmd == VFIO_DEVICE_RESET) {
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600538 return vdev->reset_works ?
Alex Williamson890ed572014-01-14 20:45:09 -0700539 pci_try_reset_function(vdev->pdev) : -EINVAL;
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600540
Alex Williamson8b27ee62013-09-04 11:28:04 -0600541 } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) {
542 struct vfio_pci_hot_reset_info hdr;
543 struct vfio_pci_fill_info fill = { 0 };
544 struct vfio_pci_dependent_device *devices = NULL;
545 bool slot = false;
546 int ret = 0;
547
548 minsz = offsetofend(struct vfio_pci_hot_reset_info, count);
549
550 if (copy_from_user(&hdr, (void __user *)arg, minsz))
551 return -EFAULT;
552
553 if (hdr.argsz < minsz)
554 return -EINVAL;
555
556 hdr.flags = 0;
557
558 /* Can we do a slot or bus reset or neither? */
559 if (!pci_probe_reset_slot(vdev->pdev->slot))
560 slot = true;
561 else if (pci_probe_reset_bus(vdev->pdev->bus))
562 return -ENODEV;
563
564 /* How many devices are affected? */
565 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
566 vfio_pci_count_devs,
567 &fill.max, slot);
568 if (ret)
569 return ret;
570
571 WARN_ON(!fill.max); /* Should always be at least one */
572
573 /*
574 * If there's enough space, fill it now, otherwise return
575 * -ENOSPC and the number of devices affected.
576 */
577 if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
578 ret = -ENOSPC;
579 hdr.count = fill.max;
580 goto reset_info_exit;
581 }
582
583 devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
584 if (!devices)
585 return -ENOMEM;
586
587 fill.devices = devices;
588
589 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
590 vfio_pci_fill_devs,
591 &fill, slot);
592
593 /*
594 * If a device was removed between counting and filling,
595 * we may come up short of fill.max. If a device was
596 * added, we'll have a return of -EAGAIN above.
597 */
598 if (!ret)
599 hdr.count = fill.cur;
600
601reset_info_exit:
602 if (copy_to_user((void __user *)arg, &hdr, minsz))
603 ret = -EFAULT;
604
605 if (!ret) {
606 if (copy_to_user((void __user *)(arg + minsz), devices,
607 hdr.count * sizeof(*devices)))
608 ret = -EFAULT;
609 }
610
611 kfree(devices);
612 return ret;
613
614 } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) {
615 struct vfio_pci_hot_reset hdr;
616 int32_t *group_fds;
617 struct vfio_pci_group_entry *groups;
618 struct vfio_pci_group_info info;
619 bool slot = false;
620 int i, count = 0, ret = 0;
621
622 minsz = offsetofend(struct vfio_pci_hot_reset, count);
623
624 if (copy_from_user(&hdr, (void __user *)arg, minsz))
625 return -EFAULT;
626
627 if (hdr.argsz < minsz || hdr.flags)
628 return -EINVAL;
629
630 /* Can we do a slot or bus reset or neither? */
631 if (!pci_probe_reset_slot(vdev->pdev->slot))
632 slot = true;
633 else if (pci_probe_reset_bus(vdev->pdev->bus))
634 return -ENODEV;
635
636 /*
637 * We can't let userspace give us an arbitrarily large
638 * buffer to copy, so verify how many we think there
639 * could be. Note groups can have multiple devices so
640 * one group per device is the max.
641 */
642 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
643 vfio_pci_count_devs,
644 &count, slot);
645 if (ret)
646 return ret;
647
648 /* Somewhere between 1 and count is OK */
649 if (!hdr.count || hdr.count > count)
650 return -EINVAL;
651
652 group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
653 groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL);
654 if (!group_fds || !groups) {
655 kfree(group_fds);
656 kfree(groups);
657 return -ENOMEM;
658 }
659
660 if (copy_from_user(group_fds, (void __user *)(arg + minsz),
661 hdr.count * sizeof(*group_fds))) {
662 kfree(group_fds);
663 kfree(groups);
664 return -EFAULT;
665 }
666
667 /*
668 * For each group_fd, get the group through the vfio external
669 * user interface and store the group and iommu ID. This
670 * ensures the group is held across the reset.
671 */
672 for (i = 0; i < hdr.count; i++) {
673 struct vfio_group *group;
674 struct fd f = fdget(group_fds[i]);
675 if (!f.file) {
676 ret = -EBADF;
677 break;
678 }
679
680 group = vfio_group_get_external_user(f.file);
681 fdput(f);
682 if (IS_ERR(group)) {
683 ret = PTR_ERR(group);
684 break;
685 }
686
687 groups[i].group = group;
688 groups[i].id = vfio_external_user_iommu_id(group);
689 }
690
691 kfree(group_fds);
692
693 /* release reference to groups on error */
694 if (ret)
695 goto hot_reset_release;
696
697 info.count = hdr.count;
698 info.groups = groups;
699
700 /*
701 * Test whether all the affected devices are contained
702 * by the set of groups provided by the user.
703 */
704 ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
705 vfio_pci_validate_devs,
706 &info, slot);
707 if (!ret)
708 /* User has access, do the reset */
Alex Williamson890ed572014-01-14 20:45:09 -0700709 ret = slot ? pci_try_reset_slot(vdev->pdev->slot) :
710 pci_try_reset_bus(vdev->pdev->bus);
Alex Williamson8b27ee62013-09-04 11:28:04 -0600711
712hot_reset_release:
713 for (i--; i >= 0; i--)
714 vfio_group_put_external_user(groups[i].group);
715
716 kfree(groups);
717 return ret;
718 }
719
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600720 return -ENOTTY;
721}
722
Alex Williamson5b279a12013-02-14 14:02:12 -0700723static ssize_t vfio_pci_rw(void *device_data, char __user *buf,
724 size_t count, loff_t *ppos, bool iswrite)
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600725{
726 unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
727 struct vfio_pci_device *vdev = device_data;
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600728
729 if (index >= VFIO_PCI_NUM_REGIONS)
730 return -EINVAL;
731
Alex Williamson5b279a12013-02-14 14:02:12 -0700732 switch (index) {
733 case VFIO_PCI_CONFIG_REGION_INDEX:
Alex Williamson906ee992013-02-14 14:02:12 -0700734 return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
735
Alex Williamson5b279a12013-02-14 14:02:12 -0700736 case VFIO_PCI_ROM_REGION_INDEX:
737 if (iswrite)
738 return -EINVAL;
Alex Williamson906ee992013-02-14 14:02:12 -0700739 return vfio_pci_bar_rw(vdev, buf, count, ppos, false);
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600740
Alex Williamson5b279a12013-02-14 14:02:12 -0700741 case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
Alex Williamson906ee992013-02-14 14:02:12 -0700742 return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);
Alex Williamson84237a82013-02-18 10:11:13 -0700743
744 case VFIO_PCI_VGA_REGION_INDEX:
745 return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
Alex Williamson5b279a12013-02-14 14:02:12 -0700746 }
747
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600748 return -EINVAL;
749}
750
Alex Williamson5b279a12013-02-14 14:02:12 -0700751static ssize_t vfio_pci_read(void *device_data, char __user *buf,
752 size_t count, loff_t *ppos)
753{
Alex Williamson906ee992013-02-14 14:02:12 -0700754 if (!count)
755 return 0;
756
Alex Williamson5b279a12013-02-14 14:02:12 -0700757 return vfio_pci_rw(device_data, buf, count, ppos, false);
758}
759
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600760static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
761 size_t count, loff_t *ppos)
762{
Alex Williamson906ee992013-02-14 14:02:12 -0700763 if (!count)
764 return 0;
765
766 return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true);
Alex Williamson89e1f7d2012-07-31 08:16:24 -0600767}
768
/*
 * mmap a BAR into the caller's address space.  Only MEM BARs of at
 * least PAGE_SIZE may be mapped, and any range overlapping the MSI-X
 * table is rejected.  The region index is carried in the upper bits of
 * the mmap offset (see VFIO_PCI_OFFSET_SHIFT).
 */
static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
{
	struct vfio_pci_device *vdev = device_data;
	struct pci_dev *pdev = vdev->pdev;
	unsigned int index;
	u64 phys_len, req_len, pgoff, req_start;
	int ret;

	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

	if (vma->vm_end < vma->vm_start)
		return -EINVAL;
	if ((vma->vm_flags & VM_SHARED) == 0)
		return -EINVAL;
	if (index >= VFIO_PCI_ROM_REGION_INDEX)
		return -EINVAL;
	if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
		return -EINVAL;

	phys_len = pci_resource_len(pdev, index);
	req_len = vma->vm_end - vma->vm_start;
	/* Offset within the BAR, with the region index bits masked off */
	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
	req_start = pgoff << PAGE_SHIFT;

	if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
		return -EINVAL;

	if (index == vdev->msix_bar) {
		/*
		 * Disallow mmaps overlapping the MSI-X table; users don't
		 * get to touch this directly.  We could find somewhere
		 * else to map the overlap, but page granularity is only
		 * a recommendation, not a requirement, so the user needs
		 * to know which bits are real.  Requiring them to mmap
		 * around the table makes that clear.
		 */

		/* If neither entirely above nor below, then it overlaps */
		if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
		      req_start + req_len <= vdev->msix_offset))
			return -EINVAL;
	}

	/*
	 * Even though we don't make use of the barmap for the mmap,
	 * we need to request the region and the barmap tracks that.
	 */
	if (!vdev->barmap[index]) {
		ret = pci_request_selected_regions(pdev,
						   1 << index, "vfio-pci");
		if (ret)
			return ret;

		vdev->barmap[index] = pci_iomap(pdev, index, 0);
	}

	vma->vm_private_data = vdev;
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;

	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
			       req_len, vma->vm_page_prot);
}
833
/* Device file operations handed to vfio-core for each bound device */
static const struct vfio_device_ops vfio_pci_ops = {
	.name		= "vfio-pci",
	.open		= vfio_pci_open,
	.release	= vfio_pci_release,
	.ioctl		= vfio_pci_ioctl,
	.read		= vfio_pci_read,
	.write		= vfio_pci_write,
	.mmap		= vfio_pci_mmap,
};
843
/*
 * Bind a PCI function to vfio-pci.  Only type 0 (normal) header
 * devices that belong to an IOMMU group are eligible.  The group
 * reference taken here is dropped in vfio_pci_remove() or on error.
 */
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	u8 type;
	struct vfio_pci_device *vdev;
	struct iommu_group *group;
	int ret;

	pci_read_config_byte(pdev, PCI_HEADER_TYPE, &type);
	if ((type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL)
		return -EINVAL;

	group = iommu_group_get(&pdev->dev);
	if (!group)
		return -EINVAL;

	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
	if (!vdev) {
		iommu_group_put(group);
		return -ENOMEM;
	}

	vdev->pdev = pdev;
	vdev->irq_type = VFIO_PCI_NUM_IRQS;	/* no IRQ mode configured yet */
	mutex_init(&vdev->igate);
	spin_lock_init(&vdev->irqlock);

	ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
	if (ret) {
		iommu_group_put(group);
		kfree(vdev);
	}

	return ret;
}
878
/*
 * Unbind callback.  Removal runs under driver_lock so the bus-reset
 * scan never observes a device mid-teardown.
 */
static void vfio_pci_remove(struct pci_dev *pdev)
{
	struct vfio_pci_device *vdev;

	mutex_lock(&driver_lock);

	vdev = vfio_del_group_dev(&pdev->dev);
	if (vdev) {
		/* Drop the group reference taken in vfio_pci_probe() */
		iommu_group_put(pdev->dev.iommu_group);
		kfree(vdev);
	}

	mutex_unlock(&driver_lock);
}
893
/*
 * AER error_detected handler: signal the user's err_trigger eventfd if
 * one is registered.  Always reports CAN_RECOVER — recovery policy is
 * left to userspace.
 */
static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
						  pci_channel_state_t state)
{
	struct vfio_pci_device *vdev;
	struct vfio_device *device;

	device = vfio_device_get_from_dev(&pdev->dev);
	if (device == NULL)
		return PCI_ERS_RESULT_DISCONNECT;

	vdev = vfio_device_data(device);
	if (vdev == NULL) {
		vfio_device_put(device);
		return PCI_ERS_RESULT_DISCONNECT;
	}

	/* igate guards err_trigger (also taken in the SET_IRQS path) */
	mutex_lock(&vdev->igate);

	if (vdev->err_trigger)
		eventfd_signal(vdev->err_trigger, 1);

	mutex_unlock(&vdev->igate);

	vfio_device_put(device);

	return PCI_ERS_RESULT_CAN_RECOVER;
}
921
/* PCI error handlers: forward AER events to userspace via err_trigger */
static struct pci_error_handlers vfio_err_handlers = {
	.error_detected = vfio_pci_aer_err_detected,
};
925
/* PCI driver registration; matches no static IDs (id_table is NULL) */
static struct pci_driver vfio_pci_driver = {
	.name		= "vfio-pci",
	.id_table	= NULL, /* only dynamic ids */
	.probe		= vfio_pci_probe,
	.remove		= vfio_pci_remove,
	.err_handler	= &vfio_err_handlers,
};
933
/*
 * Test whether a reset is necessary and possible.  We mark devices as
 * needs_reset when they are released, but don't have a function-local reset
 * available.  If any of these exist in the affected devices, we want to do
 * a bus/slot reset.  We also need all of the affected devices to be unused,
 * so we abort if any device has a non-zero refcnt.  driver_lock prevents a
 * device from being opened during the scan or unbound from vfio-pci.
 */
static int vfio_pci_test_bus_reset(struct pci_dev *pdev, void *data)
{
	bool *needs_reset = data;
	struct pci_driver *pci_drv = ACCESS_ONCE(pdev->driver);
	int ret = -EBUSY;	/* any non-vfio-pci device vetoes the reset */

	if (pci_drv == &vfio_pci_driver) {
		struct vfio_device *device;
		struct vfio_pci_device *vdev;

		device = vfio_device_get_from_dev(&pdev->dev);
		if (!device)
			return ret;

		vdev = vfio_device_data(device);
		if (vdev) {
			if (vdev->needs_reset)
				*needs_reset = true;

			/* Only idle (unopened) devices permit a reset */
			if (!vdev->refcnt)
				ret = 0;
		}

		vfio_device_put(device);
	}

	/*
	 * TODO: vfio-core considers groups to be viable even if some devices
	 * are attached to known drivers, like pci-stub or pcieport.  We can't
	 * freeze devices from being unbound to those drivers like we can
	 * here though, so it would be racy to test for them.  We also can't
	 * use device_lock() to prevent changes as that would interfere with
	 * PCI-core taking device_lock during bus reset.  For now, we require
	 * devices to be bound to vfio-pci to get a bus/slot reset on release.
	 */

	return ret;
}
980
981/* Clear needs_reset on all affected devices after successful bus/slot reset */
982static int vfio_pci_clear_needs_reset(struct pci_dev *pdev, void *data)
983{
984 struct pci_driver *pci_drv = ACCESS_ONCE(pdev->driver);
985
986 if (pci_drv == &vfio_pci_driver) {
987 struct vfio_device *device;
988 struct vfio_pci_device *vdev;
989
990 device = vfio_device_get_from_dev(&pdev->dev);
991 if (!device)
992 return 0;
993
994 vdev = vfio_device_data(device);
995 if (vdev)
996 vdev->needs_reset = false;
997
998 vfio_device_put(device);
999 }
1000
1001 return 0;
1002}
1003
/*
 * Attempt to do a bus/slot reset if there are devices affected by a reset for
 * this device that are needs_reset and all of the affected devices are unused
 * (!refcnt).  Callers of this function are required to hold driver_lock such
 * that devices can not be unbound from vfio-pci or opened by a user while we
 * test for and perform a bus/slot reset.
 */
static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
{
	bool needs_reset = false, slot = false;
	int ret;

	/* Slot reset is probed first; fall back to bus reset */
	if (!pci_probe_reset_slot(vdev->pdev->slot))
		slot = true;
	else if (pci_probe_reset_bus(vdev->pdev->bus))
		return;

	/* Bail if any affected device is in use or none needs a reset */
	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
					  vfio_pci_test_bus_reset,
					  &needs_reset, slot) || !needs_reset)
		return;

	if (slot)
		ret = pci_try_reset_slot(vdev->pdev->slot);
	else
		ret = pci_try_reset_bus(vdev->pdev->bus);

	if (ret)
		return;

	vfio_pci_for_each_slot_or_bus(vdev->pdev,
				      vfio_pci_clear_needs_reset, NULL, slot);
}
1037
/* Module exit: tear down in reverse order of vfio_pci_init() */
static void __exit vfio_pci_cleanup(void)
{
	pci_unregister_driver(&vfio_pci_driver);
	vfio_pci_virqfd_exit();
	vfio_pci_uninit_perm_bits();
}
1044
/*
 * Module init: set up shared state before registering the PCI driver,
 * unwinding in reverse on failure.
 */
static int __init vfio_pci_init(void)
{
	int ret;

	/* Allocate shared config space permission data used by all devices */
	ret = vfio_pci_init_perm_bits();
	if (ret)
		return ret;

	/* Start the virqfd cleanup handler */
	ret = vfio_pci_virqfd_init();
	if (ret)
		goto out_virqfd;

	/* Register and scan for devices */
	ret = pci_register_driver(&vfio_pci_driver);
	if (ret)
		goto out_driver;

	return 0;

out_driver:
	vfio_pci_virqfd_exit();
out_virqfd:
	vfio_pci_uninit_perm_bits();
	return ret;
}
1072
/* Module entry/exit points and identification */
module_init(vfio_pci_init);
module_exit(vfio_pci_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
1079MODULE_DESCRIPTION(DRIVER_DESC);