PCI: Spread interrupt vectors in pci_alloc_irq_vectors()
Set the affinity_mask in the PCI device before allocating vectors so that
the affinity can be propagated through the MSI descriptor structures to the
core IRQ code. To facilitate this, new __pci_enable_msi_range() and
__pci_enable_msix_range() helpers are factored out of their non-prefixed
variants. The helpers assign the new IRQ affinity mask in the PCI device so
that the low-level interrupt code can perform the interrupt affinity
assignment and do node-local allocations.
A new PCI_IRQ_NOAFFINITY flag is added to pci_alloc_irq_vectors() so that
this function can also be used by drivers that don't wish to use the
automatic affinity assignment.
[bhelgaas: omit "else" after "return" consistently]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Alexander Gordeev <agordeev@redhat.com>
diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt
index 0ac612b..c55df29 100644
--- a/Documentation/PCI/MSI-HOWTO.txt
+++ b/Documentation/PCI/MSI-HOWTO.txt
@@ -99,6 +99,10 @@
MSI or MSI-X, but the support is broken, or to pass PCI_IRQ_NOLEGACY in
case the device does not support legacy interrupt lines.
+By default this function will spread the interrupts around the available
+CPUs, but this feature can be disabled by passing the PCI_IRQ_NOAFFINITY
+flag.
+
To get the Linux IRQ numbers passed to request_irq() and free_irq() and the
vectors, use the following function:
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 5e5ab47..a02981e 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -569,6 +569,7 @@
entry->msi_attrib.multi_cap = (control & PCI_MSI_FLAGS_QMASK) >> 1;
entry->msi_attrib.multiple = ilog2(__roundup_pow_of_two(nvec));
entry->nvec_used = nvec;
+ entry->affinity = dev->irq_affinity;
if (control & PCI_MSI_FLAGS_64BIT)
entry->mask_pos = dev->msi_cap + PCI_MSI_MASK_64;
@@ -680,10 +681,18 @@
static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
struct msix_entry *entries, int nvec)
{
+ const struct cpumask *mask = NULL;
struct msi_desc *entry;
- int i;
+ int cpu = -1, i;
for (i = 0; i < nvec; i++) {
+ if (dev->irq_affinity) {
+ cpu = cpumask_next(cpu, dev->irq_affinity);
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_first(dev->irq_affinity);
+ mask = cpumask_of(cpu);
+ }
+
entry = alloc_msi_entry(&dev->dev);
if (!entry) {
if (!i)
@@ -703,6 +712,7 @@
entry->msi_attrib.default_irq = dev->irq;
entry->mask_base = base;
entry->nvec_used = 1;
+ entry->affinity = mask;
list_add_tail(&entry->list, dev_to_msi_list(&dev->dev));
}
@@ -1028,19 +1038,8 @@
}
EXPORT_SYMBOL(pci_msi_enabled);
-/**
- * pci_enable_msi_range - configure device's MSI capability structure
- * @dev: device to configure
- * @minvec: minimal number of interrupts to configure
- * @maxvec: maximum number of interrupts to configure
- *
- * This function tries to allocate a maximum possible number of interrupts in a
- * range between @minvec and @maxvec. It returns a negative errno if an error
- * occurs. If it succeeds, it returns the actual number of interrupts allocated
- * and updates the @dev's irq member to the lowest new interrupt number;
- * the other interrupt numbers allocated to this device are consecutive.
- **/
-int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec)
+static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
+ unsigned int flags)
{
int nvec;
int rc;
@@ -1063,26 +1062,86 @@
nvec = pci_msi_vec_count(dev);
if (nvec < 0)
return nvec;
- else if (nvec < minvec)
+ if (nvec < minvec)
return -EINVAL;
- else if (nvec > maxvec)
+
+ if (nvec > maxvec)
nvec = maxvec;
- do {
- rc = msi_capability_init(dev, nvec);
- if (rc < 0) {
- return rc;
- } else if (rc > 0) {
- if (rc < minvec)
+ for (;;) {
+ if (!(flags & PCI_IRQ_NOAFFINITY)) {
+ dev->irq_affinity = irq_create_affinity_mask(&nvec);
+ if (nvec < minvec)
return -ENOSPC;
- nvec = rc;
}
- } while (rc);
- return nvec;
+ rc = msi_capability_init(dev, nvec);
+ if (rc == 0)
+ return nvec;
+
+ kfree(dev->irq_affinity);
+ dev->irq_affinity = NULL;
+
+ if (rc < 0)
+ return rc;
+ if (rc < minvec)
+ return -ENOSPC;
+
+ nvec = rc;
+ }
+}
+
+/**
+ * pci_enable_msi_range - configure device's MSI capability structure
+ * @dev: device to configure
+ * @minvec: minimal number of interrupts to configure
+ * @maxvec: maximum number of interrupts to configure
+ *
+ * This function tries to allocate a maximum possible number of interrupts in a
+ * range between @minvec and @maxvec. It returns a negative errno if an error
+ * occurs. If it succeeds, it returns the actual number of interrupts allocated
+ * and updates the @dev's irq member to the lowest new interrupt number;
+ * the other interrupt numbers allocated to this device are consecutive.
+ **/
+int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec)
+{
+ return __pci_enable_msi_range(dev, minvec, maxvec, PCI_IRQ_NOAFFINITY);
}
EXPORT_SYMBOL(pci_enable_msi_range);
+static int __pci_enable_msix_range(struct pci_dev *dev,
+ struct msix_entry *entries, int minvec, int maxvec,
+ unsigned int flags)
+{
+ int nvec = maxvec;
+ int rc;
+
+ if (maxvec < minvec)
+ return -ERANGE;
+
+ for (;;) {
+ if (!(flags & PCI_IRQ_NOAFFINITY)) {
+ dev->irq_affinity = irq_create_affinity_mask(&nvec);
+ if (nvec < minvec)
+ return -ENOSPC;
+ }
+
+ rc = pci_enable_msix(dev, entries, nvec);
+ if (rc == 0)
+ return nvec;
+
+ kfree(dev->irq_affinity);
+ dev->irq_affinity = NULL;
+
+ if (rc < 0)
+ return rc;
+ if (rc < minvec)
+ return -ENOSPC;
+
+ nvec = rc;
+ }
+}
+
/**
* pci_enable_msix_range - configure device's MSI-X capability structure
* @dev: pointer to the pci_dev data structure of MSI-X device function
@@ -1099,26 +1158,10 @@
* with new allocated MSI-X interrupts.
**/
int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries,
- int minvec, int maxvec)
+ int minvec, int maxvec)
{
- int nvec = maxvec;
- int rc;
-
- if (maxvec < minvec)
- return -ERANGE;
-
- do {
- rc = pci_enable_msix(dev, entries, nvec);
- if (rc < 0) {
- return rc;
- } else if (rc > 0) {
- if (rc < minvec)
- return -ENOSPC;
- nvec = rc;
- }
- } while (rc);
-
- return nvec;
+ return __pci_enable_msix_range(dev, entries, minvec, maxvec,
+ PCI_IRQ_NOAFFINITY);
}
EXPORT_SYMBOL(pci_enable_msix_range);
@@ -1145,13 +1188,14 @@
int vecs = -ENOSPC;
if (!(flags & PCI_IRQ_NOMSIX)) {
- vecs = pci_enable_msix_range(dev, NULL, min_vecs, max_vecs);
+ vecs = __pci_enable_msix_range(dev, NULL, min_vecs, max_vecs,
+ flags);
if (vecs > 0)
return vecs;
}
if (!(flags & PCI_IRQ_NOMSI)) {
- vecs = pci_enable_msi_range(dev, min_vecs, max_vecs);
+ vecs = __pci_enable_msi_range(dev, min_vecs, max_vecs, flags);
if (vecs > 0)
return vecs;
}
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 52ecd49..f140661 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -320,6 +320,7 @@
* directly, use the values stored here. They might be different!
*/
unsigned int irq;
+ struct cpumask *irq_affinity;
struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */
bool match_driver; /* Skip attaching driver */
@@ -1240,6 +1241,7 @@
#define PCI_IRQ_NOLEGACY (1 << 0) /* don't use legacy interrupts */
#define PCI_IRQ_NOMSI (1 << 1) /* don't use MSI interrupts */
#define PCI_IRQ_NOMSIX (1 << 2) /* don't use MSI-X interrupts */
+#define PCI_IRQ_NOAFFINITY (1 << 3) /* don't auto-assign affinity */
/* kmem_cache style wrapper around pci_alloc_consistent() */