powerpc/pseries: Implement a quota system for MSIs
There are hardware limitations on the number of available MSIs,
which firmware expresses using a property named "ibm,pe-total-#msi".
This property tells us how many MSIs are available for devices below
the point in the PCI tree where we find the property.
For old firmwares which don't have the property, we assume there are
8 MSIs available per "partitionable endpoint" (PE). The PE can be
found using existing EEH code, which uses the methods described in
PAPR. For our purposes we want the parent of the node that's
identified using this method.
When a driver requests n MSIs for a device, we first establish where
the "ibm,pe-total-#msi" property above that device is, or we find the
PE if the property is not found. In both cases we call this node
the "pe_dn".
We then count all non-bridge devices below the pe_dn, to establish
how many devices in total may need MSIs. The quota is then simply the
total available divided by the number of devices, if the request is
less than or equal to the quota, the request is fine and we're done.
If the request is greater than the quota, we try to determine if there
are any "spare" MSIs which we can give to this device. Spare MSIs are
found by looking for other devices which can never use their full
quota, because their "req#msi(-x)" property is less than the quota.
If we find any spare, we divide the spares by the number of devices
that could request more than their quota. This ensures the spare
MSIs are spread evenly amongst all over-quota requestors.
Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index 081af6d..3e0d6ef 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -174,12 +174,186 @@
return check_req(pdev, nvec, "ibm,req#msi-x");
}
+/* Quota calculation */
+
+static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total)
+{
+ struct device_node *dn;
+ const u32 *p;
+
+ dn = of_node_get(pci_device_to_OF_node(dev));
+ while (dn) {
+ p = of_get_property(dn, "ibm,pe-total-#msi", NULL);
+ if (p) {
+ pr_debug("rtas_msi: found prop on dn %s\n",
+ dn->full_name);
+ *total = *p;
+ return dn;
+ }
+
+ dn = of_get_next_parent(dn);
+ }
+
+ return NULL;
+}
+
+static struct device_node *find_pe_dn(struct pci_dev *dev, int *total)
+{
+ struct device_node *dn;
+
+ /* Found our PE and assume 8 at that point. */
+
+ dn = pci_device_to_OF_node(dev);
+ if (!dn)
+ return NULL;
+
+ dn = find_device_pe(dn);
+ if (!dn)
+ return NULL;
+
+ /* We actually want the parent */
+ dn = of_get_parent(dn);
+ if (!dn)
+ return NULL;
+
+ /* Hardcode of 8 for old firmwares */
+ *total = 8;
+ pr_debug("rtas_msi: using PE dn %s\n", dn->full_name);
+
+ return dn;
+}
+
+struct msi_counts {
+ struct device_node *requestor;
+ int num_devices;
+ int request;
+ int quota;
+ int spare;
+ int over_quota;
+};
+
+static void *count_non_bridge_devices(struct device_node *dn, void *data)
+{
+ struct msi_counts *counts = data;
+ const u32 *p;
+ u32 class;
+
+ pr_debug("rtas_msi: counting %s\n", dn->full_name);
+
+ p = of_get_property(dn, "class-code", NULL);
+ class = p ? *p : 0;
+
+ if ((class >> 8) != PCI_CLASS_BRIDGE_PCI)
+ counts->num_devices++;
+
+ return NULL;
+}
+
+static void *count_spare_msis(struct device_node *dn, void *data)
+{
+ struct msi_counts *counts = data;
+ const u32 *p;
+ int req;
+
+ if (dn == counts->requestor)
+ req = counts->request;
+ else {
+ /* We don't know if a driver will try to use MSI or MSI-X,
+ * so we just have to punt and use the larger of the two. */
+ req = 0;
+ p = of_get_property(dn, "ibm,req#msi", NULL);
+ if (p)
+ req = *p;
+
+ p = of_get_property(dn, "ibm,req#msi-x", NULL);
+ if (p)
+ req = max(req, (int)*p);
+ }
+
+ if (req < counts->quota)
+ counts->spare += counts->quota - req;
+ else if (req > counts->quota)
+ counts->over_quota++;
+
+ return NULL;
+}
+
+static int msi_quota_for_device(struct pci_dev *dev, int request)
+{
+ struct device_node *pe_dn;
+ struct msi_counts counts;
+ int total;
+
+ pr_debug("rtas_msi: calc quota for %s, request %d\n", pci_name(dev),
+ request);
+
+ pe_dn = find_pe_total_msi(dev, &total);
+ if (!pe_dn)
+ pe_dn = find_pe_dn(dev, &total);
+
+ if (!pe_dn) {
+ pr_err("rtas_msi: couldn't find PE for %s\n", pci_name(dev));
+ goto out;
+ }
+
+ pr_debug("rtas_msi: found PE %s\n", pe_dn->full_name);
+
+ memset(&counts, 0, sizeof(struct msi_counts));
+
+ /* Work out how many devices we have below this PE */
+ traverse_pci_devices(pe_dn, count_non_bridge_devices, &counts);
+
+ if (counts.num_devices == 0) {
+ pr_err("rtas_msi: found 0 devices under PE for %s\n",
+ pci_name(dev));
+ goto out;
+ }
+
+ counts.quota = total / counts.num_devices;
+ if (request <= counts.quota)
+ goto out;
+
+ /* else, we have some more calculating to do */
+ counts.requestor = pci_device_to_OF_node(dev);
+ counts.request = request;
+ traverse_pci_devices(pe_dn, count_spare_msis, &counts);
+
+ /* If the quota isn't an integer multiple of the total, we can
+ * use the remainder as spare MSIs for anyone that wants them. */
+ counts.spare += total % counts.num_devices;
+
+ /* Divide any spare by the number of over-quota requestors */
+ if (counts.over_quota)
+ counts.quota += counts.spare / counts.over_quota;
+
+ /* And finally clamp the request to the possibly adjusted quota */
+ request = min(counts.quota, request);
+
+ pr_debug("rtas_msi: request clamped to quota %d\n", request);
+out:
+ of_node_put(pe_dn);
+
+ return request;
+}
+
static int rtas_msi_check_device(struct pci_dev *pdev, int nvec, int type)
{
- if (type == PCI_CAP_ID_MSIX)
- return check_req_msix(pdev, nvec);
+ int quota, rc;
- return check_req_msi(pdev, nvec);
+ if (type == PCI_CAP_ID_MSIX)
+ rc = check_req_msix(pdev, nvec);
+ else
+ rc = check_req_msi(pdev, nvec);
+
+ if (rc)
+ return rc;
+
+ quota = msi_quota_for_device(pdev, nvec);
+
+ if (quota && quota < nvec)
+ return quota;
+
+ return 0;
}
static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)