IB: Add a QP creation flag to use GFP_NOIO allocations
This addresses a problem where NFS client writes over IPoIB connected
mode may deadlock on memory allocation/writeback.
The problem is not directly memory reclamation. There is an indirect
dependency between network filesystems writing back pages and
ipoib_cm_tx_init() due to how a kworker is used. Page reclaim cannot
make forward progress until ipoib_cm_tx_init() succeeds and it is
stuck in page reclaim itself waiting for network transmission.
Ordinarily this situation may be avoided by having the caller use
GFP_NOFS but ipoib_cm_tx_init() does not have that information.
To address this, take a general approach and add a new QP creation
flag that tells the low-level hardware driver to use GFP_NOIO for the
memory allocations related to the new QP.
Use the new flag in the ipoib connected mode path, and if the driver
doesn't support it, re-issue the QP creation without the flag.
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 1377f85..933efce 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -1030,10 +1030,20 @@
.cap.max_send_sge = 1,
.sq_sig_type = IB_SIGNAL_ALL_WR,
.qp_type = IB_QPT_RC,
- .qp_context = tx
+ .qp_context = tx,
+ .create_flags = IB_QP_CREATE_USE_GFP_NOIO
};
- return ib_create_qp(priv->pd, &attr);
+ struct ib_qp *tx_qp;
+
+ tx_qp = ib_create_qp(priv->pd, &attr);
+ if (PTR_ERR(tx_qp) == -EINVAL) {
+ ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n",
+ priv->ca->name);
+ attr.create_flags &= ~IB_QP_CREATE_USE_GFP_NOIO;
+ tx_qp = ib_create_qp(priv->pd, &attr);
+ }
+ return tx_qp;
}
static int ipoib_cm_send_req(struct net_device *dev,
@@ -1104,12 +1114,14 @@
struct ipoib_dev_priv *priv = netdev_priv(p->dev);
int ret;
- p->tx_ring = vzalloc(ipoib_sendq_size * sizeof *p->tx_ring);
+ p->tx_ring = __vmalloc(ipoib_sendq_size * sizeof *p->tx_ring,
+ GFP_NOIO, PAGE_KERNEL);
if (!p->tx_ring) {
ipoib_warn(priv, "failed to allocate tx ring\n");
ret = -ENOMEM;
goto err_tx;
}
+ memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring);
p->qp = ipoib_cm_create_tx_qp(p->dev, p);
if (IS_ERR(p->qp)) {
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index acd8251..d75b02f 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -783,6 +783,7 @@
IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1,
IB_QP_CREATE_NETIF_QP = 1 << 5,
IB_QP_CREATE_SIGNATURE_EN = 1 << 6,
+ IB_QP_CREATE_USE_GFP_NOIO = 1 << 7,
/* reserve bits 26-31 for low level drivers' internal use */
IB_QP_CREATE_RESERVED_START = 1 << 26,
IB_QP_CREATE_RESERVED_END = 1 << 31,