Merge branch 'for-linus' of master.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband
* 'for-linus' of master.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband:
IB/cm: Improve local id allocation
IPoIB/cm: Fix SRQ WR leak
IB/ipoib: Fix typos in error messages
IB/mlx4: Check if SRQ is full when posting receive
IB/mlx4: Pass send queue sizes from userspace to kernel
IB/mlx4: Fix check of opcode in mlx4_ib_post_send()
mlx4_core: Fix array overrun in dump_dev_cap_flags()
IB/mlx4: Fix RESET to RESET and RESET to ERROR transitions
IB/mthca: Fix RESET to ERROR transition
IB/mlx4: Set GRH:HopLimit when sending globally routed MADs
IB/mthca: Set GRH:HopLimit when building MLX headers
IB/mlx4: Fix check of max_qp_dest_rdma in modify QP
IB/mthca: Fix use-after-free on device restart
IB/ehca: Return proper error code if register_mr fails
IPoIB: Handle P_Key table reordering
IB/core: Use start_port() and end_port()
IB/core: Add helpers for uncached GID and P_Key searches
IB/ipath: Fix potential deadlock with multicast spinlocks
IB/core: Free umem when mm is already gone
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index eff591d..e840434 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -306,7 +306,9 @@
do {
spin_lock_irqsave(&cm.lock, flags);
ret = idr_get_new_above(&cm.local_id_table, cm_id_priv,
- next_id++, &id);
+ next_id, &id);
+ if (!ret)
+ next_id = ((unsigned) id + 1) & MAX_ID_MASK;
spin_unlock_irqrestore(&cm.lock, flags);
} while( (ret == -EAGAIN) && idr_pre_get(&cm.local_id_table, GFP_KERNEL) );
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index bcecf4d..3ada17c 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -150,6 +150,18 @@
return 0;
}
+static int start_port(struct ib_device *device)
+{
+ return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1;
+}
+
+
+static int end_port(struct ib_device *device)
+{
+ return (device->node_type == RDMA_NODE_IB_SWITCH) ?
+ 0 : device->phys_port_cnt;
+}
+
/**
* ib_alloc_device - allocate an IB device struct
* @size:size of structure to allocate
@@ -209,6 +221,45 @@
return 0;
}
+static int read_port_table_lengths(struct ib_device *device)
+{
+ struct ib_port_attr *tprops = NULL;
+ int num_ports, ret = -ENOMEM;
+ u8 port_index;
+
+ tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
+ if (!tprops)
+ goto out;
+
+ num_ports = end_port(device) - start_port(device) + 1;
+
+ device->pkey_tbl_len = kmalloc(sizeof *device->pkey_tbl_len * num_ports,
+ GFP_KERNEL);
+ device->gid_tbl_len = kmalloc(sizeof *device->gid_tbl_len * num_ports,
+ GFP_KERNEL);
+ if (!device->pkey_tbl_len || !device->gid_tbl_len)
+ goto err;
+
+ for (port_index = 0; port_index < num_ports; ++port_index) {
+ ret = ib_query_port(device, port_index + start_port(device),
+ tprops);
+ if (ret)
+ goto err;
+ device->pkey_tbl_len[port_index] = tprops->pkey_tbl_len;
+ device->gid_tbl_len[port_index] = tprops->gid_tbl_len;
+ }
+
+ ret = 0;
+ goto out;
+
+err:
+ kfree(device->gid_tbl_len);
+ kfree(device->pkey_tbl_len);
+out:
+ kfree(tprops);
+ return ret;
+}
+
/**
* ib_register_device - Register an IB device with IB core
* @device:Device to register
@@ -240,10 +291,19 @@
spin_lock_init(&device->event_handler_lock);
spin_lock_init(&device->client_data_lock);
+ ret = read_port_table_lengths(device);
+ if (ret) {
+ printk(KERN_WARNING "Couldn't create table lengths cache for device %s\n",
+ device->name);
+ goto out;
+ }
+
ret = ib_device_register_sysfs(device);
if (ret) {
printk(KERN_WARNING "Couldn't register device %s with driver model\n",
device->name);
+ kfree(device->gid_tbl_len);
+ kfree(device->pkey_tbl_len);
goto out;
}
@@ -285,6 +345,9 @@
list_del(&device->core_list);
+ kfree(device->gid_tbl_len);
+ kfree(device->pkey_tbl_len);
+
mutex_unlock(&device_mutex);
spin_lock_irqsave(&device->client_data_lock, flags);
@@ -507,10 +570,7 @@
u8 port_num,
struct ib_port_attr *port_attr)
{
- if (device->node_type == RDMA_NODE_IB_SWITCH) {
- if (port_num)
- return -EINVAL;
- } else if (port_num < 1 || port_num > device->phys_port_cnt)
+ if (port_num < start_port(device) || port_num > end_port(device))
return -EINVAL;
return device->query_port(device, port_num, port_attr);
@@ -582,10 +642,7 @@
u8 port_num, int port_modify_mask,
struct ib_port_modify *port_modify)
{
- if (device->node_type == RDMA_NODE_IB_SWITCH) {
- if (port_num)
- return -EINVAL;
- } else if (port_num < 1 || port_num > device->phys_port_cnt)
+ if (port_num < start_port(device) || port_num > end_port(device))
return -EINVAL;
return device->modify_port(device, port_num, port_modify_mask,
@@ -593,6 +650,68 @@
}
EXPORT_SYMBOL(ib_modify_port);
+/**
+ * ib_find_gid - Returns the port number and GID table index where
+ * a specified GID value occurs.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port_num: The port number of the device where the GID value was found.
+ * @index: The index into the GID table where the GID was found. This
+ * parameter may be NULL.
+ */
+int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+ u8 *port_num, u16 *index)
+{
+ union ib_gid tmp_gid;
+ int ret, port, i;
+
+ for (port = start_port(device); port <= end_port(device); ++port) {
+ for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) {
+ ret = ib_query_gid(device, port, i, &tmp_gid);
+ if (ret)
+ return ret;
+ if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
+ *port_num = port;
+ if (index)
+ *index = i;
+ return 0;
+ }
+ }
+ }
+
+ return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_gid);
+
+/**
+ * ib_find_pkey - Returns the PKey table index where a specified
+ * PKey value occurs.
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the PKey table where the PKey was found.
+ */
+int ib_find_pkey(struct ib_device *device,
+ u8 port_num, u16 pkey, u16 *index)
+{
+ int ret, i;
+ u16 tmp_pkey;
+
+ for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) {
+ ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
+ if (ret)
+ return ret;
+
+ if (pkey == tmp_pkey) {
+ *index = i;
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_pkey);
+
static int __init ib_core_init(void)
{
int ret;
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 96a16c0..b4aec51 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -210,8 +210,10 @@
__ib_umem_release(umem->context->device, umem, 1);
mm = get_task_mm(current);
- if (!mm)
+ if (!mm) {
+ kfree(umem);
return;
+ }
diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c
index 84c5bb4..add79bd 100644
--- a/drivers/infiniband/hw/ehca/ehca_mrmw.c
+++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c
@@ -2050,13 +2050,10 @@
switch (hipz_rc) {
case H_SUCCESS: /* successful completion */
return 0;
- case H_ADAPTER_PARM: /* invalid adapter handle */
- case H_RT_PARM: /* invalid resource type */
case H_NOT_ENOUGH_RESOURCES: /* insufficient resources */
- case H_MLENGTH_PARM: /* invalid memory length */
- case H_MEM_ACCESS_PARM: /* invalid access controls */
case H_CONSTRAINED: /* resource constraint */
- return -EINVAL;
+ case H_NO_MEM:
+ return -ENOMEM;
case H_BUSY: /* long busy */
return -EBUSY;
default:
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
index 085e28b..dd691cf 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
+++ b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
@@ -165,10 +165,9 @@
{
struct rb_node **n = &mcast_tree.rb_node;
struct rb_node *pn = NULL;
- unsigned long flags;
int ret;
- spin_lock_irqsave(&mcast_lock, flags);
+ spin_lock_irq(&mcast_lock);
while (*n) {
struct ipath_mcast *tmcast;
@@ -228,7 +227,7 @@
ret = 0;
bail:
- spin_unlock_irqrestore(&mcast_lock, flags);
+ spin_unlock_irq(&mcast_lock);
return ret;
}
@@ -289,17 +288,16 @@
struct ipath_mcast *mcast = NULL;
struct ipath_mcast_qp *p, *tmp;
struct rb_node *n;
- unsigned long flags;
int last = 0;
int ret;
- spin_lock_irqsave(&mcast_lock, flags);
+ spin_lock_irq(&mcast_lock);
/* Find the GID in the mcast table. */
n = mcast_tree.rb_node;
while (1) {
if (n == NULL) {
- spin_unlock_irqrestore(&mcast_lock, flags);
+ spin_unlock_irq(&mcast_lock);
ret = -EINVAL;
goto bail;
}
@@ -334,7 +332,7 @@
break;
}
- spin_unlock_irqrestore(&mcast_lock, flags);
+ spin_unlock_irq(&mcast_lock);
if (p) {
/*
@@ -348,9 +346,9 @@
atomic_dec(&mcast->refcount);
wait_event(mcast->wait, !atomic_read(&mcast->refcount));
ipath_mcast_free(mcast);
- spin_lock(&dev->n_mcast_grps_lock);
+ spin_lock_irq(&dev->n_mcast_grps_lock);
dev->n_mcast_grps_allocated--;
- spin_unlock(&dev->n_mcast_grps_lock);
+ spin_unlock_irq(&dev->n_mcast_grps_lock);
}
ret = 0;
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 5cd7069..a824bc5 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -188,14 +188,32 @@
}
}
-static int set_qp_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
- enum ib_qp_type type, struct mlx4_ib_qp *qp)
+static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+ struct mlx4_ib_qp *qp)
{
- /* Sanity check QP size before proceeding */
+ /* Sanity check RQ size before proceeding */
+ if (cap->max_recv_wr > dev->dev->caps.max_wqes ||
+ cap->max_recv_sge > dev->dev->caps.max_rq_sg)
+ return -EINVAL;
+
+ qp->rq.max = cap->max_recv_wr ? roundup_pow_of_two(cap->max_recv_wr) : 0;
+
+ qp->rq.wqe_shift = ilog2(roundup_pow_of_two(cap->max_recv_sge *
+ sizeof (struct mlx4_wqe_data_seg)));
+ qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof (struct mlx4_wqe_data_seg);
+
+ cap->max_recv_wr = qp->rq.max;
+ cap->max_recv_sge = qp->rq.max_gs;
+
+ return 0;
+}
+
+static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+ enum ib_qp_type type, struct mlx4_ib_qp *qp)
+{
+ /* Sanity check SQ size before proceeding */
if (cap->max_send_wr > dev->dev->caps.max_wqes ||
- cap->max_recv_wr > dev->dev->caps.max_wqes ||
cap->max_send_sge > dev->dev->caps.max_sq_sg ||
- cap->max_recv_sge > dev->dev->caps.max_rq_sg ||
cap->max_inline_data + send_wqe_overhead(type) +
sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz)
return -EINVAL;
@@ -208,12 +226,7 @@
cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
return -EINVAL;
- qp->rq.max = cap->max_recv_wr ? roundup_pow_of_two(cap->max_recv_wr) : 0;
- qp->sq.max = cap->max_send_wr ? roundup_pow_of_two(cap->max_send_wr) : 0;
-
- qp->rq.wqe_shift = ilog2(roundup_pow_of_two(cap->max_recv_sge *
- sizeof (struct mlx4_wqe_data_seg)));
- qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof (struct mlx4_wqe_data_seg);
+ qp->sq.max = cap->max_send_wr ? roundup_pow_of_two(cap->max_send_wr) : 1;
qp->sq.wqe_shift = ilog2(roundup_pow_of_two(max(cap->max_send_sge *
sizeof (struct mlx4_wqe_data_seg),
@@ -233,16 +246,26 @@
qp->sq.offset = 0;
}
- cap->max_send_wr = qp->sq.max;
- cap->max_recv_wr = qp->rq.max;
- cap->max_send_sge = qp->sq.max_gs;
- cap->max_recv_sge = qp->rq.max_gs;
+ cap->max_send_wr = qp->sq.max;
+ cap->max_send_sge = qp->sq.max_gs;
cap->max_inline_data = (1 << qp->sq.wqe_shift) - send_wqe_overhead(type) -
sizeof (struct mlx4_wqe_inline_seg);
return 0;
}
+static int set_user_sq_size(struct mlx4_ib_qp *qp,
+ struct mlx4_ib_create_qp *ucmd)
+{
+ qp->sq.max = 1 << ucmd->log_sq_bb_count;
+ qp->sq.wqe_shift = ucmd->log_sq_stride;
+
+ qp->buf_size = (qp->rq.max << qp->rq.wqe_shift) +
+ (qp->sq.max << qp->sq.wqe_shift);
+
+ return 0;
+}
+
static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
struct ib_qp_init_attr *init_attr,
struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp)
@@ -264,7 +287,7 @@
qp->sq.head = 0;
qp->sq.tail = 0;
- err = set_qp_size(dev, &init_attr->cap, init_attr->qp_type, qp);
+ err = set_rq_size(dev, &init_attr->cap, qp);
if (err)
goto err;
@@ -276,6 +299,10 @@
goto err;
}
+ err = set_user_sq_size(qp, &ucmd);
+ if (err)
+ goto err;
+
qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
qp->buf_size, 0);
if (IS_ERR(qp->umem)) {
@@ -297,6 +324,10 @@
if (err)
goto err_mtt;
} else {
+ err = set_kernel_sq_size(dev, &init_attr->cap, init_attr->qp_type, qp);
+ if (err)
+ goto err;
+
err = mlx4_ib_db_alloc(dev, &qp->db, 0);
if (err)
goto err;
@@ -573,7 +604,7 @@
}
}
-static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, struct ib_qp_attr *attr,
+static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr,
int attr_mask)
{
u8 dest_rd_atomic;
@@ -603,7 +634,7 @@
return cpu_to_be32(hw_access_flags);
}
-static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, struct ib_qp_attr *attr,
+static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, const struct ib_qp_attr *attr,
int attr_mask)
{
if (attr_mask & IB_QP_PKEY_INDEX)
@@ -619,7 +650,7 @@
path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6);
}
-static int mlx4_set_path(struct mlx4_ib_dev *dev, struct ib_ah_attr *ah,
+static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
struct mlx4_qp_path *path, u8 port)
{
path->grh_mylmc = ah->src_path_bits & 0x7f;
@@ -655,14 +686,14 @@
return 0;
}
-int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
- int attr_mask, struct ib_udata *udata)
+static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
+ const struct ib_qp_attr *attr, int attr_mask,
+ enum ib_qp_state cur_state, enum ib_qp_state new_state)
{
struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
struct mlx4_ib_qp *qp = to_mqp(ibqp);
struct mlx4_qp_context *context;
enum mlx4_qp_optpar optpar = 0;
- enum ib_qp_state cur_state, new_state;
int sqd_event;
int err = -EINVAL;
@@ -670,34 +701,6 @@
if (!context)
return -ENOMEM;
- mutex_lock(&qp->mutex);
-
- cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
- new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
-
- if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask))
- goto out;
-
- if ((attr_mask & IB_QP_PKEY_INDEX) &&
- attr->pkey_index >= dev->dev->caps.pkey_table_len) {
- goto out;
- }
-
- if ((attr_mask & IB_QP_PORT) &&
- (attr->port_num == 0 || attr->port_num > dev->dev->caps.num_ports)) {
- goto out;
- }
-
- if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
- attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) {
- goto out;
- }
-
- if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
- attr->max_dest_rd_atomic > 1 << dev->dev->caps.max_qp_dest_rdma) {
- goto out;
- }
-
context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) |
(to_mlx4_st(ibqp->qp_type) << 16));
context->flags |= cpu_to_be32(1 << 8); /* DE? */
@@ -920,11 +923,84 @@
}
out:
- mutex_unlock(&qp->mutex);
kfree(context);
return err;
}
+static const struct ib_qp_attr mlx4_ib_qp_attr = { .port_num = 1 };
+static const int mlx4_ib_qp_attr_mask_table[IB_QPT_UD + 1] = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+};
+
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+ struct mlx4_ib_qp *qp = to_mqp(ibqp);
+ enum ib_qp_state cur_state, new_state;
+ int err = -EINVAL;
+
+ mutex_lock(&qp->mutex);
+
+ cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
+ new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+ if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask))
+ goto out;
+
+ if ((attr_mask & IB_QP_PKEY_INDEX) &&
+ attr->pkey_index >= dev->dev->caps.pkey_table_len) {
+ goto out;
+ }
+
+ if ((attr_mask & IB_QP_PORT) &&
+ (attr->port_num == 0 || attr->port_num > dev->dev->caps.num_ports)) {
+ goto out;
+ }
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+ attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) {
+ goto out;
+ }
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+ attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) {
+ goto out;
+ }
+
+ if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+ err = 0;
+ goto out;
+ }
+
+ if (cur_state == IB_QPS_RESET && new_state == IB_QPS_ERR) {
+ err = __mlx4_ib_modify_qp(ibqp, &mlx4_ib_qp_attr,
+ mlx4_ib_qp_attr_mask_table[ibqp->qp_type],
+ IB_QPS_RESET, IB_QPS_INIT);
+ if (err)
+ goto out;
+ cur_state = IB_QPS_INIT;
+ }
+
+ err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
+
+out:
+ mutex_unlock(&qp->mutex);
+ return err;
+}
+
static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
void *wqe)
{
@@ -952,6 +1028,7 @@
(be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 20) & 0xff;
sqp->ud_header.grh.flow_label =
ah->av.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+ sqp->ud_header.grh.hop_limit = ah->av.hop_limit;
ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.port_pd) >> 24,
ah->av.gid_index, &sqp->ud_header.grh.source_gid);
memcpy(sqp->ud_header.grh.destination_gid.raw,
@@ -1192,7 +1269,7 @@
*/
wmb();
- if (wr->opcode < 0 || wr->opcode > ARRAY_SIZE(mlx4_ib_opcode)) {
+ if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) {
err = -EINVAL;
goto out;
}
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
index 42ab4a8..12fac1c 100644
--- a/drivers/infiniband/hw/mlx4/srq.c
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -297,6 +297,12 @@
break;
}
+ if (unlikely(srq->head == srq->tail)) {
+ err = -ENOMEM;
+ *bad_wr = wr;
+ break;
+ }
+
srq->wrid[srq->head] = wr->wr_id;
next = get_wqe(srq, srq->head);
diff --git a/drivers/infiniband/hw/mlx4/user.h b/drivers/infiniband/hw/mlx4/user.h
index 5b8eddc..88c72d5 100644
--- a/drivers/infiniband/hw/mlx4/user.h
+++ b/drivers/infiniband/hw/mlx4/user.h
@@ -39,7 +39,7 @@
* Increment this value if any changes that break userspace ABI
* compatibility are made.
*/
-#define MLX4_IB_UVERBS_ABI_VERSION 1
+#define MLX4_IB_UVERBS_ABI_VERSION 2
/*
* Make sure that all structs defined in this file remain laid out so
@@ -87,6 +87,9 @@
struct mlx4_ib_create_qp {
__u64 buf_addr;
__u64 db_addr;
+ __u8 log_sq_bb_count;
+ __u8 log_sq_stride;
+ __u8 reserved[6];
};
#endif /* MLX4_IB_USER_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_av.c b/drivers/infiniband/hw/mthca/mthca_av.c
index 27caf3b..4b111a8 100644
--- a/drivers/infiniband/hw/mthca/mthca_av.c
+++ b/drivers/infiniband/hw/mthca/mthca_av.c
@@ -279,6 +279,7 @@
(be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 20) & 0xff;
header->grh.flow_label =
ah->av->sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+ header->grh.hop_limit = ah->av->hop_limit;
ib_get_cached_gid(&dev->ib_dev,
be32_to_cpu(ah->av->port_pd) >> 24,
ah->av->gid_index % dev->limits.gid_table_len,
diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c
index 773145e..aa563e6 100644
--- a/drivers/infiniband/hw/mthca/mthca_main.c
+++ b/drivers/infiniband/hw/mthca/mthca_main.c
@@ -1250,12 +1250,14 @@
int __mthca_restart_one(struct pci_dev *pdev)
{
struct mthca_dev *mdev;
+ int hca_type;
mdev = pci_get_drvdata(pdev);
if (!mdev)
return -ENODEV;
+ hca_type = mdev->hca_type;
__mthca_remove_one(pdev);
- return __mthca_init_one(pdev, mdev->hca_type);
+ return __mthca_init_one(pdev, hca_type);
}
static int __devinit mthca_init_one(struct pci_dev *pdev,
diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
index 2741ded..0276649 100644
--- a/drivers/infiniband/hw/mthca/mthca_qp.c
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -296,7 +296,7 @@
}
}
-static void store_attrs(struct mthca_sqp *sqp, struct ib_qp_attr *attr,
+static void store_attrs(struct mthca_sqp *sqp, const struct ib_qp_attr *attr,
int attr_mask)
{
if (attr_mask & IB_QP_PKEY_INDEX)
@@ -328,7 +328,7 @@
mthca_warn(dev, "INIT_IB returned status %02x.\n", status);
}
-static __be32 get_hw_access_flags(struct mthca_qp *qp, struct ib_qp_attr *attr,
+static __be32 get_hw_access_flags(struct mthca_qp *qp, const struct ib_qp_attr *attr,
int attr_mask)
{
u8 dest_rd_atomic;
@@ -511,7 +511,7 @@
return err;
}
-static int mthca_path_set(struct mthca_dev *dev, struct ib_ah_attr *ah,
+static int mthca_path_set(struct mthca_dev *dev, const struct ib_ah_attr *ah,
struct mthca_qp_path *path, u8 port)
{
path->g_mylmc = ah->src_path_bits & 0x7f;
@@ -539,12 +539,12 @@
return 0;
}
-int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
- struct ib_udata *udata)
+static int __mthca_modify_qp(struct ib_qp *ibqp,
+ const struct ib_qp_attr *attr, int attr_mask,
+ enum ib_qp_state cur_state, enum ib_qp_state new_state)
{
struct mthca_dev *dev = to_mdev(ibqp->device);
struct mthca_qp *qp = to_mqp(ibqp);
- enum ib_qp_state cur_state, new_state;
struct mthca_mailbox *mailbox;
struct mthca_qp_param *qp_param;
struct mthca_qp_context *qp_context;
@@ -552,60 +552,6 @@
u8 status;
int err = -EINVAL;
- mutex_lock(&qp->mutex);
-
- if (attr_mask & IB_QP_CUR_STATE) {
- cur_state = attr->cur_qp_state;
- } else {
- spin_lock_irq(&qp->sq.lock);
- spin_lock(&qp->rq.lock);
- cur_state = qp->state;
- spin_unlock(&qp->rq.lock);
- spin_unlock_irq(&qp->sq.lock);
- }
-
- new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
-
- if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) {
- mthca_dbg(dev, "Bad QP transition (transport %d) "
- "%d->%d with attr 0x%08x\n",
- qp->transport, cur_state, new_state,
- attr_mask);
- goto out;
- }
-
- if (cur_state == new_state && cur_state == IB_QPS_RESET) {
- err = 0;
- goto out;
- }
-
- if ((attr_mask & IB_QP_PKEY_INDEX) &&
- attr->pkey_index >= dev->limits.pkey_table_len) {
- mthca_dbg(dev, "P_Key index (%u) too large. max is %d\n",
- attr->pkey_index, dev->limits.pkey_table_len-1);
- goto out;
- }
-
- if ((attr_mask & IB_QP_PORT) &&
- (attr->port_num == 0 || attr->port_num > dev->limits.num_ports)) {
- mthca_dbg(dev, "Port number (%u) is invalid\n", attr->port_num);
- goto out;
- }
-
- if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
- attr->max_rd_atomic > dev->limits.max_qp_init_rdma) {
- mthca_dbg(dev, "Max rdma_atomic as initiator %u too large (max is %d)\n",
- attr->max_rd_atomic, dev->limits.max_qp_init_rdma);
- goto out;
- }
-
- if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
- attr->max_dest_rd_atomic > 1 << dev->qp_table.rdb_shift) {
- mthca_dbg(dev, "Max rdma_atomic as responder %u too large (max %d)\n",
- attr->max_dest_rd_atomic, 1 << dev->qp_table.rdb_shift);
- goto out;
- }
-
mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
if (IS_ERR(mailbox)) {
err = PTR_ERR(mailbox);
@@ -892,6 +838,98 @@
out_mailbox:
mthca_free_mailbox(dev, mailbox);
+out:
+ return err;
+}
+
+static const struct ib_qp_attr dummy_init_attr = { .port_num = 1 };
+static const int dummy_init_attr_mask[] = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+};
+
+int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
+ struct ib_udata *udata)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_qp *qp = to_mqp(ibqp);
+ enum ib_qp_state cur_state, new_state;
+ int err = -EINVAL;
+
+ mutex_lock(&qp->mutex);
+ if (attr_mask & IB_QP_CUR_STATE) {
+ cur_state = attr->cur_qp_state;
+ } else {
+ spin_lock_irq(&qp->sq.lock);
+ spin_lock(&qp->rq.lock);
+ cur_state = qp->state;
+ spin_unlock(&qp->rq.lock);
+ spin_unlock_irq(&qp->sq.lock);
+ }
+
+ new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+ if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) {
+ mthca_dbg(dev, "Bad QP transition (transport %d) "
+ "%d->%d with attr 0x%08x\n",
+ qp->transport, cur_state, new_state,
+ attr_mask);
+ goto out;
+ }
+
+ if ((attr_mask & IB_QP_PKEY_INDEX) &&
+ attr->pkey_index >= dev->limits.pkey_table_len) {
+ mthca_dbg(dev, "P_Key index (%u) too large. max is %d\n",
+ attr->pkey_index, dev->limits.pkey_table_len-1);
+ goto out;
+ }
+
+ if ((attr_mask & IB_QP_PORT) &&
+ (attr->port_num == 0 || attr->port_num > dev->limits.num_ports)) {
+ mthca_dbg(dev, "Port number (%u) is invalid\n", attr->port_num);
+ goto out;
+ }
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+ attr->max_rd_atomic > dev->limits.max_qp_init_rdma) {
+ mthca_dbg(dev, "Max rdma_atomic as initiator %u too large (max is %d)\n",
+ attr->max_rd_atomic, dev->limits.max_qp_init_rdma);
+ goto out;
+ }
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+ attr->max_dest_rd_atomic > 1 << dev->qp_table.rdb_shift) {
+ mthca_dbg(dev, "Max rdma_atomic as responder %u too large (max %d)\n",
+ attr->max_dest_rd_atomic, 1 << dev->qp_table.rdb_shift);
+ goto out;
+ }
+
+ if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+ err = 0;
+ goto out;
+ }
+
+ if (cur_state == IB_QPS_RESET && new_state == IB_QPS_ERR) {
+ err = __mthca_modify_qp(ibqp, &dummy_init_attr,
+ dummy_init_attr_mask[ibqp->qp_type],
+ IB_QPS_RESET, IB_QPS_INIT);
+ if (err)
+ goto out;
+ cur_state = IB_QPS_INIT;
+ }
+
+ err = __mthca_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
out:
mutex_unlock(&qp->mutex);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 87310ee..a0b3782 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -132,12 +132,46 @@
__be32 mtu;
};
+/*
+ * Quoting 10.3.1 Queue Pair and EE Context States:
+ *
+ * Note, for QPs that are associated with an SRQ, the Consumer should take the
+ * QP through the Error State before invoking a Destroy QP or a Modify QP to the
+ * Reset State. The Consumer may invoke the Destroy QP without first performing
+ * a Modify QP to the Error State and waiting for the Affiliated Asynchronous
+ * Last WQE Reached Event. However, if the Consumer does not wait for the
+ * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment
+ * leakage may occur. Therefore, it is good programming practice to tear down a
+ * QP that is associated with an SRQ by using the following process:
+ *
+ * - Put the QP in the Error State
+ * - Wait for the Affiliated Asynchronous Last WQE Reached Event;
+ * - either:
+ * drain the CQ by invoking the Poll CQ verb and either wait for CQ
+ * to be empty or the number of Poll CQ operations has exceeded
+ * CQ capacity size;
+ * - or
+ * post another WR that completes on the same CQ and wait for this
+ * WR to return as a WC;
+ * - and then invoke a Destroy QP or Reset QP.
+ *
+ * We use the second option and wait for a completion on the
+ * rx_drain_qp before destroying QPs attached to our SRQ.
+ */
+
+enum ipoib_cm_state {
+ IPOIB_CM_RX_LIVE,
+ IPOIB_CM_RX_ERROR, /* Ignored by stale task */
+ IPOIB_CM_RX_FLUSH /* Last WQE Reached event observed */
+};
+
struct ipoib_cm_rx {
struct ib_cm_id *id;
struct ib_qp *qp;
struct list_head list;
struct net_device *dev;
unsigned long jiffies;
+ enum ipoib_cm_state state;
};
struct ipoib_cm_tx {
@@ -165,10 +199,16 @@
struct ib_srq *srq;
struct ipoib_cm_rx_buf *srq_ring;
struct ib_cm_id *id;
- struct list_head passive_ids;
+ struct ib_qp *rx_drain_qp; /* generates WR described in 10.3.1 */
+ struct list_head passive_ids; /* state: LIVE */
+ struct list_head rx_error_list; /* state: ERROR */
+ struct list_head rx_flush_list; /* state: FLUSH, drain not started */
+ struct list_head rx_drain_list; /* state: FLUSH, drain started */
+ struct list_head rx_reap_list; /* state: FLUSH, drain done */
struct work_struct start_task;
struct work_struct reap_task;
struct work_struct skb_task;
+ struct work_struct rx_reap_task;
struct delayed_work stale_task;
struct sk_buff_head skb_queue;
struct list_head start_list;
@@ -201,15 +241,17 @@
struct list_head multicast_list;
struct rb_root multicast_tree;
- struct delayed_work pkey_task;
+ struct delayed_work pkey_poll_task;
struct delayed_work mcast_task;
struct work_struct flush_task;
struct work_struct restart_task;
struct delayed_work ah_reap_task;
+ struct work_struct pkey_event_task;
struct ib_device *ca;
u8 port;
u16 pkey;
+ u16 pkey_index;
struct ib_pd *pd;
struct ib_mr *mr;
struct ib_cq *cq;
@@ -333,12 +375,13 @@
int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
void ipoib_ib_dev_flush(struct work_struct *work);
+void ipoib_pkey_event(struct work_struct *work);
void ipoib_ib_dev_cleanup(struct net_device *dev);
int ipoib_ib_dev_open(struct net_device *dev);
int ipoib_ib_dev_up(struct net_device *dev);
int ipoib_ib_dev_down(struct net_device *dev, int flush);
-int ipoib_ib_dev_stop(struct net_device *dev);
+int ipoib_ib_dev_stop(struct net_device *dev, int flush);
int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
void ipoib_dev_cleanup(struct net_device *dev);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index eec833b..ffec794 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -37,6 +37,7 @@
#include <net/dst.h>
#include <net/icmp.h>
#include <linux/icmpv6.h>
+#include <linux/delay.h>
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
static int data_debug_level;
@@ -62,6 +63,16 @@
u32 remote_mtu;
};
+static struct ib_qp_attr ipoib_cm_err_attr = {
+ .qp_state = IB_QPS_ERR
+};
+
+#define IPOIB_CM_RX_DRAIN_WRID 0x7fffffff
+
+static struct ib_recv_wr ipoib_cm_rx_drain_wr = {
+ .wr_id = IPOIB_CM_RX_DRAIN_WRID
+};
+
static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
struct ib_cm_event *event);
@@ -150,11 +161,44 @@
return NULL;
}
+static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv* priv)
+{
+ struct ib_recv_wr *bad_wr;
+
+ /* rx_drain_qp send queue depth is 1, so
+ * make sure we have at most 1 outstanding WR. */
+ if (list_empty(&priv->cm.rx_flush_list) ||
+ !list_empty(&priv->cm.rx_drain_list))
+ return;
+
+ if (ib_post_recv(priv->cm.rx_drain_qp, &ipoib_cm_rx_drain_wr, &bad_wr))
+ ipoib_warn(priv, "failed to post rx_drain wr\n");
+
+ list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
+}
+
+static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx)
+{
+ struct ipoib_cm_rx *p = ctx;
+ struct ipoib_dev_priv *priv = netdev_priv(p->dev);
+ unsigned long flags;
+
+ if (event->event != IB_EVENT_QP_LAST_WQE_REACHED)
+ return;
+
+ spin_lock_irqsave(&priv->lock, flags);
+ list_move(&p->list, &priv->cm.rx_flush_list);
+ p->state = IPOIB_CM_RX_FLUSH;
+ ipoib_cm_start_rx_drain(priv);
+ spin_unlock_irqrestore(&priv->lock, flags);
+}
+
static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
struct ipoib_cm_rx *p)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ib_qp_init_attr attr = {
+ .event_handler = ipoib_cm_rx_event_handler,
.send_cq = priv->cq, /* does not matter, we never send anything */
.recv_cq = priv->cq,
.srq = priv->cm.srq,
@@ -256,6 +300,7 @@
cm_id->context = p;
p->jiffies = jiffies;
+ p->state = IPOIB_CM_RX_LIVE;
spin_lock_irq(&priv->lock);
if (list_empty(&priv->cm.passive_ids))
queue_delayed_work(ipoib_workqueue,
@@ -277,7 +322,6 @@
{
struct ipoib_cm_rx *p;
struct ipoib_dev_priv *priv;
- int ret;
switch (event->event) {
case IB_CM_REQ_RECEIVED:
@@ -289,20 +333,9 @@
case IB_CM_REJ_RECEIVED:
p = cm_id->context;
priv = netdev_priv(p->dev);
- spin_lock_irq(&priv->lock);
- if (list_empty(&p->list))
- ret = 0; /* Connection is going away already. */
- else {
- list_del_init(&p->list);
- ret = -ECONNRESET;
- }
- spin_unlock_irq(&priv->lock);
- if (ret) {
- ib_destroy_qp(p->qp);
- kfree(p);
- return ret;
- }
- return 0;
+ if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
+ ipoib_warn(priv, "unable to move qp to error state\n");
+ /* Fall through */
default:
return 0;
}
@@ -354,8 +387,15 @@
wr_id, wc->status);
if (unlikely(wr_id >= ipoib_recvq_size)) {
- ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
- wr_id, ipoib_recvq_size);
+ if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~IPOIB_CM_OP_SRQ)) {
+ spin_lock_irqsave(&priv->lock, flags);
+ list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
+ ipoib_cm_start_rx_drain(priv);
+ queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
+ spin_unlock_irqrestore(&priv->lock, flags);
+ } else
+ ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
+ wr_id, ipoib_recvq_size);
return;
}
@@ -374,9 +414,9 @@
if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
spin_lock_irqsave(&priv->lock, flags);
p->jiffies = jiffies;
- /* Move this entry to list head, but do
- * not re-add it if it has been removed. */
- if (!list_empty(&p->list))
+ /* Move this entry to list head, but do not re-add it
+ * if it has been moved out of list. */
+ if (p->state == IPOIB_CM_RX_LIVE)
list_move(&p->list, &priv->cm.passive_ids);
spin_unlock_irqrestore(&priv->lock, flags);
}
@@ -583,17 +623,43 @@
int ipoib_cm_dev_open(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_qp_init_attr qp_init_attr = {
+ .send_cq = priv->cq, /* does not matter, we never send anything */
+ .recv_cq = priv->cq,
+ .cap.max_send_wr = 1, /* FIXME: 0 Seems not to work */
+ .cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
+ .cap.max_recv_wr = 1,
+ .cap.max_recv_sge = 1, /* FIXME: 0 Seems not to work */
+ .sq_sig_type = IB_SIGNAL_ALL_WR,
+ .qp_type = IB_QPT_UC,
+ };
int ret;
if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
return 0;
+ priv->cm.rx_drain_qp = ib_create_qp(priv->pd, &qp_init_attr);
+ if (IS_ERR(priv->cm.rx_drain_qp)) {
+ printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
+ ret = PTR_ERR(priv->cm.rx_drain_qp);
+ return ret;
+ }
+
+ /*
+ * We put the QP in error state directly. This way, a "flush
+ * error" WC will be immediately generated for each WR we post.
+ */
+ ret = ib_modify_qp(priv->cm.rx_drain_qp, &ipoib_cm_err_attr, IB_QP_STATE);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify drain QP to error: %d\n", ret);
+ goto err_qp;
+ }
+
priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
if (IS_ERR(priv->cm.id)) {
printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
ret = PTR_ERR(priv->cm.id);
- priv->cm.id = NULL;
- return ret;
+ goto err_cm;
}
ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
@@ -601,35 +667,79 @@
if (ret) {
printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
IPOIB_CM_IETF_ID | priv->qp->qp_num);
- ib_destroy_cm_id(priv->cm.id);
- priv->cm.id = NULL;
- return ret;
+ goto err_listen;
}
+
return 0;
+
+err_listen:
+ ib_destroy_cm_id(priv->cm.id);
+err_cm:
+ priv->cm.id = NULL;
+err_qp:
+ ib_destroy_qp(priv->cm.rx_drain_qp);
+ return ret;
}
void ipoib_cm_dev_stop(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
- struct ipoib_cm_rx *p;
+ struct ipoib_cm_rx *p, *n;
+ unsigned long begin;
+ LIST_HEAD(list);
+ int ret;
if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id)
return;
ib_destroy_cm_id(priv->cm.id);
priv->cm.id = NULL;
+
spin_lock_irq(&priv->lock);
while (!list_empty(&priv->cm.passive_ids)) {
p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
- list_del_init(&p->list);
+ list_move(&p->list, &priv->cm.rx_error_list);
+ p->state = IPOIB_CM_RX_ERROR;
spin_unlock_irq(&priv->lock);
+ ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
+ if (ret)
+ ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
+ spin_lock_irq(&priv->lock);
+ }
+
+ /* Wait for all RX to be drained */
+ begin = jiffies;
+
+ while (!list_empty(&priv->cm.rx_error_list) ||
+ !list_empty(&priv->cm.rx_flush_list) ||
+ !list_empty(&priv->cm.rx_drain_list)) {
+ if (!time_after(jiffies, begin + 5 * HZ)) {
+ ipoib_warn(priv, "RX drain timing out\n");
+
+ /*
+ * assume the HW is wedged and just free up everything.
+ */
+ list_splice_init(&priv->cm.rx_flush_list, &list);
+ list_splice_init(&priv->cm.rx_error_list, &list);
+ list_splice_init(&priv->cm.rx_drain_list, &list);
+ break;
+ }
+ spin_unlock_irq(&priv->lock);
+ msleep(1);
+ spin_lock_irq(&priv->lock);
+ }
+
+ list_splice_init(&priv->cm.rx_reap_list, &list);
+
+ spin_unlock_irq(&priv->lock);
+
+ list_for_each_entry_safe(p, n, &list, list) {
ib_destroy_cm_id(p->id);
ib_destroy_qp(p->qp);
kfree(p);
- spin_lock_irq(&priv->lock);
}
- spin_unlock_irq(&priv->lock);
+ ib_destroy_qp(priv->cm.rx_drain_qp);
cancel_delayed_work(&priv->cm.stale_task);
}
@@ -1079,24 +1189,44 @@
queue_work(ipoib_workqueue, &priv->cm.skb_task);
}
+static void ipoib_cm_rx_reap(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+ cm.rx_reap_task);
+ struct ipoib_cm_rx *p, *n;
+ LIST_HEAD(list);
+
+ spin_lock_irq(&priv->lock);
+ list_splice_init(&priv->cm.rx_reap_list, &list);
+ spin_unlock_irq(&priv->lock);
+
+ list_for_each_entry_safe(p, n, &list, list) {
+ ib_destroy_cm_id(p->id);
+ ib_destroy_qp(p->qp);
+ kfree(p);
+ }
+}
+
static void ipoib_cm_stale_task(struct work_struct *work)
{
struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
cm.stale_task.work);
struct ipoib_cm_rx *p;
+ int ret;
spin_lock_irq(&priv->lock);
while (!list_empty(&priv->cm.passive_ids)) {
- /* List if sorted by LRU, start from tail,
+ /* List is sorted by LRU, start from tail,
* stop when we see a recently used entry */
p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
break;
- list_del_init(&p->list);
+ list_move(&p->list, &priv->cm.rx_error_list);
+ p->state = IPOIB_CM_RX_ERROR;
spin_unlock_irq(&priv->lock);
- ib_destroy_cm_id(p->id);
- ib_destroy_qp(p->qp);
- kfree(p);
+ ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
+ if (ret)
+ ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
spin_lock_irq(&priv->lock);
}
@@ -1164,9 +1294,14 @@
INIT_LIST_HEAD(&priv->cm.passive_ids);
INIT_LIST_HEAD(&priv->cm.reap_list);
INIT_LIST_HEAD(&priv->cm.start_list);
+ INIT_LIST_HEAD(&priv->cm.rx_error_list);
+ INIT_LIST_HEAD(&priv->cm.rx_flush_list);
+ INIT_LIST_HEAD(&priv->cm.rx_drain_list);
+ INIT_LIST_HEAD(&priv->cm.rx_reap_list);
INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap);
+ INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap);
INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);
skb_queue_head_init(&priv->cm.skb_queue);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 68d72c6..c1aad06 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -448,6 +448,13 @@
struct ipoib_dev_priv *priv = netdev_priv(dev);
int ret;
+ if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &priv->pkey_index)) {
+ ipoib_warn(priv, "P_Key 0x%04x not found\n", priv->pkey);
+ clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+ return -1;
+ }
+ set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+
ret = ipoib_init_qp(dev);
if (ret) {
ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret);
@@ -457,14 +464,14 @@
ret = ipoib_ib_post_receives(dev);
if (ret) {
ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
- ipoib_ib_dev_stop(dev);
+ ipoib_ib_dev_stop(dev, 1);
return -1;
}
ret = ipoib_cm_dev_open(dev);
if (ret) {
- ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
- ipoib_ib_dev_stop(dev);
+ ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret);
+ ipoib_ib_dev_stop(dev, 1);
return -1;
}
@@ -516,7 +523,7 @@
if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
mutex_lock(&pkey_mutex);
set_bit(IPOIB_PKEY_STOP, &priv->flags);
- cancel_delayed_work(&priv->pkey_task);
+ cancel_delayed_work(&priv->pkey_poll_task);
mutex_unlock(&pkey_mutex);
if (flush)
flush_workqueue(ipoib_workqueue);
@@ -543,7 +550,7 @@
return pending;
}
-int ipoib_ib_dev_stop(struct net_device *dev)
+int ipoib_ib_dev_stop(struct net_device *dev, int flush)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ib_qp_attr qp_attr;
@@ -629,7 +636,8 @@
/* Wait for all AHs to be reaped */
set_bit(IPOIB_STOP_REAPER, &priv->flags);
cancel_delayed_work(&priv->ah_reap_task);
- flush_workqueue(ipoib_workqueue);
+ if (flush)
+ flush_workqueue(ipoib_workqueue);
begin = jiffies;
@@ -673,13 +681,24 @@
return 0;
}
-void ipoib_ib_dev_flush(struct work_struct *work)
+static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
{
- struct ipoib_dev_priv *cpriv, *priv =
- container_of(work, struct ipoib_dev_priv, flush_task);
+ struct ipoib_dev_priv *cpriv;
struct net_device *dev = priv->dev;
+ u16 new_index;
- if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) ) {
+ mutex_lock(&priv->vlan_mutex);
+
+ /*
+ * Flush any child interfaces too -- they might be up even if
+ * the parent is down.
+ */
+ list_for_each_entry(cpriv, &priv->child_intfs, list)
+ __ipoib_ib_dev_flush(cpriv, pkey_event);
+
+ mutex_unlock(&priv->vlan_mutex);
+
+ if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) {
ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
return;
}
@@ -689,10 +708,32 @@
return;
}
+ if (pkey_event) {
+ if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
+ clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+ ipoib_ib_dev_down(dev, 0);
+ ipoib_pkey_dev_delay_open(dev);
+ return;
+ }
+ set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+
+ /* restart QP only if P_Key index is changed */
+ if (new_index == priv->pkey_index) {
+ ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n");
+ return;
+ }
+ priv->pkey_index = new_index;
+ }
+
ipoib_dbg(priv, "flushing\n");
ipoib_ib_dev_down(dev, 0);
+ if (pkey_event) {
+ ipoib_ib_dev_stop(dev, 0);
+ ipoib_ib_dev_open(dev);
+ }
+
/*
* The device could have been brought down between the start and when
* we get here, don't bring it back up if it's not configured up
@@ -701,14 +742,24 @@
ipoib_ib_dev_up(dev);
ipoib_mcast_restart_task(&priv->restart_task);
}
+}
- mutex_lock(&priv->vlan_mutex);
+void ipoib_ib_dev_flush(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv =
+ container_of(work, struct ipoib_dev_priv, flush_task);
- /* Flush any child interfaces too */
- list_for_each_entry(cpriv, &priv->child_intfs, list)
- ipoib_ib_dev_flush(&cpriv->flush_task);
+ ipoib_dbg(priv, "Flushing %s\n", priv->dev->name);
+ __ipoib_ib_dev_flush(priv, 0);
+}
- mutex_unlock(&priv->vlan_mutex);
+void ipoib_pkey_event(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv =
+ container_of(work, struct ipoib_dev_priv, pkey_event_task);
+
+ ipoib_dbg(priv, "Flushing %s and restarting its QP\n", priv->dev->name);
+ __ipoib_ib_dev_flush(priv, 1);
}
void ipoib_ib_dev_cleanup(struct net_device *dev)
@@ -736,7 +787,7 @@
void ipoib_pkey_poll(struct work_struct *work)
{
struct ipoib_dev_priv *priv =
- container_of(work, struct ipoib_dev_priv, pkey_task.work);
+ container_of(work, struct ipoib_dev_priv, pkey_poll_task.work);
struct net_device *dev = priv->dev;
ipoib_pkey_dev_check_presence(dev);
@@ -747,7 +798,7 @@
mutex_lock(&pkey_mutex);
if (!test_bit(IPOIB_PKEY_STOP, &priv->flags))
queue_delayed_work(ipoib_workqueue,
- &priv->pkey_task,
+ &priv->pkey_poll_task,
HZ);
mutex_unlock(&pkey_mutex);
}
@@ -766,7 +817,7 @@
mutex_lock(&pkey_mutex);
clear_bit(IPOIB_PKEY_STOP, &priv->flags);
queue_delayed_work(ipoib_workqueue,
- &priv->pkey_task,
+ &priv->pkey_poll_task,
HZ);
mutex_unlock(&pkey_mutex);
return 1;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 0a428f2..894b1dcd 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -107,7 +107,7 @@
return -EINVAL;
if (ipoib_ib_dev_up(dev)) {
- ipoib_ib_dev_stop(dev);
+ ipoib_ib_dev_stop(dev, 1);
return -EINVAL;
}
@@ -152,7 +152,7 @@
flush_workqueue(ipoib_workqueue);
ipoib_ib_dev_down(dev, 1);
- ipoib_ib_dev_stop(dev);
+ ipoib_ib_dev_stop(dev, 1);
if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
struct ipoib_dev_priv *cpriv;
@@ -988,7 +988,8 @@
INIT_LIST_HEAD(&priv->dead_ahs);
INIT_LIST_HEAD(&priv->multicast_list);
- INIT_DELAYED_WORK(&priv->pkey_task, ipoib_pkey_poll);
+ INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
+ INIT_WORK(&priv->pkey_event_task, ipoib_pkey_event);
INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task);
INIT_WORK(&priv->flush_task, ipoib_ib_dev_flush);
INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 54fbead..aae3670 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -524,7 +524,7 @@
return;
if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid))
- ipoib_warn(priv, "ib_gid_entry_get() failed\n");
+ ipoib_warn(priv, "ib_query_gid() failed\n");
else
memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 5c3c6a4..982eb88 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -33,8 +33,6 @@
* $Id: ipoib_verbs.c 1349 2004-12-16 21:09:43Z roland $
*/
-#include <rdma/ib_cache.h>
-
#include "ipoib.h"
int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
@@ -49,7 +47,7 @@
if (!qp_attr)
goto out;
- if (ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) {
+ if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) {
clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
ret = -ENXIO;
goto out;
@@ -94,26 +92,16 @@
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
int ret;
- u16 pkey_index;
struct ib_qp_attr qp_attr;
int attr_mask;
- /*
- * Search through the port P_Key table for the requested pkey value.
- * The port has to be assigned to the respective IB partition in
- * advance.
- */
- ret = ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index);
- if (ret) {
- clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
- return ret;
- }
- set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+ if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
+ return -1;
qp_attr.qp_state = IB_QPS_INIT;
qp_attr.qkey = 0;
qp_attr.port_num = priv->port;
- qp_attr.pkey_index = pkey_index;
+ qp_attr.pkey_index = priv->pkey_index;
attr_mask =
IB_QP_QKEY |
IB_QP_PORT |
@@ -185,7 +173,7 @@
size = ipoib_sendq_size + ipoib_recvq_size + 1;
ret = ipoib_cm_dev_init(dev);
if (!ret)
- size += ipoib_recvq_size;
+ size += ipoib_recvq_size + 1 /* 1 extra for rx_drain_qp */;
priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0);
if (IS_ERR(priv->cq)) {
@@ -259,14 +247,18 @@
struct ipoib_dev_priv *priv =
container_of(handler, struct ipoib_dev_priv, event_handler);
- if ((record->event == IB_EVENT_PORT_ERR ||
- record->event == IB_EVENT_PKEY_CHANGE ||
- record->event == IB_EVENT_PORT_ACTIVE ||
- record->event == IB_EVENT_LID_CHANGE ||
- record->event == IB_EVENT_SM_CHANGE ||
- record->event == IB_EVENT_CLIENT_REREGISTER) &&
- record->element.port_num == priv->port) {
+ if (record->element.port_num != priv->port)
+ return;
+
+ if (record->event == IB_EVENT_PORT_ERR ||
+ record->event == IB_EVENT_PORT_ACTIVE ||
+ record->event == IB_EVENT_LID_CHANGE ||
+ record->event == IB_EVENT_SM_CHANGE ||
+ record->event == IB_EVENT_CLIENT_REREGISTER) {
ipoib_dbg(priv, "Port state change event\n");
queue_work(ipoib_workqueue, &priv->flush_task);
+ } else if (record->event == IB_EVENT_PKEY_CHANGE) {
+ ipoib_dbg(priv, "P_Key change event on port:%d\n", priv->port);
+ queue_work(ipoib_workqueue, &priv->pkey_event_task);
}
}
diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c
index c427173..cfa5cc0 100644
--- a/drivers/net/mlx4/fw.c
+++ b/drivers/net/mlx4/fw.c
@@ -90,7 +90,7 @@
int i;
mlx4_dbg(dev, "DEV_CAP flags:\n");
- for (i = 0; i < 32; ++i)
+ for (i = 0; i < ARRAY_SIZE(fname); ++i)
if (fname[i] && (flags & (1 << i)))
mlx4_dbg(dev, " %s\n", fname[i]);
}
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 47cefca..0627a6a 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -890,6 +890,8 @@
spinlock_t client_data_lock;
struct ib_cache cache;
+ int *pkey_tbl_len;
+ int *gid_tbl_len;
u32 flags;
@@ -1118,6 +1120,12 @@
u8 port_num, int port_modify_mask,
struct ib_port_modify *port_modify);
+int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+ u8 *port_num, u16 *index);
+
+int ib_find_pkey(struct ib_device *device,
+ u8 port_num, u16 pkey, u16 *index);
+
/**
* ib_alloc_pd - Allocates an unused protection domain.
* @device: The device on which to allocate the protection domain.