Merge branch 'for-linus' of master.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband

* 'for-linus' of master.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband:
  IB/cm: Improve local id allocation
  IPoIB/cm: Fix SRQ WR leak
  IB/ipoib: Fix typos in error messages
  IB/mlx4: Check if SRQ is full when posting receive
  IB/mlx4: Pass send queue sizes from userspace to kernel
  IB/mlx4: Fix check of opcode in mlx4_ib_post_send()
  mlx4_core: Fix array overrun in dump_dev_cap_flags()
  IB/mlx4: Fix RESET to RESET and RESET to ERROR transitions
  IB/mthca: Fix RESET to ERROR transition
  IB/mlx4: Set GRH:HopLimit when sending globally routed MADs
  IB/mthca: Set GRH:HopLimit when building MLX headers
  IB/mlx4: Fix check of max_qp_dest_rdma in modify QP
  IB/mthca: Fix use-after-free on device restart
  IB/ehca: Return proper error code if register_mr fails
  IPoIB: Handle P_Key table reordering
  IB/core: Use start_port() and end_port()
  IB/core: Add helpers for uncached GID and P_Key searches
  IB/ipath: Fix potential deadlock with multicast spinlocks
  IB/core: Free umem when mm is already gone
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index eff591d..e840434 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -306,7 +306,9 @@
 	do {
 		spin_lock_irqsave(&cm.lock, flags);
 		ret = idr_get_new_above(&cm.local_id_table, cm_id_priv,
-					next_id++, &id);
+					next_id, &id);
+		if (!ret)
+			next_id = ((unsigned) id + 1) & MAX_ID_MASK;
 		spin_unlock_irqrestore(&cm.lock, flags);
 	} while( (ret == -EAGAIN) && idr_pre_get(&cm.local_id_table, GFP_KERNEL) );
 
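A note on the allocation loop above: advancing next_id only when idr_get_new_above() succeeds (and masking with MAX_ID_MASK so it wraps) means failed attempts no longer consume IDs, and consecutive CM IDs are handed out from a moving hint instead of clustering. A minimal sketch of the same 2.6.22-era idr pattern, assuming a table, lock and hint like cm.c's (illustrative only, not compiled):

	static int alloc_id(struct idr *table, spinlock_t *lock,
			    void *ptr, u32 *next_id, int *id)
	{
		unsigned long flags;
		int ret;

		do {
			spin_lock_irqsave(lock, flags);
			ret = idr_get_new_above(table, ptr, *next_id, id);
			if (!ret)	/* advance the hint only on success */
				*next_id = ((unsigned) *id + 1) & MAX_ID_MASK;
			spin_unlock_irqrestore(lock, flags);
			/* -EAGAIN: preallocated idr layer used up; refill, retry */
		} while (ret == -EAGAIN && idr_pre_get(table, GFP_KERNEL));

		return ret;
	}
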
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index bcecf4d..3ada17c 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -150,6 +150,18 @@
 	return 0;
 }
 
+static int start_port(struct ib_device *device)
+{
+	return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1;
+}
+
+
+static int end_port(struct ib_device *device)
+{
+	return (device->node_type == RDMA_NODE_IB_SWITCH) ?
+		0 : device->phys_port_cnt;
+}
+
 /**
  * ib_alloc_device - allocate an IB device struct
  * @size:size of structure to allocate
@@ -209,6 +221,45 @@
 	return 0;
 }
 
+static int read_port_table_lengths(struct ib_device *device)
+{
+	struct ib_port_attr *tprops = NULL;
+	int num_ports, ret = -ENOMEM;
+	u8 port_index;
+
+	tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
+	if (!tprops)
+		goto out;
+
+	num_ports = end_port(device) - start_port(device) + 1;
+
+	device->pkey_tbl_len = kmalloc(sizeof *device->pkey_tbl_len * num_ports,
+				       GFP_KERNEL);
+	device->gid_tbl_len = kmalloc(sizeof *device->gid_tbl_len * num_ports,
+				      GFP_KERNEL);
+	if (!device->pkey_tbl_len || !device->gid_tbl_len)
+		goto err;
+
+	for (port_index = 0; port_index < num_ports; ++port_index) {
+		ret = ib_query_port(device, port_index + start_port(device),
+					tprops);
+		if (ret)
+			goto err;
+		device->pkey_tbl_len[port_index] = tprops->pkey_tbl_len;
+		device->gid_tbl_len[port_index]  = tprops->gid_tbl_len;
+	}
+
+	ret = 0;
+	goto out;
+
+err:
+	kfree(device->gid_tbl_len);
+	kfree(device->pkey_tbl_len);
+out:
+	kfree(tprops);
+	return ret;
+}
+
 /**
  * ib_register_device - Register an IB device with IB core
  * @device:Device to register
@@ -240,10 +291,19 @@
 	spin_lock_init(&device->event_handler_lock);
 	spin_lock_init(&device->client_data_lock);
 
+	ret = read_port_table_lengths(device);
+	if (ret) {
+		printk(KERN_WARNING "Couldn't create table lengths cache for device %s\n",
+		       device->name);
+		goto out;
+	}
+
 	ret = ib_device_register_sysfs(device);
 	if (ret) {
 		printk(KERN_WARNING "Couldn't register device %s with driver model\n",
 		       device->name);
+		kfree(device->gid_tbl_len);
+		kfree(device->pkey_tbl_len);
 		goto out;
 	}
 
@@ -285,6 +345,9 @@
 
 	list_del(&device->core_list);
 
+	kfree(device->gid_tbl_len);
+	kfree(device->pkey_tbl_len);
+
 	mutex_unlock(&device_mutex);
 
 	spin_lock_irqsave(&device->client_data_lock, flags);
@@ -507,10 +570,7 @@
 		  u8 port_num,
 		  struct ib_port_attr *port_attr)
 {
-	if (device->node_type == RDMA_NODE_IB_SWITCH) {
-		if (port_num)
-			return -EINVAL;
-	} else if (port_num < 1 || port_num > device->phys_port_cnt)
+	if (port_num < start_port(device) || port_num > end_port(device))
 		return -EINVAL;
 
 	return device->query_port(device, port_num, port_attr);
@@ -582,10 +642,7 @@
 		   u8 port_num, int port_modify_mask,
 		   struct ib_port_modify *port_modify)
 {
-	if (device->node_type == RDMA_NODE_IB_SWITCH) {
-		if (port_num)
-			return -EINVAL;
-	} else if (port_num < 1 || port_num > device->phys_port_cnt)
+	if (port_num < start_port(device) || port_num > end_port(device))
 		return -EINVAL;
 
 	return device->modify_port(device, port_num, port_modify_mask,
@@ -593,6 +650,68 @@
 }
 EXPORT_SYMBOL(ib_modify_port);
 
+/**
+ * ib_find_gid - Returns the port number and GID table index where
+ *   a specified GID value occurs.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port_num: The port number of the device where the GID value was found.
+ * @index: The index into the GID table where the GID was found.  This
+ *   parameter may be NULL.
+ */
+int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+		u8 *port_num, u16 *index)
+{
+	union ib_gid tmp_gid;
+	int ret, port, i;
+
+	for (port = start_port(device); port <= end_port(device); ++port) {
+		for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) {
+			ret = ib_query_gid(device, port, i, &tmp_gid);
+			if (ret)
+				return ret;
+			if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
+				*port_num = port;
+				if (index)
+					*index = i;
+				return 0;
+			}
+		}
+	}
+
+	return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_gid);
+
+/**
+ * ib_find_pkey - Returns the PKey table index where a specified
+ *   PKey value occurs.
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the PKey table where the PKey was found.
+ */
+int ib_find_pkey(struct ib_device *device,
+		 u8 port_num, u16 pkey, u16 *index)
+{
+	int ret, i;
+	u16 tmp_pkey;
+
+	for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) {
+		ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
+		if (ret)
+			return ret;
+
+		if (pkey == tmp_pkey) {
+			*index = i;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_pkey);
+
 static int __init ib_core_init(void)
 {
 	int ret;
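The two helpers exported above deliberately bypass the cached tables (ib_find_cached_pkey() and friends) and go to the device via ib_query_gid()/ib_query_pkey(), bounded by the per-port table lengths captured at registration time. A hedged sketch of how a consumer might combine them (hypothetical caller, not part of this patch; the GID index argument may be NULL, as the kerneldoc above notes):

	static int resolve_port_and_pkey(struct ib_device *device,
					 union ib_gid *gid, u16 pkey,
					 u8 *port_num, u16 *pkey_index)
	{
		int ret;

		/* Which port owns this GID?  -ENOENT if none does. */
		ret = ib_find_gid(device, gid, port_num, NULL);
		if (ret)
			return ret;

		/* Then locate the P_Key index on that port. */
		return ib_find_pkey(device, *port_num, pkey, pkey_index);
	}
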
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 96a16c0..b4aec51 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -210,8 +210,10 @@
 	__ib_umem_release(umem->context->device, umem, 1);
 
 	mm = get_task_mm(current);
-	if (!mm)
+	if (!mm) {
+		kfree(umem);
 		return;
+	}
 
 	diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
 
diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c
index 84c5bb4..add79bd 100644
--- a/drivers/infiniband/hw/ehca/ehca_mrmw.c
+++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c
@@ -2050,13 +2050,10 @@
 	switch (hipz_rc) {
 	case H_SUCCESS:	             /* successful completion */
 		return 0;
-	case H_ADAPTER_PARM:         /* invalid adapter handle */
-	case H_RT_PARM:              /* invalid resource type */
 	case H_NOT_ENOUGH_RESOURCES: /* insufficient resources */
-	case H_MLENGTH_PARM:         /* invalid memory length */
-	case H_MEM_ACCESS_PARM:      /* invalid access controls */
 	case H_CONSTRAINED:          /* resource constraint */
-		return -EINVAL;
+	case H_NO_MEM:
+		return -ENOMEM;
 	case H_BUSY:                 /* long busy */
 		return -EBUSY;
 	default:
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
index 085e28b..dd691cf 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
+++ b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
@@ -165,10 +165,9 @@
 {
 	struct rb_node **n = &mcast_tree.rb_node;
 	struct rb_node *pn = NULL;
-	unsigned long flags;
 	int ret;
 
-	spin_lock_irqsave(&mcast_lock, flags);
+	spin_lock_irq(&mcast_lock);
 
 	while (*n) {
 		struct ipath_mcast *tmcast;
@@ -228,7 +227,7 @@
 	ret = 0;
 
 bail:
-	spin_unlock_irqrestore(&mcast_lock, flags);
+	spin_unlock_irq(&mcast_lock);
 
 	return ret;
 }
@@ -289,17 +288,16 @@
 	struct ipath_mcast *mcast = NULL;
 	struct ipath_mcast_qp *p, *tmp;
 	struct rb_node *n;
-	unsigned long flags;
 	int last = 0;
 	int ret;
 
-	spin_lock_irqsave(&mcast_lock, flags);
+	spin_lock_irq(&mcast_lock);
 
 	/* Find the GID in the mcast table. */
 	n = mcast_tree.rb_node;
 	while (1) {
 		if (n == NULL) {
-			spin_unlock_irqrestore(&mcast_lock, flags);
+			spin_unlock_irq(&mcast_lock);
 			ret = -EINVAL;
 			goto bail;
 		}
@@ -334,7 +332,7 @@
 		break;
 	}
 
-	spin_unlock_irqrestore(&mcast_lock, flags);
+	spin_unlock_irq(&mcast_lock);
 
 	if (p) {
 		/*
@@ -348,9 +346,9 @@
 		atomic_dec(&mcast->refcount);
 		wait_event(mcast->wait, !atomic_read(&mcast->refcount));
 		ipath_mcast_free(mcast);
-		spin_lock(&dev->n_mcast_grps_lock);
+		spin_lock_irq(&dev->n_mcast_grps_lock);
 		dev->n_mcast_grps_allocated--;
-		spin_unlock(&dev->n_mcast_grps_lock);
+		spin_unlock_irq(&dev->n_mcast_grps_lock);
 	}
 
 	ret = 0;
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 5cd7069..a824bc5 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -188,14 +188,32 @@
 	}
 }
 
-static int set_qp_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
-		       enum ib_qp_type type, struct mlx4_ib_qp *qp)
+static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+		       struct mlx4_ib_qp *qp)
 {
-	/* Sanity check QP size before proceeding */
+	/* Sanity check RQ size before proceeding */
+	if (cap->max_recv_wr  > dev->dev->caps.max_wqes  ||
+	    cap->max_recv_sge > dev->dev->caps.max_rq_sg)
+		return -EINVAL;
+
+	qp->rq.max = cap->max_recv_wr ? roundup_pow_of_two(cap->max_recv_wr) : 0;
+
+	qp->rq.wqe_shift = ilog2(roundup_pow_of_two(cap->max_recv_sge *
+						    sizeof (struct mlx4_wqe_data_seg)));
+	qp->rq.max_gs    = (1 << qp->rq.wqe_shift) / sizeof (struct mlx4_wqe_data_seg);
+
+	cap->max_recv_wr  = qp->rq.max;
+	cap->max_recv_sge = qp->rq.max_gs;
+
+	return 0;
+}
+
+static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+			      enum ib_qp_type type, struct mlx4_ib_qp *qp)
+{
+	/* Sanity check SQ size before proceeding */
 	if (cap->max_send_wr	 > dev->dev->caps.max_wqes  ||
-	    cap->max_recv_wr	 > dev->dev->caps.max_wqes  ||
 	    cap->max_send_sge	 > dev->dev->caps.max_sq_sg ||
-	    cap->max_recv_sge	 > dev->dev->caps.max_rq_sg ||
 	    cap->max_inline_data + send_wqe_overhead(type) +
 	    sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz)
 		return -EINVAL;
@@ -208,12 +226,7 @@
 	    cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
 		return -EINVAL;
 
-	qp->rq.max = cap->max_recv_wr ? roundup_pow_of_two(cap->max_recv_wr) : 0;
-	qp->sq.max = cap->max_send_wr ? roundup_pow_of_two(cap->max_send_wr) : 0;
-
-	qp->rq.wqe_shift = ilog2(roundup_pow_of_two(cap->max_recv_sge *
-						    sizeof (struct mlx4_wqe_data_seg)));
-	qp->rq.max_gs    = (1 << qp->rq.wqe_shift) / sizeof (struct mlx4_wqe_data_seg);
+	qp->sq.max = cap->max_send_wr ? roundup_pow_of_two(cap->max_send_wr) : 1;
 
 	qp->sq.wqe_shift = ilog2(roundup_pow_of_two(max(cap->max_send_sge *
 							sizeof (struct mlx4_wqe_data_seg),
@@ -233,16 +246,26 @@
 		qp->sq.offset = 0;
 	}
 
-	cap->max_send_wr  = qp->sq.max;
-	cap->max_recv_wr  = qp->rq.max;
-	cap->max_send_sge = qp->sq.max_gs;
-	cap->max_recv_sge = qp->rq.max_gs;
+	cap->max_send_wr     = qp->sq.max;
+	cap->max_send_sge    = qp->sq.max_gs;
 	cap->max_inline_data = (1 << qp->sq.wqe_shift) - send_wqe_overhead(type) -
 		sizeof (struct mlx4_wqe_inline_seg);
 
 	return 0;
 }
 
+static int set_user_sq_size(struct mlx4_ib_qp *qp,
+			    struct mlx4_ib_create_qp *ucmd)
+{
+	qp->sq.max       = 1 << ucmd->log_sq_bb_count;
+	qp->sq.wqe_shift = ucmd->log_sq_stride;
+
+	qp->buf_size = (qp->rq.max << qp->rq.wqe_shift) +
+		(qp->sq.max << qp->sq.wqe_shift);
+
+	return 0;
+}
+
 static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 			    struct ib_qp_init_attr *init_attr,
 			    struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp)
@@ -264,7 +287,7 @@
 	qp->sq.head	    = 0;
 	qp->sq.tail	    = 0;
 
-	err = set_qp_size(dev, &init_attr->cap, init_attr->qp_type, qp);
+	err = set_rq_size(dev, &init_attr->cap, qp);
 	if (err)
 		goto err;
 
@@ -276,6 +299,10 @@
 			goto err;
 		}
 
+		err = set_user_sq_size(qp, &ucmd);
+		if (err)
+			goto err;
+
 		qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
 				       qp->buf_size, 0);
 		if (IS_ERR(qp->umem)) {
@@ -297,6 +324,10 @@
 		if (err)
 			goto err_mtt;
 	} else {
+		err = set_kernel_sq_size(dev, &init_attr->cap, init_attr->qp_type, qp);
+		if (err)
+			goto err;
+
 		err = mlx4_ib_db_alloc(dev, &qp->db, 0);
 		if (err)
 			goto err;
@@ -573,7 +604,7 @@
 	}
 }
 
-static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, struct ib_qp_attr *attr,
+static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr,
 				   int attr_mask)
 {
 	u8 dest_rd_atomic;
@@ -603,7 +634,7 @@
 	return cpu_to_be32(hw_access_flags);
 }
 
-static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, struct ib_qp_attr *attr,
+static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, const struct ib_qp_attr *attr,
 			    int attr_mask)
 {
 	if (attr_mask & IB_QP_PKEY_INDEX)
@@ -619,7 +650,7 @@
 	path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6);
 }
 
-static int mlx4_set_path(struct mlx4_ib_dev *dev, struct ib_ah_attr *ah,
+static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
 			 struct mlx4_qp_path *path, u8 port)
 {
 	path->grh_mylmc     = ah->src_path_bits & 0x7f;
@@ -655,14 +686,14 @@
 	return 0;
 }
 
-int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-		      int attr_mask, struct ib_udata *udata)
+static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
+			       const struct ib_qp_attr *attr, int attr_mask,
+			       enum ib_qp_state cur_state, enum ib_qp_state new_state)
 {
 	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
 	struct mlx4_ib_qp *qp = to_mqp(ibqp);
 	struct mlx4_qp_context *context;
 	enum mlx4_qp_optpar optpar = 0;
-	enum ib_qp_state cur_state, new_state;
 	int sqd_event;
 	int err = -EINVAL;
 
@@ -670,34 +701,6 @@
 	if (!context)
 		return -ENOMEM;
 
-	mutex_lock(&qp->mutex);
-
-	cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
-	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
-
-	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask))
-		goto out;
-
-	if ((attr_mask & IB_QP_PKEY_INDEX) &&
-	     attr->pkey_index >= dev->dev->caps.pkey_table_len) {
-		goto out;
-	}
-
-	if ((attr_mask & IB_QP_PORT) &&
-	    (attr->port_num == 0 || attr->port_num > dev->dev->caps.num_ports)) {
-		goto out;
-	}
-
-	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
-	    attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) {
-		goto out;
-	}
-
-	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
-	    attr->max_dest_rd_atomic > 1 << dev->dev->caps.max_qp_dest_rdma) {
-		goto out;
-	}
-
 	context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) |
 				     (to_mlx4_st(ibqp->qp_type) << 16));
 	context->flags     |= cpu_to_be32(1 << 8); /* DE? */
@@ -920,11 +923,84 @@
 	}
 
 out:
-	mutex_unlock(&qp->mutex);
 	kfree(context);
 	return err;
 }
 
+static const struct ib_qp_attr mlx4_ib_qp_attr = { .port_num = 1 };
+static const int mlx4_ib_qp_attr_mask_table[IB_QPT_UD + 1] = {
+		[IB_QPT_UD]  = (IB_QP_PKEY_INDEX		|
+				IB_QP_PORT			|
+				IB_QP_QKEY),
+		[IB_QPT_UC]  = (IB_QP_PKEY_INDEX		|
+				IB_QP_PORT			|
+				IB_QP_ACCESS_FLAGS),
+		[IB_QPT_RC]  = (IB_QP_PKEY_INDEX		|
+				IB_QP_PORT			|
+				IB_QP_ACCESS_FLAGS),
+		[IB_QPT_SMI] = (IB_QP_PKEY_INDEX		|
+				IB_QP_QKEY),
+		[IB_QPT_GSI] = (IB_QP_PKEY_INDEX		|
+				IB_QP_QKEY),
+};
+
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		      int attr_mask, struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+	struct mlx4_ib_qp *qp = to_mqp(ibqp);
+	enum ib_qp_state cur_state, new_state;
+	int err = -EINVAL;
+
+	mutex_lock(&qp->mutex);
+
+	cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
+	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask))
+		goto out;
+
+	if ((attr_mask & IB_QP_PKEY_INDEX) &&
+	     attr->pkey_index >= dev->dev->caps.pkey_table_len) {
+		goto out;
+	}
+
+	if ((attr_mask & IB_QP_PORT) &&
+	    (attr->port_num == 0 || attr->port_num > dev->dev->caps.num_ports)) {
+		goto out;
+	}
+
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+	    attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) {
+		goto out;
+	}
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+	    attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) {
+		goto out;
+	}
+
+	if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+		err = 0;
+		goto out;
+	}
+
+	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_ERR) {
+		err = __mlx4_ib_modify_qp(ibqp, &mlx4_ib_qp_attr,
+					  mlx4_ib_qp_attr_mask_table[ibqp->qp_type],
+					  IB_QPS_RESET, IB_QPS_INIT);
+		if (err)
+			goto out;
+		cur_state = IB_QPS_INIT;
+	}
+
+	err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
+
+out:
+	mutex_unlock(&qp->mutex);
+	return err;
+}
+
 static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
 			    void *wqe)
 {
@@ -952,6 +1028,7 @@
 			(be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 20) & 0xff;
 		sqp->ud_header.grh.flow_label    =
 			ah->av.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+		sqp->ud_header.grh.hop_limit     = ah->av.hop_limit;
 		ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.port_pd) >> 24,
 				  ah->av.gid_index, &sqp->ud_header.grh.source_gid);
 		memcpy(sqp->ud_header.grh.destination_gid.raw,
@@ -1192,7 +1269,7 @@
 		 */
 		wmb();
 
-		if (wr->opcode < 0 || wr->opcode > ARRAY_SIZE(mlx4_ib_opcode)) {
+		if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) {
 			err = -EINVAL;
 			goto out;
 		}
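The RESET-to-ERROR fix above works around hardware that has no direct RESET->ERR transition: the driver quietly inserts a RESET->INIT step, using the dummy attribute set (mlx4_ib_qp_attr plus the per-QP-type mandatory attribute mask) before carrying out INIT->ERR. From the consumer's point of view this makes the plain verbs call legal on a fresh QP; a hedged sketch of that caller-side usage, which is what the IPoIB CM changes later in this merge rely on:

	static int move_fresh_qp_to_error(struct ib_qp *qp)
	{
		struct ib_qp_attr attr = {
			.qp_state = IB_QPS_ERR,
		};

		/* The driver now does RESET -> INIT -> ERR internally. */
		return ib_modify_qp(qp, &attr, IB_QP_STATE);
	}
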
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
index 42ab4a8..12fac1c 100644
--- a/drivers/infiniband/hw/mlx4/srq.c
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -297,6 +297,12 @@
 			break;
 		}
 
+		if (unlikely(srq->head == srq->tail)) {
+			err = -ENOMEM;
+			*bad_wr = wr;
+			break;
+		}
+
 		srq->wrid[srq->head] = wr->wr_id;
 
 		next      = get_wqe(srq, srq->head);
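The new head == tail test refuses a post when the SRQ's free-WQE chain is exhausted. The mlx4 SRQ threads its free entries into a chain through the ring, handing out from head and returning reaped entries at tail; once head catches up with tail there is nothing left to give the HCA, and posting anyway would hand back a WQE the hardware may still own. A standalone sketch of that invariant (plain C, assuming a toy 8-entry ring):

	struct free_chain {
		int next[8];	/* next[i]: index of the next free entry */
		int head, tail;	/* endpoints of the free chain */
	};

	static int chain_take(struct free_chain *c)
	{
		int i;

		if (c->head == c->tail)
			return -1;	/* "full": nothing safe to hand out */

		i = c->head;
		c->head = c->next[i];
		return i;
	}
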
diff --git a/drivers/infiniband/hw/mlx4/user.h b/drivers/infiniband/hw/mlx4/user.h
index 5b8eddc..88c72d5 100644
--- a/drivers/infiniband/hw/mlx4/user.h
+++ b/drivers/infiniband/hw/mlx4/user.h
@@ -39,7 +39,7 @@
  * Increment this value if any changes that break userspace ABI
  * compatibility are made.
  */
-#define MLX4_IB_UVERBS_ABI_VERSION	1
+#define MLX4_IB_UVERBS_ABI_VERSION	2
 
 /*
  * Make sure that all structs defined in this file remain laid out so
@@ -87,6 +87,9 @@
 struct mlx4_ib_create_qp {
 	__u64	buf_addr;
 	__u64	db_addr;
+	__u8	log_sq_bb_count;
+	__u8	log_sq_stride;
+	__u8	reserved[6];
 };
 
 #endif /* MLX4_IB_USER_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_av.c b/drivers/infiniband/hw/mthca/mthca_av.c
index 27caf3b..4b111a8 100644
--- a/drivers/infiniband/hw/mthca/mthca_av.c
+++ b/drivers/infiniband/hw/mthca/mthca_av.c
@@ -279,6 +279,7 @@
 			(be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 20) & 0xff;
 		header->grh.flow_label    =
 			ah->av->sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+		header->grh.hop_limit     = ah->av->hop_limit;
 		ib_get_cached_gid(&dev->ib_dev,
 				  be32_to_cpu(ah->av->port_pd) >> 24,
 				  ah->av->gid_index % dev->limits.gid_table_len,
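Both hop-limit fixes in this merge (mlx4 above for globally routed MADs, mthca here for MLX headers) address the same symptom: the consumer's requested GRH hop limit was dropped when the wire header was built, so globally routed packets left with hop_limit 0 and could not cross a router. A hedged consumer-side sketch of an address handle whose hop limit must survive into the GRH (remote_lid and remote_gid stand in for real addressing):

	struct ib_ah_attr ah_attr = {
		.dlid     = remote_lid,
		.port_num = 1,
		.ah_flags = IB_AH_GRH,		/* global routing */
		.grh      = {
			.dgid       = remote_gid,
			.sgid_index = 0,
			.hop_limit  = 64,	/* was silently lost before */
		},
	};
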
diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c
index 773145e..aa563e6 100644
--- a/drivers/infiniband/hw/mthca/mthca_main.c
+++ b/drivers/infiniband/hw/mthca/mthca_main.c
@@ -1250,12 +1250,14 @@
 int __mthca_restart_one(struct pci_dev *pdev)
 {
 	struct mthca_dev *mdev;
+	int hca_type;
 
 	mdev = pci_get_drvdata(pdev);
 	if (!mdev)
 		return -ENODEV;
+	hca_type = mdev->hca_type;
 	__mthca_remove_one(pdev);
-	return __mthca_init_one(pdev, mdev->hca_type);
+	return __mthca_init_one(pdev, hca_type);
 }
 
 static int __devinit mthca_init_one(struct pci_dev *pdev,
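The restart fix above is the classic save-before-free pattern: __mthca_remove_one() frees the mthca_dev that pci_get_drvdata() returned, so mdev->hca_type must be copied out before the teardown. The same pattern in miniature (standalone C, hypothetical names):

	#include <stdlib.h>

	struct dev { int hca_type; };

	static void remove_one(struct dev *d) { free(d); }
	static int init_one(int hca_type) { return hca_type ? 0 : -1; }

	static int restart_one(struct dev *d)
	{
		int hca_type = d->hca_type;	/* copy before the free */

		remove_one(d);			/* d is dangling from here on */
		return init_one(hca_type);	/* use the saved copy */
	}
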
diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
index 2741ded..0276649 100644
--- a/drivers/infiniband/hw/mthca/mthca_qp.c
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -296,7 +296,7 @@
 	}
 }
 
-static void store_attrs(struct mthca_sqp *sqp, struct ib_qp_attr *attr,
+static void store_attrs(struct mthca_sqp *sqp, const struct ib_qp_attr *attr,
 			int attr_mask)
 {
 	if (attr_mask & IB_QP_PKEY_INDEX)
@@ -328,7 +328,7 @@
 		mthca_warn(dev, "INIT_IB returned status %02x.\n", status);
 }
 
-static __be32 get_hw_access_flags(struct mthca_qp *qp, struct ib_qp_attr *attr,
+static __be32 get_hw_access_flags(struct mthca_qp *qp, const struct ib_qp_attr *attr,
 				  int attr_mask)
 {
 	u8 dest_rd_atomic;
@@ -511,7 +511,7 @@
 	return err;
 }
 
-static int mthca_path_set(struct mthca_dev *dev, struct ib_ah_attr *ah,
+static int mthca_path_set(struct mthca_dev *dev, const struct ib_ah_attr *ah,
 			  struct mthca_qp_path *path, u8 port)
 {
 	path->g_mylmc     = ah->src_path_bits & 0x7f;
@@ -539,12 +539,12 @@
 	return 0;
 }
 
-int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
-		    struct ib_udata *udata)
+static int __mthca_modify_qp(struct ib_qp *ibqp,
+			     const struct ib_qp_attr *attr, int attr_mask,
+			     enum ib_qp_state cur_state, enum ib_qp_state new_state)
 {
 	struct mthca_dev *dev = to_mdev(ibqp->device);
 	struct mthca_qp *qp = to_mqp(ibqp);
-	enum ib_qp_state cur_state, new_state;
 	struct mthca_mailbox *mailbox;
 	struct mthca_qp_param *qp_param;
 	struct mthca_qp_context *qp_context;
@@ -552,60 +552,6 @@
 	u8 status;
 	int err = -EINVAL;
 
-	mutex_lock(&qp->mutex);
-
-	if (attr_mask & IB_QP_CUR_STATE) {
-		cur_state = attr->cur_qp_state;
-	} else {
-		spin_lock_irq(&qp->sq.lock);
-		spin_lock(&qp->rq.lock);
-		cur_state = qp->state;
-		spin_unlock(&qp->rq.lock);
-		spin_unlock_irq(&qp->sq.lock);
-	}
-
-	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
-
-	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) {
-		mthca_dbg(dev, "Bad QP transition (transport %d) "
-			  "%d->%d with attr 0x%08x\n",
-			  qp->transport, cur_state, new_state,
-			  attr_mask);
-		goto out;
-	}
-
-	if (cur_state == new_state && cur_state == IB_QPS_RESET) {
-		err = 0;
-		goto out;
-	}
-
-	if ((attr_mask & IB_QP_PKEY_INDEX) &&
-	     attr->pkey_index >= dev->limits.pkey_table_len) {
-		mthca_dbg(dev, "P_Key index (%u) too large. max is %d\n",
-			  attr->pkey_index, dev->limits.pkey_table_len-1);
-		goto out;
-	}
-
-	if ((attr_mask & IB_QP_PORT) &&
-	    (attr->port_num == 0 || attr->port_num > dev->limits.num_ports)) {
-		mthca_dbg(dev, "Port number (%u) is invalid\n", attr->port_num);
-		goto out;
-	}
-
-	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
-	    attr->max_rd_atomic > dev->limits.max_qp_init_rdma) {
-		mthca_dbg(dev, "Max rdma_atomic as initiator %u too large (max is %d)\n",
-			  attr->max_rd_atomic, dev->limits.max_qp_init_rdma);
-		goto out;
-	}
-
-	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
-	    attr->max_dest_rd_atomic > 1 << dev->qp_table.rdb_shift) {
-		mthca_dbg(dev, "Max rdma_atomic as responder %u too large (max %d)\n",
-			  attr->max_dest_rd_atomic, 1 << dev->qp_table.rdb_shift);
-		goto out;
-	}
-
 	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
 	if (IS_ERR(mailbox)) {
 		err = PTR_ERR(mailbox);
@@ -892,6 +838,98 @@
 
 out_mailbox:
 	mthca_free_mailbox(dev, mailbox);
+out:
+	return err;
+}
+
+static const struct ib_qp_attr dummy_init_attr = { .port_num = 1 };
+static const int dummy_init_attr_mask[] = {
+	[IB_QPT_UD]  = (IB_QP_PKEY_INDEX		|
+			IB_QP_PORT			|
+			IB_QP_QKEY),
+	[IB_QPT_UC]  = (IB_QP_PKEY_INDEX		|
+			IB_QP_PORT			|
+			IB_QP_ACCESS_FLAGS),
+	[IB_QPT_RC]  = (IB_QP_PKEY_INDEX		|
+			IB_QP_PORT			|
+			IB_QP_ACCESS_FLAGS),
+	[IB_QPT_SMI] = (IB_QP_PKEY_INDEX		|
+			IB_QP_QKEY),
+	[IB_QPT_GSI] = (IB_QP_PKEY_INDEX		|
+			IB_QP_QKEY),
+};
+
+int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
+		    struct ib_udata *udata)
+{
+	struct mthca_dev *dev = to_mdev(ibqp->device);
+	struct mthca_qp *qp = to_mqp(ibqp);
+	enum ib_qp_state cur_state, new_state;
+	int err = -EINVAL;
+
+	mutex_lock(&qp->mutex);
+	if (attr_mask & IB_QP_CUR_STATE) {
+		cur_state = attr->cur_qp_state;
+	} else {
+		spin_lock_irq(&qp->sq.lock);
+		spin_lock(&qp->rq.lock);
+		cur_state = qp->state;
+		spin_unlock(&qp->rq.lock);
+		spin_unlock_irq(&qp->sq.lock);
+	}
+
+	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) {
+		mthca_dbg(dev, "Bad QP transition (transport %d) "
+			  "%d->%d with attr 0x%08x\n",
+			  qp->transport, cur_state, new_state,
+			  attr_mask);
+		goto out;
+	}
+
+	if ((attr_mask & IB_QP_PKEY_INDEX) &&
+	     attr->pkey_index >= dev->limits.pkey_table_len) {
+		mthca_dbg(dev, "P_Key index (%u) too large. max is %d\n",
+			  attr->pkey_index, dev->limits.pkey_table_len-1);
+		goto out;
+	}
+
+	if ((attr_mask & IB_QP_PORT) &&
+	    (attr->port_num == 0 || attr->port_num > dev->limits.num_ports)) {
+		mthca_dbg(dev, "Port number (%u) is invalid\n", attr->port_num);
+		goto out;
+	}
+
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+	    attr->max_rd_atomic > dev->limits.max_qp_init_rdma) {
+		mthca_dbg(dev, "Max rdma_atomic as initiator %u too large (max is %d)\n",
+			  attr->max_rd_atomic, dev->limits.max_qp_init_rdma);
+		goto out;
+	}
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+	    attr->max_dest_rd_atomic > 1 << dev->qp_table.rdb_shift) {
+		mthca_dbg(dev, "Max rdma_atomic as responder %u too large (max %d)\n",
+			  attr->max_dest_rd_atomic, 1 << dev->qp_table.rdb_shift);
+		goto out;
+	}
+
+	if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+		err = 0;
+		goto out;
+	}
+
+	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_ERR) {
+		err = __mthca_modify_qp(ibqp, &dummy_init_attr,
+					dummy_init_attr_mask[ibqp->qp_type],
+					IB_QPS_RESET, IB_QPS_INIT);
+		if (err)
+			goto out;
+		cur_state = IB_QPS_INIT;
+	}
+
+	err = __mthca_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
 
 out:
 	mutex_unlock(&qp->mutex);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 87310ee..a0b3782 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -132,12 +132,46 @@
 	__be32 mtu;
 };
 
+/*
+ * Quoting 10.3.1 Queue Pair and EE Context States:
+ *
+ * Note, for QPs that are associated with an SRQ, the Consumer should take the
+ * QP through the Error State before invoking a Destroy QP or a Modify QP to the
+ * Reset State.  The Consumer may invoke the Destroy QP without first performing
+ * a Modify QP to the Error State and waiting for the Affiliated Asynchronous
+ * Last WQE Reached Event. However, if the Consumer does not wait for the
+ * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment
+ * leakage may occur. Therefore, it is good programming practice to tear down a
+ * QP that is associated with an SRQ by using the following process:
+ *
+ * - Put the QP in the Error State
+ * - Wait for the Affiliated Asynchronous Last WQE Reached Event;
+ * - either:
+ *       drain the CQ by invoking the Poll CQ verb and either wait for CQ
+ *       to be empty or the number of Poll CQ operations has exceeded
+ *       CQ capacity size;
+ * - or
+ *       post another WR that completes on the same CQ and wait for this
+ *       WR to return as a WC;
+ * - and then invoke a Destroy QP or Reset QP.
+ *
+ * We use the second option and wait for a completion on the
+ * rx_drain_qp before destroying QPs attached to our SRQ.
+ */
+
+enum ipoib_cm_state {
+	IPOIB_CM_RX_LIVE,
+	IPOIB_CM_RX_ERROR, /* Ignored by stale task */
+	IPOIB_CM_RX_FLUSH  /* Last WQE Reached event observed */
+};
+
 struct ipoib_cm_rx {
 	struct ib_cm_id     *id;
 	struct ib_qp        *qp;
 	struct list_head     list;
 	struct net_device   *dev;
 	unsigned long        jiffies;
+	enum ipoib_cm_state  state;
 };
 
 struct ipoib_cm_tx {
@@ -165,10 +199,16 @@
 	struct ib_srq  	       *srq;
 	struct ipoib_cm_rx_buf *srq_ring;
 	struct ib_cm_id        *id;
-	struct list_head        passive_ids;
+	struct ib_qp           *rx_drain_qp;   /* generates WR described in 10.3.1 */
+	struct list_head        passive_ids;   /* state: LIVE */
+	struct list_head        rx_error_list; /* state: ERROR */
+	struct list_head        rx_flush_list; /* state: FLUSH, drain not started */
+	struct list_head        rx_drain_list; /* state: FLUSH, drain started */
+	struct list_head        rx_reap_list;  /* state: FLUSH, drain done */
 	struct work_struct      start_task;
 	struct work_struct      reap_task;
 	struct work_struct      skb_task;
+	struct work_struct      rx_reap_task;
 	struct delayed_work     stale_task;
 	struct sk_buff_head     skb_queue;
 	struct list_head        start_list;
@@ -201,15 +241,17 @@
 	struct list_head multicast_list;
 	struct rb_root multicast_tree;
 
-	struct delayed_work pkey_task;
+	struct delayed_work pkey_poll_task;
 	struct delayed_work mcast_task;
 	struct work_struct flush_task;
 	struct work_struct restart_task;
 	struct delayed_work ah_reap_task;
+	struct work_struct pkey_event_task;
 
 	struct ib_device *ca;
 	u8            	  port;
 	u16           	  pkey;
+	u16               pkey_index;
 	struct ib_pd  	 *pd;
 	struct ib_mr  	 *mr;
 	struct ib_cq  	 *cq;
@@ -333,12 +375,13 @@
 
 int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
 void ipoib_ib_dev_flush(struct work_struct *work);
+void ipoib_pkey_event(struct work_struct *work);
 void ipoib_ib_dev_cleanup(struct net_device *dev);
 
 int ipoib_ib_dev_open(struct net_device *dev);
 int ipoib_ib_dev_up(struct net_device *dev);
 int ipoib_ib_dev_down(struct net_device *dev, int flush);
-int ipoib_ib_dev_stop(struct net_device *dev);
+int ipoib_ib_dev_stop(struct net_device *dev, int flush);
 
 int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
 void ipoib_dev_cleanup(struct net_device *dev);
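A hedged sketch of the teardown sequence the comment block above prescribes, in generic verbs (not the driver's actual code; IPoIB implements step 3 with the marker WR on rx_drain_qp rather than by polling the CQ dry):

	static void teardown_srq_qp(struct ib_qp *qp, struct completion *last_wqe)
	{
		struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };

		ib_modify_qp(qp, &attr, IB_QP_STATE);	/* 1. Error state */

		/* 2. The QP event handler completes this on
		 *    IB_EVENT_QP_LAST_WQE_REACHED. */
		wait_for_completion(last_wqe);

		/* 3. Drain the CQ, or wait for a marker WR's completion. */

		ib_destroy_qp(qp);			/* 4. Destroy */
	}
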
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index eec833b..ffec794 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -37,6 +37,7 @@
 #include <net/dst.h>
 #include <net/icmp.h>
 #include <linux/icmpv6.h>
+#include <linux/delay.h>
 
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
 static int data_debug_level;
@@ -62,6 +63,16 @@
 	u32 remote_mtu;
 };
 
+static struct ib_qp_attr ipoib_cm_err_attr = {
+	.qp_state = IB_QPS_ERR
+};
+
+#define IPOIB_CM_RX_DRAIN_WRID 0x7fffffff
+
+static struct ib_recv_wr ipoib_cm_rx_drain_wr = {
+	.wr_id = IPOIB_CM_RX_DRAIN_WRID
+};
+
 static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
 			       struct ib_cm_event *event);
 
@@ -150,11 +161,44 @@
 	return NULL;
 }
 
+static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv* priv)
+{
+	struct ib_recv_wr *bad_wr;
+
+	/* rx_drain_qp receive queue depth is 1, so
+	 * make sure we have at most 1 outstanding WR. */
+	if (list_empty(&priv->cm.rx_flush_list) ||
+	    !list_empty(&priv->cm.rx_drain_list))
+		return;
+
+	if (ib_post_recv(priv->cm.rx_drain_qp, &ipoib_cm_rx_drain_wr, &bad_wr))
+		ipoib_warn(priv, "failed to post rx_drain wr\n");
+
+	list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
+}
+
+static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx)
+{
+	struct ipoib_cm_rx *p = ctx;
+	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
+	unsigned long flags;
+
+	if (event->event != IB_EVENT_QP_LAST_WQE_REACHED)
+		return;
+
+	spin_lock_irqsave(&priv->lock, flags);
+	list_move(&p->list, &priv->cm.rx_flush_list);
+	p->state = IPOIB_CM_RX_FLUSH;
+	ipoib_cm_start_rx_drain(priv);
+	spin_unlock_irqrestore(&priv->lock, flags);
+}
+
 static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
 					   struct ipoib_cm_rx *p)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_qp_init_attr attr = {
+		.event_handler = ipoib_cm_rx_event_handler,
 		.send_cq = priv->cq, /* does not matter, we never send anything */
 		.recv_cq = priv->cq,
 		.srq = priv->cm.srq,
@@ -256,6 +300,7 @@
 
 	cm_id->context = p;
 	p->jiffies = jiffies;
+	p->state = IPOIB_CM_RX_LIVE;
 	spin_lock_irq(&priv->lock);
 	if (list_empty(&priv->cm.passive_ids))
 		queue_delayed_work(ipoib_workqueue,
@@ -277,7 +322,6 @@
 {
 	struct ipoib_cm_rx *p;
 	struct ipoib_dev_priv *priv;
-	int ret;
 
 	switch (event->event) {
 	case IB_CM_REQ_RECEIVED:
@@ -289,20 +333,9 @@
 	case IB_CM_REJ_RECEIVED:
 		p = cm_id->context;
 		priv = netdev_priv(p->dev);
-		spin_lock_irq(&priv->lock);
-		if (list_empty(&p->list))
-			ret = 0; /* Connection is going away already. */
-		else {
-			list_del_init(&p->list);
-			ret = -ECONNRESET;
-		}
-		spin_unlock_irq(&priv->lock);
-		if (ret) {
-			ib_destroy_qp(p->qp);
-			kfree(p);
-			return ret;
-		}
-		return 0;
+		if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
+			ipoib_warn(priv, "unable to move qp to error state\n");
+		/* Fall through */
 	default:
 		return 0;
 	}
@@ -354,8 +387,15 @@
 		       wr_id, wc->status);
 
 	if (unlikely(wr_id >= ipoib_recvq_size)) {
-		ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
-			   wr_id, ipoib_recvq_size);
+		if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~IPOIB_CM_OP_SRQ)) {
+			spin_lock_irqsave(&priv->lock, flags);
+			list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
+			ipoib_cm_start_rx_drain(priv);
+			queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
+			spin_unlock_irqrestore(&priv->lock, flags);
+		} else
+			ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
+				   wr_id, ipoib_recvq_size);
 		return;
 	}
 
@@ -374,9 +414,9 @@
 		if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
 			spin_lock_irqsave(&priv->lock, flags);
 			p->jiffies = jiffies;
-			/* Move this entry to list head, but do
-			 * not re-add it if it has been removed. */
-			if (!list_empty(&p->list))
+			/* Move this entry to list head, but do not re-add it
+			 * if it has been moved out of list. */
+			if (p->state == IPOIB_CM_RX_LIVE)
 				list_move(&p->list, &priv->cm.passive_ids);
 			spin_unlock_irqrestore(&priv->lock, flags);
 		}
@@ -583,17 +623,43 @@
 int ipoib_cm_dev_open(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct ib_qp_init_attr qp_init_attr = {
+		.send_cq = priv->cq,   /* does not matter, we never send anything */
+		.recv_cq = priv->cq,
+		.cap.max_send_wr = 1,  /* FIXME: 0 Seems not to work */
+		.cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
+		.cap.max_recv_wr = 1,
+		.cap.max_recv_sge = 1, /* FIXME: 0 Seems not to work */
+		.sq_sig_type = IB_SIGNAL_ALL_WR,
+		.qp_type = IB_QPT_UC,
+	};
 	int ret;
 
 	if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
 		return 0;
 
+	priv->cm.rx_drain_qp = ib_create_qp(priv->pd, &qp_init_attr);
+	if (IS_ERR(priv->cm.rx_drain_qp)) {
+		printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
+		ret = PTR_ERR(priv->cm.rx_drain_qp);
+		return ret;
+	}
+
+	/*
+	 * We put the QP in error state directly.  This way, a "flush
+	 * error" WC will be immediately generated for each WR we post.
+	 */
+	ret = ib_modify_qp(priv->cm.rx_drain_qp, &ipoib_cm_err_attr, IB_QP_STATE);
+	if (ret) {
+		ipoib_warn(priv, "failed to modify drain QP to error: %d\n", ret);
+		goto err_qp;
+	}
+
 	priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
 	if (IS_ERR(priv->cm.id)) {
 		printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
 		ret = PTR_ERR(priv->cm.id);
-		priv->cm.id = NULL;
-		return ret;
+		goto err_cm;
 	}
 
 	ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
@@ -601,35 +667,79 @@
 	if (ret) {
 		printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
 		       IPOIB_CM_IETF_ID | priv->qp->qp_num);
-		ib_destroy_cm_id(priv->cm.id);
-		priv->cm.id = NULL;
-		return ret;
+		goto err_listen;
 	}
+
 	return 0;
+
+err_listen:
+	ib_destroy_cm_id(priv->cm.id);
+err_cm:
+	priv->cm.id = NULL;
+err_qp:
+	ib_destroy_qp(priv->cm.rx_drain_qp);
+	return ret;
 }
 
 void ipoib_cm_dev_stop(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct ipoib_cm_rx *p;
+	struct ipoib_cm_rx *p, *n;
+	unsigned long begin;
+	LIST_HEAD(list);
+	int ret;
 
 	if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id)
 		return;
 
 	ib_destroy_cm_id(priv->cm.id);
 	priv->cm.id = NULL;
+
 	spin_lock_irq(&priv->lock);
 	while (!list_empty(&priv->cm.passive_ids)) {
 		p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
-		list_del_init(&p->list);
+		list_move(&p->list, &priv->cm.rx_error_list);
+		p->state = IPOIB_CM_RX_ERROR;
 		spin_unlock_irq(&priv->lock);
+		ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
+		if (ret)
+			ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
+		spin_lock_irq(&priv->lock);
+	}
+
+	/* Wait for all RX to be drained */
+	begin = jiffies;
+
+	while (!list_empty(&priv->cm.rx_error_list) ||
+	       !list_empty(&priv->cm.rx_flush_list) ||
+	       !list_empty(&priv->cm.rx_drain_list)) {
+		if (time_after(jiffies, begin + 5 * HZ)) {
+			ipoib_warn(priv, "RX drain timing out\n");
+
+			/*
+			 * assume the HW is wedged and just free up everything.
+			 */
+			list_splice_init(&priv->cm.rx_flush_list, &list);
+			list_splice_init(&priv->cm.rx_error_list, &list);
+			list_splice_init(&priv->cm.rx_drain_list, &list);
+			break;
+		}
+		spin_unlock_irq(&priv->lock);
+		msleep(1);
+		spin_lock_irq(&priv->lock);
+	}
+
+	list_splice_init(&priv->cm.rx_reap_list, &list);
+
+	spin_unlock_irq(&priv->lock);
+
+	list_for_each_entry_safe(p, n, &list, list) {
 		ib_destroy_cm_id(p->id);
 		ib_destroy_qp(p->qp);
 		kfree(p);
-		spin_lock_irq(&priv->lock);
 	}
-	spin_unlock_irq(&priv->lock);
 
+	ib_destroy_qp(priv->cm.rx_drain_qp);
 	cancel_delayed_work(&priv->cm.stale_task);
 }
 
@@ -1079,24 +1189,44 @@
 		queue_work(ipoib_workqueue, &priv->cm.skb_task);
 }
 
+static void ipoib_cm_rx_reap(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+						   cm.rx_reap_task);
+	struct ipoib_cm_rx *p, *n;
+	LIST_HEAD(list);
+
+	spin_lock_irq(&priv->lock);
+	list_splice_init(&priv->cm.rx_reap_list, &list);
+	spin_unlock_irq(&priv->lock);
+
+	list_for_each_entry_safe(p, n, &list, list) {
+		ib_destroy_cm_id(p->id);
+		ib_destroy_qp(p->qp);
+		kfree(p);
+	}
+}
+
 static void ipoib_cm_stale_task(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
 						   cm.stale_task.work);
 	struct ipoib_cm_rx *p;
+	int ret;
 
 	spin_lock_irq(&priv->lock);
 	while (!list_empty(&priv->cm.passive_ids)) {
-		/* List if sorted by LRU, start from tail,
+		/* List is sorted by LRU, start from tail,
 		 * stop when we see a recently used entry */
 		p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
 		if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
 			break;
-		list_del_init(&p->list);
+		list_move(&p->list, &priv->cm.rx_error_list);
+		p->state = IPOIB_CM_RX_ERROR;
 		spin_unlock_irq(&priv->lock);
-		ib_destroy_cm_id(p->id);
-		ib_destroy_qp(p->qp);
-		kfree(p);
+		ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
+		if (ret)
+			ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
 		spin_lock_irq(&priv->lock);
 	}
 
@@ -1164,9 +1294,14 @@
 	INIT_LIST_HEAD(&priv->cm.passive_ids);
 	INIT_LIST_HEAD(&priv->cm.reap_list);
 	INIT_LIST_HEAD(&priv->cm.start_list);
+	INIT_LIST_HEAD(&priv->cm.rx_error_list);
+	INIT_LIST_HEAD(&priv->cm.rx_flush_list);
+	INIT_LIST_HEAD(&priv->cm.rx_drain_list);
+	INIT_LIST_HEAD(&priv->cm.rx_reap_list);
 	INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
 	INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
 	INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap);
+	INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap);
 	INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);
 
 	skb_queue_head_init(&priv->cm.skb_queue);
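The heart of the leak fix is the marker WR: rx_drain_qp sits permanently in the error state, so a receive WR posted to it completes immediately with a flush-error WC on the shared CQ. That completion is the "post another WR that completes on the same CQ and wait for this WR to return as a WC" step from the spec excerpt quoted in ipoib.h; once IPOIB_CM_RX_DRAIN_WRID shows up, the drained connections move from rx_drain_list to rx_reap_list. A minimal sketch of the fence (generic verbs, hedged):

	static int post_drain_marker(struct ib_qp *error_state_qp, u64 marker_wrid)
	{
		struct ib_recv_wr wr = { .wr_id = marker_wrid };
		struct ib_recv_wr *bad_wr;

		/* Completes at once with a flush-error WC, behind all
		 * completions already queued on the same CQ. */
		return ib_post_recv(error_state_qp, &wr, &bad_wr);
	}
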
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 68d72c6..c1aad06 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -448,6 +448,13 @@
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	int ret;
 
+	if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &priv->pkey_index)) {
+		ipoib_warn(priv, "P_Key 0x%04x not found\n", priv->pkey);
+		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+		return -1;
+	}
+	set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+
 	ret = ipoib_init_qp(dev);
 	if (ret) {
 		ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret);
@@ -457,14 +464,14 @@
 	ret = ipoib_ib_post_receives(dev);
 	if (ret) {
 		ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
-		ipoib_ib_dev_stop(dev);
+		ipoib_ib_dev_stop(dev, 1);
 		return -1;
 	}
 
 	ret = ipoib_cm_dev_open(dev);
 	if (ret) {
-		ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
-		ipoib_ib_dev_stop(dev);
+		ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret);
+		ipoib_ib_dev_stop(dev, 1);
 		return -1;
 	}
 
@@ -516,7 +523,7 @@
 	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
 		mutex_lock(&pkey_mutex);
 		set_bit(IPOIB_PKEY_STOP, &priv->flags);
-		cancel_delayed_work(&priv->pkey_task);
+		cancel_delayed_work(&priv->pkey_poll_task);
 		mutex_unlock(&pkey_mutex);
 		if (flush)
 			flush_workqueue(ipoib_workqueue);
@@ -543,7 +550,7 @@
 	return pending;
 }
 
-int ipoib_ib_dev_stop(struct net_device *dev)
+int ipoib_ib_dev_stop(struct net_device *dev, int flush)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_qp_attr qp_attr;
@@ -629,7 +636,8 @@
 	/* Wait for all AHs to be reaped */
 	set_bit(IPOIB_STOP_REAPER, &priv->flags);
 	cancel_delayed_work(&priv->ah_reap_task);
-	flush_workqueue(ipoib_workqueue);
+	if (flush)
+		flush_workqueue(ipoib_workqueue);
 
 	begin = jiffies;
 
@@ -673,13 +681,24 @@
 	return 0;
 }
 
-void ipoib_ib_dev_flush(struct work_struct *work)
+static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 {
-	struct ipoib_dev_priv *cpriv, *priv =
-		container_of(work, struct ipoib_dev_priv, flush_task);
+	struct ipoib_dev_priv *cpriv;
 	struct net_device *dev = priv->dev;
+	u16 new_index;
 
-	if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) ) {
+	mutex_lock(&priv->vlan_mutex);
+
+	/*
+	 * Flush any child interfaces too -- they might be up even if
+	 * the parent is down.
+	 */
+	list_for_each_entry(cpriv, &priv->child_intfs, list)
+		__ipoib_ib_dev_flush(cpriv, pkey_event);
+
+	mutex_unlock(&priv->vlan_mutex);
+
+	if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) {
 		ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
 		return;
 	}
@@ -689,10 +708,32 @@
 		return;
 	}
 
+	if (pkey_event) {
+		if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
+			clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+			ipoib_ib_dev_down(dev, 0);
+			ipoib_pkey_dev_delay_open(dev);
+			return;
+		}
+		set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+
+		/* restart QP only if P_Key index is changed */
+		if (new_index == priv->pkey_index) {
+			ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n");
+			return;
+		}
+		priv->pkey_index = new_index;
+	}
+
 	ipoib_dbg(priv, "flushing\n");
 
 	ipoib_ib_dev_down(dev, 0);
 
+	if (pkey_event) {
+		ipoib_ib_dev_stop(dev, 0);
+		ipoib_ib_dev_open(dev);
+	}
+
 	/*
 	 * The device could have been brought down between the start and when
 	 * we get here, don't bring it back up if it's not configured up
@@ -701,14 +742,24 @@
 		ipoib_ib_dev_up(dev);
 		ipoib_mcast_restart_task(&priv->restart_task);
 	}
+}
 
-	mutex_lock(&priv->vlan_mutex);
+void ipoib_ib_dev_flush(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(work, struct ipoib_dev_priv, flush_task);
 
-	/* Flush any child interfaces too */
-	list_for_each_entry(cpriv, &priv->child_intfs, list)
-		ipoib_ib_dev_flush(&cpriv->flush_task);
+	ipoib_dbg(priv, "Flushing %s\n", priv->dev->name);
+	__ipoib_ib_dev_flush(priv, 0);
+}
 
-	mutex_unlock(&priv->vlan_mutex);
+void ipoib_pkey_event(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(work, struct ipoib_dev_priv, pkey_event_task);
+
+	ipoib_dbg(priv, "Flushing %s and restarting its QP\n", priv->dev->name);
+	__ipoib_ib_dev_flush(priv, 1);
 }
 
 void ipoib_ib_dev_cleanup(struct net_device *dev)
@@ -736,7 +787,7 @@
 void ipoib_pkey_poll(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv =
-		container_of(work, struct ipoib_dev_priv, pkey_task.work);
+		container_of(work, struct ipoib_dev_priv, pkey_poll_task.work);
 	struct net_device *dev = priv->dev;
 
 	ipoib_pkey_dev_check_presence(dev);
@@ -747,7 +798,7 @@
 		mutex_lock(&pkey_mutex);
 		if (!test_bit(IPOIB_PKEY_STOP, &priv->flags))
 			queue_delayed_work(ipoib_workqueue,
-					   &priv->pkey_task,
+					   &priv->pkey_poll_task,
 					   HZ);
 		mutex_unlock(&pkey_mutex);
 	}
@@ -766,7 +817,7 @@
 		mutex_lock(&pkey_mutex);
 		clear_bit(IPOIB_PKEY_STOP, &priv->flags);
 		queue_delayed_work(ipoib_workqueue,
-				   &priv->pkey_task,
+				   &priv->pkey_poll_task,
 				   HZ);
 		mutex_unlock(&pkey_mutex);
 		return 1;
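The new flush argument threaded through ipoib_ib_dev_stop() exists because __ipoib_ib_dev_flush() now also runs on ipoib_workqueue itself (via pkey_event_task), and a work item must never call flush_workqueue() on its own queue or it deadlocks waiting for itself. A hedged sketch of the shape of the constraint (hypothetical name pkey_event_work; the real handler is ipoib_pkey_event() above):

	static void pkey_event_work(struct work_struct *work)
	{
		struct ipoib_dev_priv *priv =
			container_of(work, struct ipoib_dev_priv, pkey_event_task);

		/* Running on ipoib_workqueue here, so pass flush = 0:
		 * stopping the device must not flush the very queue this
		 * work item runs on. */
		ipoib_ib_dev_stop(priv->dev, 0);
		ipoib_ib_dev_open(priv->dev);
	}
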
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 0a428f2..894b1dcd 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -107,7 +107,7 @@
 		return -EINVAL;
 
 	if (ipoib_ib_dev_up(dev)) {
-		ipoib_ib_dev_stop(dev);
+		ipoib_ib_dev_stop(dev, 1);
 		return -EINVAL;
 	}
 
@@ -152,7 +152,7 @@
 	flush_workqueue(ipoib_workqueue);
 
 	ipoib_ib_dev_down(dev, 1);
-	ipoib_ib_dev_stop(dev);
+	ipoib_ib_dev_stop(dev, 1);
 
 	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
 		struct ipoib_dev_priv *cpriv;
@@ -988,7 +988,8 @@
 	INIT_LIST_HEAD(&priv->dead_ahs);
 	INIT_LIST_HEAD(&priv->multicast_list);
 
-	INIT_DELAYED_WORK(&priv->pkey_task,    ipoib_pkey_poll);
+	INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
+	INIT_WORK(&priv->pkey_event_task, ipoib_pkey_event);
 	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
 	INIT_WORK(&priv->flush_task,   ipoib_ib_dev_flush);
 	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 54fbead..aae3670 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -524,7 +524,7 @@
 		return;
 
 	if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid))
-		ipoib_warn(priv, "ib_gid_entry_get() failed\n");
+		ipoib_warn(priv, "ib_query_gid() failed\n");
 	else
 		memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 5c3c6a4..982eb88 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -33,8 +33,6 @@
  * $Id: ipoib_verbs.c 1349 2004-12-16 21:09:43Z roland $
  */
 
-#include <rdma/ib_cache.h>
-
 #include "ipoib.h"
 
 int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid)
@@ -49,7 +47,7 @@
 	if (!qp_attr)
 		goto out;
 
-	if (ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) {
+	if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) {
 		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
 		ret = -ENXIO;
 		goto out;
@@ -94,26 +92,16 @@
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	int ret;
-	u16 pkey_index;
 	struct ib_qp_attr qp_attr;
 	int attr_mask;
 
-	/*
-	 * Search through the port P_Key table for the requested pkey value.
-	 * The port has to be assigned to the respective IB partition in
-	 * advance.
-	 */
-	ret = ib_find_cached_pkey(priv->ca, priv->port, priv->pkey, &pkey_index);
-	if (ret) {
-		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
-		return ret;
-	}
-	set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+	if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
+		return -1;
 
 	qp_attr.qp_state = IB_QPS_INIT;
 	qp_attr.qkey = 0;
 	qp_attr.port_num = priv->port;
-	qp_attr.pkey_index = pkey_index;
+	qp_attr.pkey_index = priv->pkey_index;
 	attr_mask =
 	    IB_QP_QKEY |
 	    IB_QP_PORT |
@@ -185,7 +173,7 @@
 	size = ipoib_sendq_size + ipoib_recvq_size + 1;
 	ret = ipoib_cm_dev_init(dev);
 	if (!ret)
-		size += ipoib_recvq_size;
+		size += ipoib_recvq_size + 1 /* 1 extra for rx_drain_qp */;
 
 	priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0);
 	if (IS_ERR(priv->cq)) {
@@ -259,14 +247,18 @@
 	struct ipoib_dev_priv *priv =
 		container_of(handler, struct ipoib_dev_priv, event_handler);
 
-	if ((record->event == IB_EVENT_PORT_ERR    ||
-	     record->event == IB_EVENT_PKEY_CHANGE ||
-	     record->event == IB_EVENT_PORT_ACTIVE ||
-	     record->event == IB_EVENT_LID_CHANGE  ||
-	     record->event == IB_EVENT_SM_CHANGE   ||
-	     record->event == IB_EVENT_CLIENT_REREGISTER) &&
-	    record->element.port_num == priv->port) {
+	if (record->element.port_num != priv->port)
+		return;
+
+	if (record->event == IB_EVENT_PORT_ERR    ||
+	    record->event == IB_EVENT_PORT_ACTIVE ||
+	    record->event == IB_EVENT_LID_CHANGE  ||
+	    record->event == IB_EVENT_SM_CHANGE   ||
+	    record->event == IB_EVENT_CLIENT_REREGISTER) {
 		ipoib_dbg(priv, "Port state change event\n");
 		queue_work(ipoib_workqueue, &priv->flush_task);
+	} else if (record->event == IB_EVENT_PKEY_CHANGE) {
+		ipoib_dbg(priv, "P_Key change event on port:%d\n", priv->port);
+		queue_work(ipoib_workqueue, &priv->pkey_event_task);
 	}
 }
diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c
index c427173..cfa5cc0 100644
--- a/drivers/net/mlx4/fw.c
+++ b/drivers/net/mlx4/fw.c
@@ -90,7 +90,7 @@
 	int i;
 
 	mlx4_dbg(dev, "DEV_CAP flags:\n");
-	for (i = 0; i < 32; ++i)
+	for (i = 0; i < ARRAY_SIZE(fname); ++i)
 		if (fname[i] && (flags & (1 << i)))
 			mlx4_dbg(dev, "    %s\n", fname[i]);
 }
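The overrun happened because the loop bound (a hard-coded 32) and the fname[] table were maintained separately; deriving the bound from the table with ARRAY_SIZE() makes them impossible to desynchronize. The idiom in miniature (standalone C, toy table):

	#include <stdio.h>

	#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

	static const char *fname[] = { "RC", "UC", "UD", "XRC" };

	static void dump_flags(unsigned int flags)
	{
		unsigned int i;

		/* Bound follows the table, not a hard-coded constant. */
		for (i = 0; i < ARRAY_SIZE(fname); ++i)
			if (flags & (1u << i))
				printf("    %s\n", fname[i]);
	}
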
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 47cefca..0627a6a 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -890,6 +890,8 @@
 	spinlock_t                    client_data_lock;
 
 	struct ib_cache               cache;
+	int                          *pkey_tbl_len;
+	int                          *gid_tbl_len;
 
 	u32                           flags;
 
@@ -1118,6 +1120,12 @@
 		   u8 port_num, int port_modify_mask,
 		   struct ib_port_modify *port_modify);
 
+int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+		u8 *port_num, u16 *index);
+
+int ib_find_pkey(struct ib_device *device,
+		 u8 port_num, u16 pkey, u16 *index);
+
 /**
  * ib_alloc_pd - Allocates an unused protection domain.
  * @device: The device on which to allocate the protection domain.