Merge branch 'misc' into for-next

Conflicts:
	drivers/infiniband/core/uverbs_main.c
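
The ucm, umad, and uverbs changes below share one pattern: device numbers
are taken from the statically registered minor range first, and once that
range is exhausted a second, dynamically allocated ("overflow") major is
requested and tracked in its own bitmap.  The following is only a minimal
sketch of that allocation step, with made-up names (MY_MAX_DEVICES,
my_base_dev, my_alloc_devnum) rather than the identifiers used in the
patches:

	#include <linux/bitmap.h>
	#include <linux/errno.h>
	#include <linux/fs.h>

	#define MY_MAX_DEVICES 32	/* plays the role of IB_*_MAX_DEVICES */

	static dev_t my_base_dev;	/* statically registered region */
	static dev_t overflow_maj;	/* allocated on first overflow */
	static DECLARE_BITMAP(dev_map, MY_MAX_DEVICES);
	static DECLARE_BITMAP(overflow_map, MY_MAX_DEVICES);

	/* Returns the logical devnum and fills in the dev_t for cdev_add(). */
	static int my_alloc_devnum(dev_t *base)
	{
		int devnum = find_first_zero_bit(dev_map, MY_MAX_DEVICES);

		if (devnum < MY_MAX_DEVICES) {
			set_bit(devnum, dev_map);
			*base = my_base_dev + devnum;
			return devnum;
		}

		/* static range exhausted: grab a dynamic major once, reuse it */
		if (!overflow_maj &&
		    alloc_chrdev_region(&overflow_maj, 0, MY_MAX_DEVICES,
					"my_overflow"))
			return -ENOSPC;

		devnum = find_first_zero_bit(overflow_map, MY_MAX_DEVICES);
		if (devnum >= MY_MAX_DEVICES)
			return -ENOSPC;

		set_bit(devnum, overflow_map);
		*base = overflow_maj + devnum;
		/* >= MY_MAX_DEVICES marks the overflow range for teardown */
		return devnum + MY_MAX_DEVICES;
	}

Teardown does the reverse: a devnum below MY_MAX_DEVICES clears a bit in
dev_map, anything else clears the corresponding bit in overflow_map, which
is the test the release paths below perform.
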
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
index f504c9b..1b09b73 100644
--- a/drivers/infiniband/core/ucm.c
+++ b/drivers/infiniband/core/ucm.c
@@ -1215,15 +1215,18 @@
 
 	ucm_dev = container_of(dev, struct ib_ucm_device, dev);
 	cdev_del(&ucm_dev->cdev);
-	clear_bit(ucm_dev->devnum, dev_map);
+	if (ucm_dev->devnum < IB_UCM_MAX_DEVICES)
+		clear_bit(ucm_dev->devnum, dev_map);
+	else
+		clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, overflow_map);
 	kfree(ucm_dev);
 }
 
 static const struct file_operations ucm_fops = {
-	.owner 	 = THIS_MODULE,
-	.open 	 = ib_ucm_open,
+	.owner	 = THIS_MODULE,
+	.open	 = ib_ucm_open,
 	.release = ib_ucm_close,
-	.write 	 = ib_ucm_write,
+	.write	 = ib_ucm_write,
 	.poll    = ib_ucm_poll,
 };
 
@@ -1237,8 +1240,32 @@
 }
 static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
 
+static dev_t overflow_maj;
+static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES);
+static int find_overflow_devnum(void)
+{
+	int ret;
+
+	if (!overflow_maj) {
+		ret = alloc_chrdev_region(&overflow_maj, 0, IB_UCM_MAX_DEVICES,
+					  "infiniband_cm");
+		if (ret) {
+			printk(KERN_ERR "ucm: couldn't register dynamic device number\n");
+			return ret;
+		}
+	}
+
+	ret = find_first_zero_bit(overflow_map, IB_UCM_MAX_DEVICES);
+	if (ret >= IB_UCM_MAX_DEVICES)
+		return -1;
+
+	return ret;
+}
+
 static void ib_ucm_add_one(struct ib_device *device)
 {
+	int devnum;
+	dev_t base;
 	struct ib_ucm_device *ucm_dev;
 
 	if (!device->alloc_ucontext ||
@@ -1251,16 +1278,25 @@
 
 	ucm_dev->ib_dev = device;
 
-	ucm_dev->devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES);
-	if (ucm_dev->devnum >= IB_UCM_MAX_DEVICES)
-		goto err;
+	devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES);
+	if (devnum >= IB_UCM_MAX_DEVICES) {
+		devnum = find_overflow_devnum();
+		if (devnum < 0)
+			goto err;
 
-	set_bit(ucm_dev->devnum, dev_map);
+		ucm_dev->devnum = devnum + IB_UCM_MAX_DEVICES;
+		base = devnum + overflow_maj;
+		set_bit(devnum, overflow_map);
+	} else {
+		ucm_dev->devnum = devnum;
+		base = devnum + IB_UCM_BASE_DEV;
+		set_bit(devnum, dev_map);
+	}
 
 	cdev_init(&ucm_dev->cdev, &ucm_fops);
 	ucm_dev->cdev.owner = THIS_MODULE;
 	kobject_set_name(&ucm_dev->cdev.kobj, "ucm%d", ucm_dev->devnum);
-	if (cdev_add(&ucm_dev->cdev, IB_UCM_BASE_DEV + ucm_dev->devnum, 1))
+	if (cdev_add(&ucm_dev->cdev, base, 1))
 		goto err;
 
 	ucm_dev->dev.class = &cm_class;
@@ -1281,7 +1317,10 @@
 	device_unregister(&ucm_dev->dev);
 err_cdev:
 	cdev_del(&ucm_dev->cdev);
-	clear_bit(ucm_dev->devnum, dev_map);
+	if (ucm_dev->devnum < IB_UCM_MAX_DEVICES)
+		clear_bit(devnum, dev_map);
+	else
+		clear_bit(devnum, overflow_map);
 err:
 	kfree(ucm_dev);
 	return;
@@ -1340,6 +1379,8 @@
 	ib_unregister_client(&ucm_client);
 	class_remove_file(&cm_class, &class_attr_abi_version);
 	unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES);
+	if (overflow_maj)
+		unregister_chrdev_region(overflow_maj, IB_UCM_MAX_DEVICES);
 	idr_destroy(&ctx_id_table);
 }
 
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index 7de0296..02d360c 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -65,12 +65,9 @@
 };
 
 /*
- * Our lifetime rules for these structs are the following: each time a
- * device special file is opened, we look up the corresponding struct
- * ib_umad_port by minor in the umad_port[] table while holding the
- * port_lock.  If this lookup succeeds, we take a reference on the
- * ib_umad_port's struct ib_umad_device while still holding the
- * port_lock; if the lookup fails, we fail the open().  We drop these
+ * Our lifetime rules for these structs are the following: each time a
+ * device special file is opened, we take a reference on the
+ * ib_umad_port's struct ib_umad_device.  We drop these
  * references in the corresponding close().
  *
  * In addition to references coming from open character devices, there
@@ -78,19 +75,14 @@
  * module's reference taken when allocating the ib_umad_device in
  * ib_umad_add_one().
  *
- * When destroying an ib_umad_device, we clear all of its
- * ib_umad_ports from umad_port[] while holding port_lock before
- * dropping the module's reference to the ib_umad_device.  This is
- * always safe because any open() calls will either succeed and obtain
- * a reference before we clear the umad_port[] entries, or fail after
- * we clear the umad_port[] entries.
+ * When destroying an ib_umad_device, we drop the module's reference.
  */
 
 struct ib_umad_port {
-	struct cdev           *cdev;
+	struct cdev           cdev;
 	struct device	      *dev;
 
-	struct cdev           *sm_cdev;
+	struct cdev           sm_cdev;
 	struct device	      *sm_dev;
 	struct semaphore       sm_sem;
 
@@ -136,7 +128,6 @@
 static const dev_t base_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE);
 
 static DEFINE_SPINLOCK(port_lock);
-static struct ib_umad_port *umad_port[IB_UMAD_MAX_PORTS];
 static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
 
 static void ib_umad_add_one(struct ib_device *device);
@@ -496,8 +487,8 @@
 		ah_attr.ah_flags = IB_AH_GRH;
 		memcpy(ah_attr.grh.dgid.raw, packet->mad.hdr.gid, 16);
 		ah_attr.grh.sgid_index	   = packet->mad.hdr.gid_index;
-		ah_attr.grh.flow_label 	   = be32_to_cpu(packet->mad.hdr.flow_label);
-		ah_attr.grh.hop_limit  	   = packet->mad.hdr.hop_limit;
+		ah_attr.grh.flow_label	   = be32_to_cpu(packet->mad.hdr.flow_label);
+		ah_attr.grh.hop_limit	   = packet->mad.hdr.hop_limit;
 		ah_attr.grh.traffic_class  = packet->mad.hdr.traffic_class;
 	}
 
@@ -528,9 +519,9 @@
 		goto err_ah;
 	}
 
-	packet->msg->ah 	= ah;
+	packet->msg->ah		= ah;
 	packet->msg->timeout_ms = packet->mad.hdr.timeout_ms;
-	packet->msg->retries 	= packet->mad.hdr.retries;
+	packet->msg->retries	= packet->mad.hdr.retries;
 	packet->msg->context[0] = packet;
 
 	/* Copy MAD header.  Any RMPP header is already in place. */
@@ -779,15 +770,11 @@
 /*
  * ib_umad_open() does not need the BKL:
  *
- *  - umad_port[] accesses are protected by port_lock, the
- *    ib_umad_port structures are properly reference counted, and
+ *  - the ib_umad_port structures are properly reference counted, and
  *    everything else is purely local to the file being created, so
  *    races against other open calls are not a problem;
  *  - the ioctl method does not affect any global state outside of the
  *    file structure being operated on;
- *  - the port is added to umad_port[] as the last part of module
- *    initialization so the open method will either immediately run
- *    -ENXIO, or all required initialization will be done.
  */
 static int ib_umad_open(struct inode *inode, struct file *filp)
 {
@@ -795,13 +782,10 @@
 	struct ib_umad_file *file;
 	int ret = 0;
 
-	spin_lock(&port_lock);
-	port = umad_port[iminor(inode) - IB_UMAD_MINOR_BASE];
+	port = container_of(inode->i_cdev, struct ib_umad_port, cdev);
 	if (port)
 		kref_get(&port->umad_dev->ref);
-	spin_unlock(&port_lock);
-
-	if (!port)
+	else
 		return -ENXIO;
 
 	mutex_lock(&port->file_mutex);
@@ -872,16 +856,16 @@
 }
 
 static const struct file_operations umad_fops = {
-	.owner 	 	= THIS_MODULE,
-	.read 	 	= ib_umad_read,
-	.write 	 	= ib_umad_write,
-	.poll 	 	= ib_umad_poll,
+	.owner		= THIS_MODULE,
+	.read		= ib_umad_read,
+	.write		= ib_umad_write,
+	.poll		= ib_umad_poll,
 	.unlocked_ioctl = ib_umad_ioctl,
 #ifdef CONFIG_COMPAT
-	.compat_ioctl 	= ib_umad_compat_ioctl,
+	.compat_ioctl	= ib_umad_compat_ioctl,
 #endif
-	.open 	 	= ib_umad_open,
-	.release 	= ib_umad_close
+	.open		= ib_umad_open,
+	.release	= ib_umad_close
 };
 
 static int ib_umad_sm_open(struct inode *inode, struct file *filp)
@@ -892,13 +876,10 @@
 	};
 	int ret;
 
-	spin_lock(&port_lock);
-	port = umad_port[iminor(inode) - IB_UMAD_MINOR_BASE - IB_UMAD_MAX_PORTS];
+	port = container_of(inode->i_cdev, struct ib_umad_port, sm_cdev);
 	if (port)
 		kref_get(&port->umad_dev->ref);
-	spin_unlock(&port_lock);
-
-	if (!port)
+	else
 		return -ENXIO;
 
 	if (filp->f_flags & O_NONBLOCK) {
@@ -949,8 +930,8 @@
 }
 
 static const struct file_operations umad_sm_fops = {
-	.owner 	 = THIS_MODULE,
-	.open 	 = ib_umad_sm_open,
+	.owner	 = THIS_MODULE,
+	.open	 = ib_umad_sm_open,
 	.release = ib_umad_sm_close
 };
 
@@ -990,16 +971,51 @@
 }
 static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
 
+static dev_t overflow_maj;
+static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS);
+static int find_overflow_devnum(void)
+{
+	int ret;
+
+	if (!overflow_maj) {
+		ret = alloc_chrdev_region(&overflow_maj, 0, IB_UMAD_MAX_PORTS * 2,
+					  "infiniband_mad");
+		if (ret) {
+			printk(KERN_ERR "user_mad: couldn't register dynamic device number\n");
+			return ret;
+		}
+	}
+
+	ret = find_first_zero_bit(overflow_map, IB_UMAD_MAX_PORTS);
+	if (ret >= IB_UMAD_MAX_PORTS)
+		return -1;
+
+	return ret;
+}
+
 static int ib_umad_init_port(struct ib_device *device, int port_num,
 			     struct ib_umad_port *port)
 {
+	int devnum;
+	dev_t base;
+
 	spin_lock(&port_lock);
-	port->dev_num = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS);
-	if (port->dev_num >= IB_UMAD_MAX_PORTS) {
+	devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS);
+	if (devnum >= IB_UMAD_MAX_PORTS) {
 		spin_unlock(&port_lock);
-		return -1;
+		devnum = find_overflow_devnum();
+		if (devnum < 0)
+			return -1;
+
+		spin_lock(&port_lock);
+		port->dev_num = devnum + IB_UMAD_MAX_PORTS;
+		base = devnum + overflow_maj;
+		set_bit(devnum, overflow_map);
+	} else {
+		port->dev_num = devnum;
+		base = devnum + base_dev;
+		set_bit(devnum, dev_map);
 	}
-	set_bit(port->dev_num, dev_map);
 	spin_unlock(&port_lock);
 
 	port->ib_dev   = device;
@@ -1008,17 +1024,14 @@
 	mutex_init(&port->file_mutex);
 	INIT_LIST_HEAD(&port->file_list);
 
-	port->cdev = cdev_alloc();
-	if (!port->cdev)
-		return -1;
-	port->cdev->owner = THIS_MODULE;
-	port->cdev->ops   = &umad_fops;
-	kobject_set_name(&port->cdev->kobj, "umad%d", port->dev_num);
-	if (cdev_add(port->cdev, base_dev + port->dev_num, 1))
+	cdev_init(&port->cdev, &umad_fops);
+	port->cdev.owner = THIS_MODULE;
+	kobject_set_name(&port->cdev.kobj, "umad%d", port->dev_num);
+	if (cdev_add(&port->cdev, base, 1))
 		goto err_cdev;
 
 	port->dev = device_create(umad_class, device->dma_device,
-				  port->cdev->dev, port,
+				  port->cdev.dev, port,
 				  "umad%d", port->dev_num);
 	if (IS_ERR(port->dev))
 		goto err_cdev;
@@ -1028,17 +1041,15 @@
 	if (device_create_file(port->dev, &dev_attr_port))
 		goto err_dev;
 
-	port->sm_cdev = cdev_alloc();
-	if (!port->sm_cdev)
-		goto err_dev;
-	port->sm_cdev->owner = THIS_MODULE;
-	port->sm_cdev->ops   = &umad_sm_fops;
-	kobject_set_name(&port->sm_cdev->kobj, "issm%d", port->dev_num);
-	if (cdev_add(port->sm_cdev, base_dev + port->dev_num + IB_UMAD_MAX_PORTS, 1))
+	base += IB_UMAD_MAX_PORTS;
+	cdev_init(&port->sm_cdev, &umad_sm_fops);
+	port->sm_cdev.owner = THIS_MODULE;
+	kobject_set_name(&port->sm_cdev.kobj, "issm%d", port->dev_num);
+	if (cdev_add(&port->sm_cdev, base, 1))
 		goto err_sm_cdev;
 
 	port->sm_dev = device_create(umad_class, device->dma_device,
-				     port->sm_cdev->dev, port,
+				     port->sm_cdev.dev, port,
 				     "issm%d", port->dev_num);
 	if (IS_ERR(port->sm_dev))
 		goto err_sm_cdev;
@@ -1048,24 +1059,23 @@
 	if (device_create_file(port->sm_dev, &dev_attr_port))
 		goto err_sm_dev;
 
-	spin_lock(&port_lock);
-	umad_port[port->dev_num] = port;
-	spin_unlock(&port_lock);
-
 	return 0;
 
 err_sm_dev:
-	device_destroy(umad_class, port->sm_cdev->dev);
+	device_destroy(umad_class, port->sm_cdev.dev);
 
 err_sm_cdev:
-	cdev_del(port->sm_cdev);
+	cdev_del(&port->sm_cdev);
 
 err_dev:
-	device_destroy(umad_class, port->cdev->dev);
+	device_destroy(umad_class, port->cdev.dev);
 
 err_cdev:
-	cdev_del(port->cdev);
-	clear_bit(port->dev_num, dev_map);
+	cdev_del(&port->cdev);
+	if (port->dev_num < IB_UMAD_MAX_PORTS)
+		clear_bit(devnum, dev_map);
+	else
+		clear_bit(devnum, overflow_map);
 
 	return -1;
 }
@@ -1079,15 +1089,11 @@
 	dev_set_drvdata(port->dev,    NULL);
 	dev_set_drvdata(port->sm_dev, NULL);
 
-	device_destroy(umad_class, port->cdev->dev);
-	device_destroy(umad_class, port->sm_cdev->dev);
+	device_destroy(umad_class, port->cdev.dev);
+	device_destroy(umad_class, port->sm_cdev.dev);
 
-	cdev_del(port->cdev);
-	cdev_del(port->sm_cdev);
-
-	spin_lock(&port_lock);
-	umad_port[port->dev_num] = NULL;
-	spin_unlock(&port_lock);
+	cdev_del(&port->cdev);
+	cdev_del(&port->sm_cdev);
 
 	mutex_lock(&port->file_mutex);
 
@@ -1106,7 +1112,10 @@
 
 	mutex_unlock(&port->file_mutex);
 
-	clear_bit(port->dev_num, dev_map);
+	if (port->dev_num < IB_UMAD_MAX_PORTS)
+		clear_bit(port->dev_num, dev_map);
+	else
+		clear_bit(port->dev_num - IB_UMAD_MAX_PORTS, overflow_map);
 }
 
 static void ib_umad_add_one(struct ib_device *device)
@@ -1214,6 +1223,8 @@
 	ib_unregister_client(&umad_client);
 	class_destroy(umad_class);
 	unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2);
+	if (overflow_maj)
+		unregister_chrdev_region(overflow_maj, IB_UMAD_MAX_PORTS * 2);
 }
 
 module_init(ib_umad_init);
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index b3ea958..e54d9ac 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -41,6 +41,7 @@
 #include <linux/idr.h>
 #include <linux/mutex.h>
 #include <linux/completion.h>
+#include <linux/cdev.h>
 
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_umem.h>
@@ -69,23 +70,23 @@
 
 struct ib_uverbs_device {
 	struct kref				ref;
+	int					num_comp_vectors;
 	struct completion			comp;
-	int					devnum;
-	struct cdev			       *cdev;
 	struct device			       *dev;
 	struct ib_device		       *ib_dev;
-	int					num_comp_vectors;
+	int					devnum;
+	struct cdev			        cdev;
 };
 
 struct ib_uverbs_event_file {
 	struct kref				ref;
+	int					is_async;
 	struct ib_uverbs_file		       *uverbs_file;
 	spinlock_t				lock;
+	int					is_closed;
 	wait_queue_head_t			poll_wait;
 	struct fasync_struct		       *async_queue;
 	struct list_head			event_list;
-	int					is_async;
-	int					is_closed;
 };
 
 struct ib_uverbs_file {
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 82b60c6..ff59a79 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -73,40 +73,39 @@
 DEFINE_IDR(ib_uverbs_srq_idr);
 
 static DEFINE_SPINLOCK(map_lock);
-static struct ib_uverbs_device *dev_table[IB_UVERBS_MAX_DEVICES];
 static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
 
 static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
 				     const char __user *buf, int in_len,
 				     int out_len) = {
-	[IB_USER_VERBS_CMD_GET_CONTEXT]   	= ib_uverbs_get_context,
-	[IB_USER_VERBS_CMD_QUERY_DEVICE]  	= ib_uverbs_query_device,
-	[IB_USER_VERBS_CMD_QUERY_PORT]    	= ib_uverbs_query_port,
-	[IB_USER_VERBS_CMD_ALLOC_PD]      	= ib_uverbs_alloc_pd,
-	[IB_USER_VERBS_CMD_DEALLOC_PD]    	= ib_uverbs_dealloc_pd,
-	[IB_USER_VERBS_CMD_REG_MR]        	= ib_uverbs_reg_mr,
-	[IB_USER_VERBS_CMD_DEREG_MR]      	= ib_uverbs_dereg_mr,
+	[IB_USER_VERBS_CMD_GET_CONTEXT]		= ib_uverbs_get_context,
+	[IB_USER_VERBS_CMD_QUERY_DEVICE]	= ib_uverbs_query_device,
+	[IB_USER_VERBS_CMD_QUERY_PORT]		= ib_uverbs_query_port,
+	[IB_USER_VERBS_CMD_ALLOC_PD]		= ib_uverbs_alloc_pd,
+	[IB_USER_VERBS_CMD_DEALLOC_PD]		= ib_uverbs_dealloc_pd,
+	[IB_USER_VERBS_CMD_REG_MR]		= ib_uverbs_reg_mr,
+	[IB_USER_VERBS_CMD_DEREG_MR]		= ib_uverbs_dereg_mr,
 	[IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL] = ib_uverbs_create_comp_channel,
-	[IB_USER_VERBS_CMD_CREATE_CQ]     	= ib_uverbs_create_cq,
-	[IB_USER_VERBS_CMD_RESIZE_CQ]     	= ib_uverbs_resize_cq,
-	[IB_USER_VERBS_CMD_POLL_CQ]     	= ib_uverbs_poll_cq,
-	[IB_USER_VERBS_CMD_REQ_NOTIFY_CQ]     	= ib_uverbs_req_notify_cq,
-	[IB_USER_VERBS_CMD_DESTROY_CQ]    	= ib_uverbs_destroy_cq,
-	[IB_USER_VERBS_CMD_CREATE_QP]     	= ib_uverbs_create_qp,
-	[IB_USER_VERBS_CMD_QUERY_QP]     	= ib_uverbs_query_qp,
-	[IB_USER_VERBS_CMD_MODIFY_QP]     	= ib_uverbs_modify_qp,
-	[IB_USER_VERBS_CMD_DESTROY_QP]    	= ib_uverbs_destroy_qp,
-	[IB_USER_VERBS_CMD_POST_SEND]    	= ib_uverbs_post_send,
-	[IB_USER_VERBS_CMD_POST_RECV]    	= ib_uverbs_post_recv,
-	[IB_USER_VERBS_CMD_POST_SRQ_RECV]    	= ib_uverbs_post_srq_recv,
-	[IB_USER_VERBS_CMD_CREATE_AH]    	= ib_uverbs_create_ah,
-	[IB_USER_VERBS_CMD_DESTROY_AH]    	= ib_uverbs_destroy_ah,
-	[IB_USER_VERBS_CMD_ATTACH_MCAST]  	= ib_uverbs_attach_mcast,
-	[IB_USER_VERBS_CMD_DETACH_MCAST]  	= ib_uverbs_detach_mcast,
-	[IB_USER_VERBS_CMD_CREATE_SRQ]    	= ib_uverbs_create_srq,
-	[IB_USER_VERBS_CMD_MODIFY_SRQ]    	= ib_uverbs_modify_srq,
-	[IB_USER_VERBS_CMD_QUERY_SRQ]     	= ib_uverbs_query_srq,
-	[IB_USER_VERBS_CMD_DESTROY_SRQ]   	= ib_uverbs_destroy_srq,
+	[IB_USER_VERBS_CMD_CREATE_CQ]		= ib_uverbs_create_cq,
+	[IB_USER_VERBS_CMD_RESIZE_CQ]		= ib_uverbs_resize_cq,
+	[IB_USER_VERBS_CMD_POLL_CQ]		= ib_uverbs_poll_cq,
+	[IB_USER_VERBS_CMD_REQ_NOTIFY_CQ]	= ib_uverbs_req_notify_cq,
+	[IB_USER_VERBS_CMD_DESTROY_CQ]		= ib_uverbs_destroy_cq,
+	[IB_USER_VERBS_CMD_CREATE_QP]		= ib_uverbs_create_qp,
+	[IB_USER_VERBS_CMD_QUERY_QP]		= ib_uverbs_query_qp,
+	[IB_USER_VERBS_CMD_MODIFY_QP]		= ib_uverbs_modify_qp,
+	[IB_USER_VERBS_CMD_DESTROY_QP]		= ib_uverbs_destroy_qp,
+	[IB_USER_VERBS_CMD_POST_SEND]		= ib_uverbs_post_send,
+	[IB_USER_VERBS_CMD_POST_RECV]		= ib_uverbs_post_recv,
+	[IB_USER_VERBS_CMD_POST_SRQ_RECV]	= ib_uverbs_post_srq_recv,
+	[IB_USER_VERBS_CMD_CREATE_AH]		= ib_uverbs_create_ah,
+	[IB_USER_VERBS_CMD_DESTROY_AH]		= ib_uverbs_destroy_ah,
+	[IB_USER_VERBS_CMD_ATTACH_MCAST]	= ib_uverbs_attach_mcast,
+	[IB_USER_VERBS_CMD_DETACH_MCAST]	= ib_uverbs_detach_mcast,
+	[IB_USER_VERBS_CMD_CREATE_SRQ]		= ib_uverbs_create_srq,
+	[IB_USER_VERBS_CMD_MODIFY_SRQ]		= ib_uverbs_modify_srq,
+	[IB_USER_VERBS_CMD_QUERY_SRQ]		= ib_uverbs_query_srq,
+	[IB_USER_VERBS_CMD_DESTROY_SRQ]		= ib_uverbs_destroy_srq,
 };
 
 static void ib_uverbs_add_one(struct ib_device *device);
@@ -366,7 +365,7 @@
 
 static const struct file_operations uverbs_event_fops = {
 	.owner	 = THIS_MODULE,
-	.read 	 = ib_uverbs_event_read,
+	.read	 = ib_uverbs_event_read,
 	.poll    = ib_uverbs_event_poll,
 	.release = ib_uverbs_event_close,
 	.fasync  = ib_uverbs_event_fasync
@@ -601,14 +600,12 @@
 /*
  * ib_uverbs_open() does not need the BKL:
  *
- *  - dev_table[] accesses are protected by map_lock, the
- *    ib_uverbs_device structures are properly reference counted, and
+ *  - the ib_uverbs_device structures are properly reference counted and
  *    everything else is purely local to the file being created, so
  *    races against other open calls are not a problem;
  *  - there is no ioctl method to race against;
- *  - the device is added to dev_table[] as the last part of module
- *    initialization, the open method will either immediately run
- *    -ENXIO, or all required initialization will be done.
+ *  - the open method will either immediately return -ENXIO, or all
+ *    required initialization will be done.
  */
 static int ib_uverbs_open(struct inode *inode, struct file *filp)
 {
@@ -616,13 +613,10 @@
 	struct ib_uverbs_file *file;
 	int ret;
 
-	spin_lock(&map_lock);
-	dev = dev_table[iminor(inode) - IB_UVERBS_BASE_MINOR];
+	dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev);
 	if (dev)
 		kref_get(&dev->ref);
-	spin_unlock(&map_lock);
-
-	if (!dev)
+	else
 		return -ENXIO;
 
 	if (!try_module_get(dev->ib_dev->owner)) {
@@ -669,17 +663,17 @@
 }
 
 static const struct file_operations uverbs_fops = {
-	.owner 	 = THIS_MODULE,
-	.write 	 = ib_uverbs_write,
-	.open 	 = ib_uverbs_open,
+	.owner	 = THIS_MODULE,
+	.write	 = ib_uverbs_write,
+	.open	 = ib_uverbs_open,
 	.release = ib_uverbs_close
 };
 
 static const struct file_operations uverbs_mmap_fops = {
-	.owner 	 = THIS_MODULE,
-	.write 	 = ib_uverbs_write,
+	.owner	 = THIS_MODULE,
+	.write	 = ib_uverbs_write,
 	.mmap    = ib_uverbs_mmap,
-	.open 	 = ib_uverbs_open,
+	.open	 = ib_uverbs_open,
 	.release = ib_uverbs_close
 };
 
@@ -719,8 +713,38 @@
 }
 static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
 
+static dev_t overflow_maj;
+static DECLARE_BITMAP(overflow_map, IB_UVERBS_MAX_DEVICES);
+
+/*
+ * If we have more than IB_UVERBS_MAX_DEVICES, dynamically overflow by
+ * requesting a new major number and doubling the number of max devices we
+ * support. It's stupid, but simple.
+ */
+static int find_overflow_devnum(void)
+{
+	int ret;
+
+	if (!overflow_maj) {
+		ret = alloc_chrdev_region(&overflow_maj, 0, IB_UVERBS_MAX_DEVICES,
+					  "infiniband_verbs");
+		if (ret) {
+			printk(KERN_ERR "user_verbs: couldn't register dynamic device number\n");
+			return ret;
+		}
+	}
+
+	ret = find_first_zero_bit(overflow_map, IB_UVERBS_MAX_DEVICES);
+	if (ret >= IB_UVERBS_MAX_DEVICES)
+		return -1;
+
+	return ret;
+}
+
 static void ib_uverbs_add_one(struct ib_device *device)
 {
+	int devnum;
+	dev_t base;
 	struct ib_uverbs_device *uverbs_dev;
 
 	if (!device->alloc_ucontext)
@@ -734,28 +758,36 @@
 	init_completion(&uverbs_dev->comp);
 
 	spin_lock(&map_lock);
-	uverbs_dev->devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
-	if (uverbs_dev->devnum >= IB_UVERBS_MAX_DEVICES) {
+	devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
+	if (devnum >= IB_UVERBS_MAX_DEVICES) {
 		spin_unlock(&map_lock);
-		goto err;
+		devnum = find_overflow_devnum();
+		if (devnum < 0)
+			goto err;
+
+		spin_lock(&map_lock);
+		uverbs_dev->devnum = devnum + IB_UVERBS_MAX_DEVICES;
+		base = devnum + overflow_maj;
+		set_bit(devnum, overflow_map);
+	} else {
+		uverbs_dev->devnum = devnum;
+		base = devnum + IB_UVERBS_BASE_DEV;
+		set_bit(devnum, dev_map);
 	}
-	set_bit(uverbs_dev->devnum, dev_map);
 	spin_unlock(&map_lock);
 
 	uverbs_dev->ib_dev           = device;
 	uverbs_dev->num_comp_vectors = device->num_comp_vectors;
 
-	uverbs_dev->cdev = cdev_alloc();
-	if (!uverbs_dev->cdev)
-		goto err;
-	uverbs_dev->cdev->owner = THIS_MODULE;
-	uverbs_dev->cdev->ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
-	kobject_set_name(&uverbs_dev->cdev->kobj, "uverbs%d", uverbs_dev->devnum);
-	if (cdev_add(uverbs_dev->cdev, IB_UVERBS_BASE_DEV + uverbs_dev->devnum, 1))
+	cdev_init(&uverbs_dev->cdev, NULL);
+	uverbs_dev->cdev.owner = THIS_MODULE;
+	uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
+	kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum);
+	if (cdev_add(&uverbs_dev->cdev, base, 1))
 		goto err_cdev;
 
 	uverbs_dev->dev = device_create(uverbs_class, device->dma_device,
-					uverbs_dev->cdev->dev, uverbs_dev,
+					uverbs_dev->cdev.dev, uverbs_dev,
 					"uverbs%d", uverbs_dev->devnum);
 	if (IS_ERR(uverbs_dev->dev))
 		goto err_cdev;
@@ -765,20 +797,19 @@
 	if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version))
 		goto err_class;
 
-	spin_lock(&map_lock);
-	dev_table[uverbs_dev->devnum] = uverbs_dev;
-	spin_unlock(&map_lock);
-
 	ib_set_client_data(device, &uverbs_client, uverbs_dev);
 
 	return;
 
 err_class:
-	device_destroy(uverbs_class, uverbs_dev->cdev->dev);
+	device_destroy(uverbs_class, uverbs_dev->cdev.dev);
 
 err_cdev:
-	cdev_del(uverbs_dev->cdev);
-	clear_bit(uverbs_dev->devnum, dev_map);
+	cdev_del(&uverbs_dev->cdev);
+	if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES)
+		clear_bit(devnum, dev_map);
+	else
+		clear_bit(devnum, overflow_map);
 
 err:
 	kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
@@ -795,14 +826,13 @@
 		return;
 
 	dev_set_drvdata(uverbs_dev->dev, NULL);
-	device_destroy(uverbs_class, uverbs_dev->cdev->dev);
-	cdev_del(uverbs_dev->cdev);
+	device_destroy(uverbs_class, uverbs_dev->cdev.dev);
+	cdev_del(&uverbs_dev->cdev);
 
-	spin_lock(&map_lock);
-	dev_table[uverbs_dev->devnum] = NULL;
-	spin_unlock(&map_lock);
-
-	clear_bit(uverbs_dev->devnum, dev_map);
+	if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES)
+		clear_bit(uverbs_dev->devnum, dev_map);
+	else
+		clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map);
 
 	kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
 	wait_for_completion(&uverbs_dev->comp);
@@ -856,6 +886,8 @@
 	ib_unregister_client(&uverbs_client);
 	class_destroy(uverbs_class);
 	unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES);
+	if (overflow_maj)
+		unregister_chrdev_region(overflow_maj, IB_UVERBS_MAX_DEVICES);
 	idr_destroy(&ib_uverbs_pd_idr);
 	idr_destroy(&ib_uverbs_mr_idr);
 	idr_destroy(&ib_uverbs_mw_idr);
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c
index 0677fc7..a28e862 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_hal.c
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c
@@ -109,7 +109,6 @@
 		while (!CQ_VLD_ENTRY(rptr, cq->size_log2, cqe)) {
 			udelay(1);
 			if (i++ > 1000000) {
-				BUG_ON(1);
 				printk(KERN_ERR "%s: stalled rnic\n",
 				       rdev_p->dev_name);
 				return -EIO;
@@ -155,7 +154,7 @@
 	return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb);
 }
 
-int cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
+int cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq, int kernel)
 {
 	struct rdma_cq_setup setup;
 	int size = (1UL << (cq->size_log2)) * sizeof(struct t3_cqe);
@@ -163,12 +162,12 @@
 	cq->cqid = cxio_hal_get_cqid(rdev_p->rscp);
 	if (!cq->cqid)
 		return -ENOMEM;
-	cq->sw_queue = kzalloc(size, GFP_KERNEL);
-	if (!cq->sw_queue)
-		return -ENOMEM;
-	cq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev),
-					     (1UL << (cq->size_log2)) *
-					     sizeof(struct t3_cqe),
+	if (kernel) {
+		cq->sw_queue = kzalloc(size, GFP_KERNEL);
+		if (!cq->sw_queue)
+			return -ENOMEM;
+	}
+	cq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev), size,
 					     &(cq->dma_addr), GFP_KERNEL);
 	if (!cq->queue) {
 		kfree(cq->sw_queue);
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h
index f3d440c..073373c 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_hal.h
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h
@@ -53,7 +53,7 @@
 #define T3_MAX_PBL_SIZE 256
 #define T3_MAX_RQ_SIZE 1024
 #define T3_MAX_QP_DEPTH (T3_MAX_RQ_SIZE-1)
-#define T3_MAX_CQ_DEPTH 8192
+#define T3_MAX_CQ_DEPTH 262144
 #define T3_MAX_NUM_STAG (1<<15)
 #define T3_MAX_MR_SIZE 0x100000000ULL
 #define T3_PAGESIZE_MASK 0xffff000  /* 4KB-128MB */
@@ -157,7 +157,7 @@
 void cxio_rdev_close(struct cxio_rdev *rdev);
 int cxio_hal_cq_op(struct cxio_rdev *rdev, struct t3_cq *cq,
 		   enum t3_cq_opcode op, u32 credit);
-int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
+int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq, int kernel);
 int cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
 int cxio_resize_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
 void cxio_release_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx);
diff --git a/drivers/infiniband/hw/cxgb3/cxio_wr.h b/drivers/infiniband/hw/cxgb3/cxio_wr.h
index a197a5b..15073b2 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_wr.h
+++ b/drivers/infiniband/hw/cxgb3/cxio_wr.h
@@ -730,7 +730,22 @@
 
 static inline void cxio_set_wq_in_error(struct t3_wq *wq)
 {
-	wq->queue->wq_in_err.err = 1;
+	wq->queue->wq_in_err.err |= 1;
+}
+
+static inline void cxio_disable_wq_db(struct t3_wq *wq)
+{
+	wq->queue->wq_in_err.err |= 2;
+}
+
+static inline void cxio_enable_wq_db(struct t3_wq *wq)
+{
+	wq->queue->wq_in_err.err &= ~2;
+}
+
+static inline int cxio_wq_db_enabled(struct t3_wq *wq)
+{
+	return !(wq->queue->wq_in_err.err & 2);
 }
 
 static inline struct t3_cqe *cxio_next_hw_cqe(struct t3_cq *cq)
diff --git a/drivers/infiniband/hw/cxgb3/iwch.c b/drivers/infiniband/hw/cxgb3/iwch.c
index b0ea010..ee1d8b4 100644
--- a/drivers/infiniband/hw/cxgb3/iwch.c
+++ b/drivers/infiniband/hw/cxgb3/iwch.c
@@ -65,6 +65,46 @@
 static LIST_HEAD(dev_list);
 static DEFINE_MUTEX(dev_mutex);
 
+static int disable_qp_db(int id, void *p, void *data)
+{
+	struct iwch_qp *qhp = p;
+
+	cxio_disable_wq_db(&qhp->wq);
+	return 0;
+}
+
+static int enable_qp_db(int id, void *p, void *data)
+{
+	struct iwch_qp *qhp = p;
+
+	if (data)
+		ring_doorbell(qhp->rhp->rdev.ctrl_qp.doorbell, qhp->wq.qpid);
+	cxio_enable_wq_db(&qhp->wq);
+	return 0;
+}
+
+static void disable_dbs(struct iwch_dev *rnicp)
+{
+	spin_lock_irq(&rnicp->lock);
+	idr_for_each(&rnicp->qpidr, disable_qp_db, NULL);
+	spin_unlock_irq(&rnicp->lock);
+}
+
+static void enable_dbs(struct iwch_dev *rnicp, int ring_db)
+{
+	spin_lock_irq(&rnicp->lock);
+	idr_for_each(&rnicp->qpidr, enable_qp_db,
+		     (void *)(unsigned long)ring_db);
+	spin_unlock_irq(&rnicp->lock);
+}
+
+static void iwch_db_drop_task(struct work_struct *work)
+{
+	struct iwch_dev *rnicp = container_of(work, struct iwch_dev,
+					      db_drop_task.work);
+	enable_dbs(rnicp, 1);
+}
+
 static void rnic_init(struct iwch_dev *rnicp)
 {
 	PDBG("%s iwch_dev %p\n", __func__,  rnicp);
@@ -72,6 +112,7 @@
 	idr_init(&rnicp->qpidr);
 	idr_init(&rnicp->mmidr);
 	spin_lock_init(&rnicp->lock);
+	INIT_DELAYED_WORK(&rnicp->db_drop_task, iwch_db_drop_task);
 
 	rnicp->attr.max_qps = T3_MAX_NUM_QP - 32;
 	rnicp->attr.max_wrs = T3_MAX_QP_DEPTH;
@@ -147,6 +188,8 @@
 	mutex_lock(&dev_mutex);
 	list_for_each_entry_safe(dev, tmp, &dev_list, entry) {
 		if (dev->rdev.t3cdev_p == tdev) {
+			dev->rdev.flags = CXIO_ERROR_FATAL;
+			cancel_delayed_work_sync(&dev->db_drop_task);
 			list_del(&dev->entry);
 			iwch_unregister_device(dev);
 			cxio_rdev_close(&dev->rdev);
@@ -165,7 +208,8 @@
 	struct cxio_rdev *rdev = tdev->ulp;
 	struct iwch_dev *rnicp;
 	struct ib_event event;
-	u32    portnum = port_id + 1;
+	u32 portnum = port_id + 1;
+	int dispatch = 0;
 
 	if (!rdev)
 		return;
@@ -174,21 +218,49 @@
 	case OFFLOAD_STATUS_DOWN: {
 		rdev->flags = CXIO_ERROR_FATAL;
 		event.event  = IB_EVENT_DEVICE_FATAL;
+		dispatch = 1;
 		break;
 		}
 	case OFFLOAD_PORT_DOWN: {
 		event.event  = IB_EVENT_PORT_ERR;
+		dispatch = 1;
 		break;
 		}
 	case OFFLOAD_PORT_UP: {
 		event.event  = IB_EVENT_PORT_ACTIVE;
+		dispatch = 1;
+		break;
+		}
+	case OFFLOAD_DB_FULL: {
+		disable_dbs(rnicp);
+		break;
+		}
+	case OFFLOAD_DB_EMPTY: {
+		enable_dbs(rnicp, 1);
+		break;
+		}
+	case OFFLOAD_DB_DROP: {
+		unsigned long delay = 1000;
+		unsigned short r;
+
+		disable_dbs(rnicp);
+		get_random_bytes(&r, 2);
+		delay += r & 1023;
+
+		/*
+		 * delay is between 1000-2023 usecs.
+		 */
+		schedule_delayed_work(&rnicp->db_drop_task,
+			usecs_to_jiffies(delay));
 		break;
 		}
 	}
 
-	event.device = &rnicp->ibdev;
-	event.element.port_num = portnum;
-	ib_dispatch_event(&event);
+	if (dispatch) {
+		event.device = &rnicp->ibdev;
+		event.element.port_num = portnum;
+		ib_dispatch_event(&event);
+	}
 
 	return;
 }
diff --git a/drivers/infiniband/hw/cxgb3/iwch.h b/drivers/infiniband/hw/cxgb3/iwch.h
index 8473550..a1c4457 100644
--- a/drivers/infiniband/hw/cxgb3/iwch.h
+++ b/drivers/infiniband/hw/cxgb3/iwch.h
@@ -36,6 +36,7 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/idr.h>
+#include <linux/workqueue.h>
 
 #include <rdma/ib_verbs.h>
 
@@ -110,6 +111,7 @@
 	struct idr mmidr;
 	spinlock_t lock;
 	struct list_head entry;
+	struct delayed_work db_drop_task;
 };
 
 static inline struct iwch_dev *to_iwch_dev(struct ib_device *ibdev)
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index ed71755..47b35c6 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -187,7 +187,7 @@
 	entries = roundup_pow_of_two(entries);
 	chp->cq.size_log2 = ilog2(entries);
 
-	if (cxio_create_cq(&rhp->rdev, &chp->cq)) {
+	if (cxio_create_cq(&rhp->rdev, &chp->cq, !ucontext)) {
 		kfree(chp);
 		return ERR_PTR(-ENOMEM);
 	}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c
index 3eb8cec..b4d893d 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_qp.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c
@@ -452,7 +452,8 @@
 		++(qhp->wq.sq_wptr);
 	}
 	spin_unlock_irqrestore(&qhp->lock, flag);
-	ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
+	if (cxio_wq_db_enabled(&qhp->wq))
+		ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
 
 out:
 	if (err)
@@ -514,7 +515,8 @@
 		num_wrs--;
 	}
 	spin_unlock_irqrestore(&qhp->lock, flag);
-	ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
+	if (cxio_wq_db_enabled(&qhp->wq))
+		ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
 
 out:
 	if (err)
@@ -597,7 +599,8 @@
 	++(qhp->wq.sq_wptr);
 	spin_unlock_irqrestore(&qhp->lock, flag);
 
-	ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
+	if (cxio_wq_db_enabled(&qhp->wq))
+		ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
 
 	return err;
 }
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c
index 42be0b1..b2b6fea 100644
--- a/drivers/infiniband/hw/ehca/ehca_irq.c
+++ b/drivers/infiniband/hw/ehca/ehca_irq.c
@@ -548,11 +548,10 @@
 	struct ehca_eq *eq = &shca->eq;
 	struct ehca_eqe_cache_entry *eqe_cache = eq->eqe_cache;
 	u64 eqe_value, ret;
-	unsigned long flags;
 	int eqe_cnt, i;
 	int eq_empty = 0;
 
-	spin_lock_irqsave(&eq->irq_spinlock, flags);
+	spin_lock(&eq->irq_spinlock);
 	if (is_irq) {
 		const int max_query_cnt = 100;
 		int query_cnt = 0;
@@ -643,7 +642,7 @@
 	} while (1);
 
 unlock_irq_spinlock:
-	spin_unlock_irqrestore(&eq->irq_spinlock, flags);
+	spin_unlock(&eq->irq_spinlock);
 }
 
 void ehca_tasklet_eq(unsigned long data)
diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c
index 0338f1f..b105f66 100644
--- a/drivers/infiniband/hw/ehca/ehca_qp.c
+++ b/drivers/infiniband/hw/ehca/ehca_qp.c
@@ -55,9 +55,7 @@
 /*
  * attributes not supported by query qp
  */
-#define QP_ATTR_QUERY_NOT_SUPPORTED (IB_QP_MAX_DEST_RD_ATOMIC | \
-				     IB_QP_MAX_QP_RD_ATOMIC   | \
-				     IB_QP_ACCESS_FLAGS       | \
+#define QP_ATTR_QUERY_NOT_SUPPORTED (IB_QP_ACCESS_FLAGS       | \
 				     IB_QP_EN_SQD_ASYNC_NOTIFY)
 
 /*
diff --git a/drivers/infiniband/hw/ehca/ehca_sqp.c b/drivers/infiniband/hw/ehca/ehca_sqp.c
index 8c1213f..dba8f9f 100644
--- a/drivers/infiniband/hw/ehca/ehca_sqp.c
+++ b/drivers/infiniband/hw/ehca/ehca_sqp.c
@@ -222,7 +222,7 @@
 {
 	int ret;
 
-	if (!port_num || port_num > ibdev->phys_port_cnt)
+	if (!port_num || port_num > ibdev->phys_port_cnt || !in_wc)
 		return IB_MAD_RESULT_FAILURE;
 
 	/* accept only pma request */
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index a182352..ae75389 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -1214,7 +1214,7 @@
 static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
 			    void *wqe, unsigned *mlx_seg_len)
 {
-	struct ib_device *ib_dev = &to_mdev(sqp->qp.ibqp.device)->ib_dev;
+	struct ib_device *ib_dev = sqp->qp.ibqp.device;
 	struct mlx4_wqe_mlx_seg *mlx = wqe;
 	struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
 	struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c
index b9d09ba..4272c52 100644
--- a/drivers/infiniband/hw/nes/nes.c
+++ b/drivers/infiniband/hw/nes/nes.c
@@ -110,6 +110,7 @@
 
 static struct pci_device_id nes_pci_table[] = {
 	{PCI_VENDOR_ID_NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020, PCI_ANY_ID, PCI_ANY_ID},
+	{PCI_VENDOR_ID_NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020_KR, PCI_ANY_ID, PCI_ANY_ID},
 	{0}
 };
 
diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h
index 9884056..cc78fee1 100644
--- a/drivers/infiniband/hw/nes/nes.h
+++ b/drivers/infiniband/hw/nes/nes.h
@@ -64,8 +64,9 @@
  * NetEffect PCI vendor id and NE010 PCI device id.
  */
 #ifndef PCI_VENDOR_ID_NETEFFECT	/* not in pci.ids yet */
-#define PCI_VENDOR_ID_NETEFFECT       0x1678
-#define PCI_DEVICE_ID_NETEFFECT_NE020 0x0100
+#define PCI_VENDOR_ID_NETEFFECT          0x1678
+#define PCI_DEVICE_ID_NETEFFECT_NE020    0x0100
+#define PCI_DEVICE_ID_NETEFFECT_NE020_KR 0x0110
 #endif
 
 #define NE020_REV   4
@@ -193,8 +194,8 @@
 extern u32 cm_packets_received;
 extern u32 cm_packets_dropped;
 extern u32 cm_packets_retrans;
-extern u32 cm_listens_created;
-extern u32 cm_listens_destroyed;
+extern atomic_t cm_listens_created;
+extern atomic_t cm_listens_destroyed;
 extern u32 cm_backlog_drops;
 extern atomic_t cm_loopbacks;
 extern atomic_t cm_nodes_created;
diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c
index 39468c2..2a49ee4 100644
--- a/drivers/infiniband/hw/nes/nes_cm.c
+++ b/drivers/infiniband/hw/nes/nes_cm.c
@@ -67,8 +67,8 @@
 u32 cm_packets_retrans;
 u32 cm_packets_created;
 u32 cm_packets_received;
-u32 cm_listens_created;
-u32 cm_listens_destroyed;
+atomic_t cm_listens_created;
+atomic_t cm_listens_destroyed;
 u32 cm_backlog_drops;
 atomic_t cm_loopbacks;
 atomic_t cm_nodes_created;
@@ -1011,9 +1011,10 @@
 					event.cm_info.loc_port =
 							 loopback->loc_port;
 					event.cm_info.cm_id = loopback->cm_id;
+					add_ref_cm_node(loopback);
+					loopback->state = NES_CM_STATE_CLOSED;
 					cm_event_connect_error(&event);
 					cm_node->state = NES_CM_STATE_LISTENER_DESTROYED;
-					loopback->state = NES_CM_STATE_CLOSED;
 
 					rem_ref_cm_node(cm_node->cm_core,
 							 cm_node);
@@ -1042,7 +1043,7 @@
 		kfree(listener);
 		listener = NULL;
 		ret = 0;
-		cm_listens_destroyed++;
+		atomic_inc(&cm_listens_destroyed);
 	} else {
 		spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
 	}
@@ -3172,7 +3173,7 @@
 			g_cm_core->api->stop_listener(g_cm_core, (void *)cm_node);
 			return err;
 		}
-		cm_listens_created++;
+		atomic_inc(&cm_listens_created);
 	}
 
 	cm_id->add_ref(cm_id);
diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c
index b1c2cbb..ce7f538 100644
--- a/drivers/infiniband/hw/nes/nes_hw.c
+++ b/drivers/infiniband/hw/nes/nes_hw.c
@@ -748,16 +748,28 @@
 
 	if (hw_rev != NE020_REV) {
 		/* init serdes 0 */
-		if (wide_ppm_offset && (nesadapter->phy_type[0] == NES_PHY_TYPE_CX4))
-			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000FFFAA);
-		else
+		switch (nesadapter->phy_type[0]) {
+		case NES_PHY_TYPE_CX4:
+			if (wide_ppm_offset)
+				nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000FFFAA);
+			else
+				nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
+			break;
+		case NES_PHY_TYPE_KR:
 			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
-
-		if (nesadapter->phy_type[0] == NES_PHY_TYPE_PUMA_1G) {
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x00000000);
+			break;
+		case NES_PHY_TYPE_PUMA_1G:
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
 			sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0);
 			sds |= 0x00000100;
 			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, sds);
+			break;
+		default:
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
+			break;
 		}
+
 		if (!OneG_Mode)
 			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0, 0x11110000);
 
@@ -778,6 +790,9 @@
 			if (wide_ppm_offset)
 				nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000FFFAA);
 			break;
+		case NES_PHY_TYPE_KR:
+			nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x00000000);
+			break;
 		case NES_PHY_TYPE_PUMA_1G:
 			sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1);
 			sds |= 0x000000100;
@@ -1279,115 +1294,100 @@
 
 
 /**
- * nes_init_phy
+ * nes_init_1g_phy
  */
-int nes_init_phy(struct nes_device *nesdev)
+int nes_init_1g_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index)
 {
-	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	u32 counter = 0;
+	u16 phy_data;
+	int ret = 0;
+
+	nes_read_1G_phy_reg(nesdev, 1, phy_index, &phy_data);
+	nes_write_1G_phy_reg(nesdev, 23, phy_index, 0xb000);
+
+	/* Reset the PHY */
+	nes_write_1G_phy_reg(nesdev, 0, phy_index, 0x8000);
+	udelay(100);
+	counter = 0;
+	do {
+		nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
+		if (counter++ > 100) {
+			ret = -1;
+			break;
+		}
+	} while (phy_data & 0x8000);
+
+	/* Setting no phy loopback */
+	phy_data &= 0xbfff;
+	phy_data |= 0x1140;
+	nes_write_1G_phy_reg(nesdev, 0, phy_index,  phy_data);
+	nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
+	nes_read_1G_phy_reg(nesdev, 0x17, phy_index, &phy_data);
+	nes_read_1G_phy_reg(nesdev, 0x1e, phy_index, &phy_data);
+
+	/* Setting the interrupt mask */
+	nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data);
+	nes_write_1G_phy_reg(nesdev, 0x19, phy_index, 0xffee);
+	nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data);
+
+	/* turning on flow control */
+	nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data);
+	nes_write_1G_phy_reg(nesdev, 4, phy_index, (phy_data & ~(0x03E0)) | 0xc00);
+	nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data);
+
+	/* Clear Half duplex */
+	nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data);
+	nes_write_1G_phy_reg(nesdev, 9, phy_index, phy_data & ~(0x0100));
+	nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data);
+
+	nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
+	nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data | 0x0300);
+
+	return ret;
+}
+
+
+/**
+ * nes_init_2025_phy
+ */
+int nes_init_2025_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index)
+{
+	u32 temp_phy_data = 0;
+	u32 temp_phy_data2 = 0;
 	u32 counter = 0;
 	u32 sds;
 	u32 mac_index = nesdev->mac_index;
-	u32 tx_config = 0;
-	u16 phy_data;
-	u32 temp_phy_data = 0;
-	u32 temp_phy_data2 = 0;
-	u8  phy_type = nesadapter->phy_type[mac_index];
-	u8  phy_index = nesadapter->phy_index[mac_index];
+	int ret = 0;
+	unsigned int first_attempt = 1;
 
-	if ((nesadapter->OneG_Mode) &&
-	    (phy_type != NES_PHY_TYPE_PUMA_1G)) {
-		nes_debug(NES_DBG_PHY, "1G PHY, mac_index = %d.\n", mac_index);
-		if (phy_type == NES_PHY_TYPE_1G) {
-			tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG);
-			tx_config &= 0xFFFFFFE3;
-			tx_config |= 0x04;
-			nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config);
-		}
+	/* Check firmware heartbeat */
+	nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
+	temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+	udelay(1500);
+	nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
+	temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
 
-		nes_read_1G_phy_reg(nesdev, 1, phy_index, &phy_data);
-		nes_write_1G_phy_reg(nesdev, 23, phy_index, 0xb000);
-
-		/* Reset the PHY */
-		nes_write_1G_phy_reg(nesdev, 0, phy_index, 0x8000);
-		udelay(100);
-		counter = 0;
-		do {
-			nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
-			if (counter++ > 100)
-				break;
-		} while (phy_data & 0x8000);
-
-		/* Setting no phy loopback */
-		phy_data &= 0xbfff;
-		phy_data |= 0x1140;
-		nes_write_1G_phy_reg(nesdev, 0, phy_index,  phy_data);
-		nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
-		nes_read_1G_phy_reg(nesdev, 0x17, phy_index, &phy_data);
-		nes_read_1G_phy_reg(nesdev, 0x1e, phy_index, &phy_data);
-
-		/* Setting the interrupt mask */
-		nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data);
-		nes_write_1G_phy_reg(nesdev, 0x19, phy_index, 0xffee);
-		nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data);
-
-		/* turning on flow control */
-		nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data);
-		nes_write_1G_phy_reg(nesdev, 4, phy_index, (phy_data & ~(0x03E0)) | 0xc00);
-		nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data);
-
-		/* Clear Half duplex */
-		nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data);
-		nes_write_1G_phy_reg(nesdev, 9, phy_index, phy_data & ~(0x0100));
-		nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data);
-
-		nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
-		nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data | 0x0300);
-
-		return 0;
-	}
-
-	if ((phy_type == NES_PHY_TYPE_IRIS) ||
-	    (phy_type == NES_PHY_TYPE_ARGUS) ||
-	    (phy_type == NES_PHY_TYPE_SFP_D)) {
-		/* setup 10G MDIO operation */
-		tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG);
-		tx_config &= 0xFFFFFFE3;
-		tx_config |= 0x15;
-		nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config);
-	}
-	if ((phy_type == NES_PHY_TYPE_ARGUS) ||
-	    (phy_type == NES_PHY_TYPE_SFP_D)) {
-		u32 first_time = 1;
-
-		/* Check firmware heartbeat */
-		nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
+	if (temp_phy_data != temp_phy_data2) {
+		nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd);
 		temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-		udelay(1500);
-		nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
-		temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+		if ((temp_phy_data & 0xff) > 0x20)
+			return 0;
+		printk(PFX "Reinitialize external PHY\n");
+	}
 
-		if (temp_phy_data != temp_phy_data2) {
-			nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd);
-			temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-			if ((temp_phy_data & 0xff) > 0x20)
-				return 0;
-			printk(PFX "Reinitializing PHY\n");
-		}
+	/* no heartbeat, configure the PHY */
+	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0x0000, 0x8000);
+	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0000);
+	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
+	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
 
-		/* no heartbeat, configure the PHY */
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0x0000, 0x8000);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0000);
+	switch (phy_type) {
+	case NES_PHY_TYPE_ARGUS:
 		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
 		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
-		if (phy_type == NES_PHY_TYPE_ARGUS) {
-			nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C);
-			nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0008);
-			nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0001);
-		} else {
-			nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x0004);
-			nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0038);
-			nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0013);
-		}
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0008);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0001);
 		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0098);
 		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00);
 
@@ -1395,71 +1395,151 @@
 		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x0007);
 		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x000A);
 		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0009);
+		break;
 
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0028, 0xA528);
+	case NES_PHY_TYPE_SFP_D:
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x0004);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0038);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0013);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0098);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00);
 
-		/* Bring PHY out of reset */
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0002);
+		/* setup LEDs */
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x0007);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x000A);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0009);
+		break;
 
-		/* Check for heartbeat */
-		counter = 0;
-		mdelay(690);
+	case NES_PHY_TYPE_KR:
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0010);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0013);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0080);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00);
+
+		/* setup LEDs */
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x000B);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x0003);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0004);
+
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0022, 0x406D);
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0023, 0x0020);
+		break;
+	}
+
+	nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0028, 0xA528);
+
+	/* Bring PHY out of reset */
+	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0002);
+
+	/* Check for heartbeat */
+	counter = 0;
+	mdelay(690);
+	nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
+	temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+	do {
+		if (counter++ > 150) {
+			printk(PFX "No PHY heartbeat\n");
+			break;
+		}
+		mdelay(1);
 		nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
+		temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+	} while ((temp_phy_data2 == temp_phy_data));
+
+	/* wait for tracking */
+	counter = 0;
+	do {
+		nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd);
 		temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-		do {
-			if (counter++ > 150) {
-				printk(PFX "No PHY heartbeat\n");
+		if (counter++ > 300) {
+			if (((temp_phy_data & 0xff) == 0x0) && first_attempt) {
+				first_attempt = 0;
+				counter = 0;
+				/* reset AMCC PHY and try again */
+				nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x00c0);
+				nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x0040);
+				continue;
+			} else {
+				ret = 1;
 				break;
 			}
-			mdelay(1);
-			nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
-			temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-		} while ((temp_phy_data2 == temp_phy_data));
+		}
+		mdelay(10);
+	} while ((temp_phy_data & 0xff) < 0x30);
 
-		/* wait for tracking */
-		counter = 0;
-		do {
-			nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd);
-			temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-			if (counter++ > 300) {
-				if (((temp_phy_data & 0xff) == 0x0) && first_time) {
-					first_time = 0;
-					counter = 0;
-					/* reset AMCC PHY and try again */
-					nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x00c0);
-					nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x0040);
-					continue;
-				} else {
-					printk(PFX "PHY did not track\n");
-					break;
-				}
-			}
-			mdelay(10);
-		} while ((temp_phy_data & 0xff) < 0x30);
-
-		/* setup signal integrity */
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd003, 0x0000);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00D, 0x00FE);
-		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00E, 0x0032);
+	/* setup signal integrity */
+	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd003, 0x0000);
+	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00D, 0x00FE);
+	nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00E, 0x0032);
+	if (phy_type == NES_PHY_TYPE_KR) {
+		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00F, 0x000C);
+	} else {
 		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00F, 0x0002);
 		nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc314, 0x0063);
-
-		/* reset serdes */
-		sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 +
-				       mac_index * 0x200);
-		sds |= 0x1;
-		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 +
-				  mac_index * 0x200, sds);
-		sds &= 0xfffffffe;
-		nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 +
-				  mac_index * 0x200, sds);
-
-		counter = 0;
-		while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET) & 0x00000040) != 0x00000040)
-				&& (counter++ < 5000))
-			;
 	}
-	return 0;
+
+	/* reset serdes */
+	sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200);
+	sds |= 0x1;
+	nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200, sds);
+	sds &= 0xfffffffe;
+	nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200, sds);
+
+	counter = 0;
+	while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET) & 0x00000040) != 0x00000040)
+			&& (counter++ < 5000))
+		;
+
+	return ret;
+}
+
+
+/**
+ * nes_init_phy
+ */
+int nes_init_phy(struct nes_device *nesdev)
+{
+	struct nes_adapter *nesadapter = nesdev->nesadapter;
+	u32 mac_index = nesdev->mac_index;
+	u32 tx_config = 0;
+	unsigned long flags;
+	u8  phy_type = nesadapter->phy_type[mac_index];
+	u8  phy_index = nesadapter->phy_index[mac_index];
+	int ret = 0;
+
+	tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG);
+	if (phy_type == NES_PHY_TYPE_1G) {
+		/* setup 1G MDIO operation */
+		tx_config &= 0xFFFFFFE3;
+		tx_config |= 0x04;
+	} else {
+		/* setup 10G MDIO operation */
+		tx_config &= 0xFFFFFFE3;
+		tx_config |= 0x15;
+	}
+	nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config);
+
+	spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags);
+
+	switch (phy_type) {
+	case NES_PHY_TYPE_1G:
+		ret = nes_init_1g_phy(nesdev, phy_type, phy_index);
+		break;
+	case NES_PHY_TYPE_ARGUS:
+	case NES_PHY_TYPE_SFP_D:
+	case NES_PHY_TYPE_KR:
+		ret = nes_init_2025_phy(nesdev, phy_type, phy_index);
+		break;
+	}
+
+	spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags);
+
+	return ret;
 }
 
 
@@ -2460,23 +2540,9 @@
 			}
 		} else {
 			switch (nesadapter->phy_type[mac_index]) {
-			case NES_PHY_TYPE_IRIS:
-				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 1);
-				temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-				u32temp = 20;
-				do {
-					nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 1);
-					phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
-					if ((phy_data == temp_phy_data) || (!(--u32temp)))
-						break;
-					temp_phy_data = phy_data;
-				} while (1);
-				nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n",
-					__func__, phy_data, nesadapter->mac_link_down[mac_index] ? "DOWN" : "UP");
-				break;
-
 			case NES_PHY_TYPE_ARGUS:
 			case NES_PHY_TYPE_SFP_D:
+			case NES_PHY_TYPE_KR:
 				/* clear the alarms */
 				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0x0008);
 				nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc001);
@@ -3352,8 +3418,6 @@
 	u16 async_event_id;
 	u8 tcp_state;
 	u8 iwarp_state;
-	int must_disconn = 1;
-	int must_terminate = 0;
 	struct ib_event ibevent;
 
 	nes_debug(NES_DBG_AEQ, "\n");
@@ -3367,6 +3431,8 @@
 		BUG_ON(!context);
 	}
 
+	/* context is nesqp unless async_event_id == CQ ERROR */
+	nesqp = (struct nes_qp *)(unsigned long)context;
 	async_event_id = (u16)aeq_info;
 	tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT;
 	iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT;
@@ -3378,8 +3444,6 @@
 
 	switch (async_event_id) {
 		case NES_AEQE_AEID_LLP_FIN_RECEIVED:
-			nesqp = (struct nes_qp *)(unsigned long)context;
-
 			if (nesqp->term_flags)
 				return; /* Ignore it, wait for close complete */
 
@@ -3394,79 +3458,48 @@
 						async_event_id, nesqp->last_aeq, tcp_state);
 			}
 
-			if ((tcp_state != NES_AEQE_TCP_STATE_CLOSE_WAIT) ||
-					(nesqp->ibqp_state != IB_QPS_RTS)) {
-				/* FIN Received but tcp state or IB state moved on,
-						should expect a	close complete */
-				return;
-			}
-
+			break;
 		case NES_AEQE_AEID_LLP_CLOSE_COMPLETE:
-			nesqp = (struct nes_qp *)(unsigned long)context;
 			if (nesqp->term_flags) {
 				nes_terminate_done(nesqp, 0);
 				return;
 			}
+			spin_lock_irqsave(&nesqp->lock, flags);
+			nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING;
+			spin_unlock_irqrestore(&nesqp->lock, flags);
+			nes_hw_modify_qp(nesdev, nesqp, NES_CQP_QP_IWARP_STATE_CLOSING, 0, 0);
+			nes_cm_disconn(nesqp);
+			break;
 
-		case NES_AEQE_AEID_LLP_CONNECTION_RESET:
 		case NES_AEQE_AEID_RESET_SENT:
-			nesqp = (struct nes_qp *)(unsigned long)context;
-			if (async_event_id == NES_AEQE_AEID_RESET_SENT) {
-				tcp_state = NES_AEQE_TCP_STATE_CLOSED;
-			}
+			tcp_state = NES_AEQE_TCP_STATE_CLOSED;
 			spin_lock_irqsave(&nesqp->lock, flags);
 			nesqp->hw_iwarp_state = iwarp_state;
 			nesqp->hw_tcp_state = tcp_state;
 			nesqp->last_aeq = async_event_id;
-
-			if ((tcp_state == NES_AEQE_TCP_STATE_CLOSED) ||
-					(tcp_state == NES_AEQE_TCP_STATE_TIME_WAIT)) {
-				nesqp->hte_added = 0;
-				next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_DEL_HTE;
-			}
-
-			if ((nesqp->ibqp_state == IB_QPS_RTS) &&
-					((tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) ||
-					(async_event_id == NES_AEQE_AEID_LLP_CONNECTION_RESET))) {
-				switch (nesqp->hw_iwarp_state) {
-					case NES_AEQE_IWARP_STATE_RTS:
-						next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING;
-						nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING;
-						break;
-					case NES_AEQE_IWARP_STATE_TERMINATE:
-						must_disconn = 0; /* terminate path takes care of disconn */
-						if (nesqp->term_flags == 0)
-							must_terminate = 1;
-						break;
-				}
-			} else {
-				if (async_event_id ==  NES_AEQE_AEID_LLP_FIN_RECEIVED) {
-					/* FIN Received but ib state not RTS,
-							close complete will be on its way */
-					must_disconn = 0;
-				}
-			}
+			nesqp->hte_added = 0;
 			spin_unlock_irqrestore(&nesqp->lock, flags);
+			next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_DEL_HTE;
+			nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0);
+			nes_cm_disconn(nesqp);
+			break;
 
-			if (must_terminate)
-				nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL);
-			else if (must_disconn) {
-				if (next_iwarp_state) {
-					nes_debug(NES_DBG_AEQ, "issuing hw modifyqp for QP%u. next state = 0x%08X\n",
-						  nesqp->hwqp.qp_id, next_iwarp_state);
-					nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0);
-				}
-				nes_cm_disconn(nesqp);
-			}
+		case NES_AEQE_AEID_LLP_CONNECTION_RESET:
+			if (atomic_read(&nesqp->close_timer_started))
+				return;
+			spin_lock_irqsave(&nesqp->lock, flags);
+			nesqp->hw_iwarp_state = iwarp_state;
+			nesqp->hw_tcp_state = tcp_state;
+			nesqp->last_aeq = async_event_id;
+			spin_unlock_irqrestore(&nesqp->lock, flags);
+			nes_cm_disconn(nesqp);
 			break;
 
 		case NES_AEQE_AEID_TERMINATE_SENT:
-			nesqp = (struct nes_qp *)(unsigned long)context;
 			nes_terminate_send_fin(nesdev, nesqp, aeqe);
 			break;
 
 		case NES_AEQE_AEID_LLP_TERMINATE_RECEIVED:
-			nesqp = (struct nes_qp *)(unsigned long)context;
 			nes_terminate_received(nesdev, nesqp, aeqe);
 			break;
 
@@ -3480,7 +3513,8 @@
 		case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER:
 		case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION:
 		case NES_AEQE_AEID_AMP_TO_WRAP:
-			nesqp = (struct nes_qp *)(unsigned long)context;
+			printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_ACCESS_ERR\n",
+					nesqp->hwqp.qp_id, async_event_id);
 			nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_ACCESS_ERR);
 			break;
 
@@ -3488,7 +3522,6 @@
 		case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL:
 		case NES_AEQE_AEID_DDP_UBE_INVALID_MO:
 		case NES_AEQE_AEID_DDP_UBE_INVALID_QN:
-			nesqp = (struct nes_qp *)(unsigned long)context;
 			if (iwarp_opcode(nesqp, aeq_info) > IWARP_OPCODE_TERM) {
 				aeq_info &= 0xffff0000;
 				aeq_info |= NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE;
@@ -3530,7 +3563,8 @@
 		case NES_AEQE_AEID_STAG_ZERO_INVALID:
 		case NES_AEQE_AEID_ROE_INVALID_RDMA_READ_REQUEST:
 		case NES_AEQE_AEID_ROE_INVALID_RDMA_WRITE_OR_READ_RESP:
-			nesqp = (struct nes_qp *)(unsigned long)context;
+			printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_FATAL\n",
+					nesqp->hwqp.qp_id, async_event_id);
 			nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL);
 			break;
 
diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h
index 084be0e..9b1e7f8 100644
--- a/drivers/infiniband/hw/nes/nes_hw.h
+++ b/drivers/infiniband/hw/nes/nes_hw.h
@@ -37,12 +37,12 @@
 
 #define NES_PHY_TYPE_CX4       1
 #define NES_PHY_TYPE_1G        2
-#define NES_PHY_TYPE_IRIS      3
 #define NES_PHY_TYPE_ARGUS     4
 #define NES_PHY_TYPE_PUMA_1G   5
 #define NES_PHY_TYPE_PUMA_10G  6
 #define NES_PHY_TYPE_GLADIUS   7
 #define NES_PHY_TYPE_SFP_D     8
+#define NES_PHY_TYPE_KR        9
 
 #define NES_MULTICAST_PF_MAX 8
 
diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c
index ab11027..7dd6ce6 100644
--- a/drivers/infiniband/hw/nes/nes_nic.c
+++ b/drivers/infiniband/hw/nes/nes_nic.c
@@ -1230,8 +1230,8 @@
 	target_stat_values[++index] = cm_packets_received;
 	target_stat_values[++index] = cm_packets_dropped;
 	target_stat_values[++index] = cm_packets_retrans;
-	target_stat_values[++index] = cm_listens_created;
-	target_stat_values[++index] = cm_listens_destroyed;
+	target_stat_values[++index] = atomic_read(&cm_listens_created);
+	target_stat_values[++index] = atomic_read(&cm_listens_destroyed);
 	target_stat_values[++index] = cm_backlog_drops;
 	target_stat_values[++index] = atomic_read(&cm_loopbacks);
 	target_stat_values[++index] = atomic_read(&cm_nodes_created);
@@ -1461,9 +1461,9 @@
 		}
 		return 0;
 	}
-	if ((phy_type == NES_PHY_TYPE_IRIS) ||
-	    (phy_type == NES_PHY_TYPE_ARGUS) ||
-	    (phy_type == NES_PHY_TYPE_SFP_D)) {
+	if ((phy_type == NES_PHY_TYPE_ARGUS) ||
+	    (phy_type == NES_PHY_TYPE_SFP_D) ||
+	    (phy_type == NES_PHY_TYPE_KR)) {
 		et_cmd->transceiver = XCVR_EXTERNAL;
 		et_cmd->port        = PORT_FIBRE;
 		et_cmd->supported   = SUPPORTED_FIBRE;
@@ -1583,8 +1583,7 @@
 	struct net_device *netdev;
 	struct nic_qp_map *curr_qp_map;
 	u32 u32temp;
-	u16 phy_data;
-	u16 temp_phy_data;
+	u8 phy_type = nesdev->nesadapter->phy_type[nesdev->mac_index];
 
 	netdev = alloc_etherdev(sizeof(struct nes_vnic));
 	if (!netdev) {
@@ -1692,65 +1691,23 @@
 
 	if ((nesdev->netdev_count == 0) &&
 	    ((PCI_FUNC(nesdev->pcidev->devfn) == nesdev->mac_index) ||
-	     ((nesdev->nesadapter->phy_type[nesdev->mac_index] == NES_PHY_TYPE_PUMA_1G) &&
+	     ((phy_type == NES_PHY_TYPE_PUMA_1G) &&
 	      (((PCI_FUNC(nesdev->pcidev->devfn) == 1) && (nesdev->mac_index == 2)) ||
 	       ((PCI_FUNC(nesdev->pcidev->devfn) == 2) && (nesdev->mac_index == 1)))))) {
-		/*
-		 * nes_debug(NES_DBG_INIT, "Setting up PHY interrupt mask. Using register index 0x%04X\n",
-		 *		NES_IDX_PHY_PCS_CONTROL_STATUS0 + (0x200 * (nesvnic->logical_port & 1)));
-		 */
 		u32temp = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 +
 				(0x200 * (nesdev->mac_index & 1)));
-		if (nesdev->nesadapter->phy_type[nesdev->mac_index] != NES_PHY_TYPE_PUMA_1G) {
+		if (phy_type != NES_PHY_TYPE_PUMA_1G) {
 			u32temp |= 0x00200000;
 			nes_write_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 +
 				(0x200 * (nesdev->mac_index & 1)), u32temp);
 		}
 
-		u32temp = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 +
-				(0x200 * (nesdev->mac_index & 1)));
-
-		if ((u32temp&0x0f1f0000) == 0x0f0f0000) {
-			if (nesdev->nesadapter->phy_type[nesdev->mac_index] == NES_PHY_TYPE_IRIS) {
-				nes_init_phy(nesdev);
-				nes_read_10G_phy_reg(nesdev, nesdev->nesadapter->phy_index[nesdev->mac_index], 1, 1);
-				temp_phy_data = (u16)nes_read_indexed(nesdev,
-									NES_IDX_MAC_MDIO_CONTROL);
-				u32temp = 20;
-				do {
-					nes_read_10G_phy_reg(nesdev, nesdev->nesadapter->phy_index[nesdev->mac_index], 1, 1);
-					phy_data = (u16)nes_read_indexed(nesdev,
-									NES_IDX_MAC_MDIO_CONTROL);
-					if ((phy_data == temp_phy_data) || (!(--u32temp)))
-						break;
-					temp_phy_data = phy_data;
-				} while (1);
-				if (phy_data & 4) {
-					nes_debug(NES_DBG_INIT, "The Link is UP!!.\n");
-					nesvnic->linkup = 1;
-				} else {
-					nes_debug(NES_DBG_INIT, "The Link is DOWN!!.\n");
-				}
-			} else {
-				nes_debug(NES_DBG_INIT, "The Link is UP!!.\n");
-				nesvnic->linkup = 1;
-			}
-		} else if (nesdev->nesadapter->phy_type[nesdev->mac_index] == NES_PHY_TYPE_PUMA_1G) {
-			nes_debug(NES_DBG_INIT, "mac_index=%d, logical_port=%d, u32temp=0x%04X, PCI_FUNC=%d\n",
-				nesdev->mac_index, nesvnic->logical_port, u32temp, PCI_FUNC(nesdev->pcidev->devfn));
-			if (((nesdev->mac_index < 2) && ((u32temp&0x01010000) == 0x01010000)) ||
-			    ((nesdev->mac_index > 1) && ((u32temp&0x02020000) == 0x02020000)))  {
-				nes_debug(NES_DBG_INIT, "The Link is UP!!.\n");
-				nesvnic->linkup = 1;
-			}
-		}
 		/* clear the MAC interrupt status, assumes direct logical to physical mapping */
 		u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (0x200 * nesdev->mac_index));
 		nes_debug(NES_DBG_INIT, "Phy interrupt status = 0x%X.\n", u32temp);
 		nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (0x200 * nesdev->mac_index), u32temp);
 
-		if (nesdev->nesadapter->phy_type[nesdev->mac_index] != NES_PHY_TYPE_IRIS)
-			nes_init_phy(nesdev);
+		nes_init_phy(nesdev);
 
 	}
 
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index 64d3136..815725f 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -228,7 +228,7 @@
 	/* Check for SQ overflow */
 	if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) {
 		spin_unlock_irqrestore(&nesqp->lock, flags);
-		return -EINVAL;
+		return -ENOMEM;
 	}
 
 	wqe = &nesqp->hwqp.sq_vbase[head];
@@ -3294,7 +3294,7 @@
 
 		/* Check for SQ overflow */
 		if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) {
-			err = -EINVAL;
+			err = -ENOMEM;
 			break;
 		}
 
@@ -3577,7 +3577,7 @@
 		}
 		/* Check for RQ overflow */
 		if (((head + (2 * qsize) - nesqp->hwqp.rq_tail) % qsize) == (qsize - 1)) {
-			err = -EINVAL;
+			err = -ENOMEM;
 			break;
 		}
 
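The overflow tests above compute ring occupancy from the head and tail indices without caring whether the ring has wrapped, and a full queue now reads as -ENOMEM (a transient out-of-resources condition) rather than -EINVAL (a malformed work request). A minimal standalone sketch of that arithmetic, using illustrative names rather than the driver's:

/*
 * Minimal sketch of the ring-occupancy test used in the checks above.
 * Names and the main() harness are illustrative, not driver code; head
 * and tail are assumed to already be reduced modulo qsize.
 */
#include <assert.h>

static int ring_full(unsigned int head, unsigned int tail, unsigned int qsize)
{
	/* Adding 2 * qsize keeps the subtraction non-negative before the
	 * modulo, so "used" is the occupied-slot count in 0..qsize-1. */
	unsigned int used = (head + 2 * qsize - tail) % qsize;

	/* One slot is left unused so an empty ring (used == 0) stays
	 * distinguishable from a full one, hence full at qsize - 1. */
	return used == qsize - 1;
}

int main(void)
{
	assert(!ring_full(0, 0, 8));	/* empty ring */
	assert(ring_full(7, 0, 8));	/* 7 of 8 slots used, no wrap */
	assert(ring_full(2, 3, 8));	/* wrapped case, still 7 used */
	return 0;
}

Callers that treat -ENOMEM as "back off and retry" get the intended behaviour for a momentarily full send or receive queue.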
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
index e9795f6..d10b4ec 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
@@ -55,9 +55,7 @@
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 
 	coal->rx_coalesce_usecs = priv->ethtool.coalesce_usecs;
-	coal->tx_coalesce_usecs = priv->ethtool.coalesce_usecs;
 	coal->rx_max_coalesced_frames = priv->ethtool.max_coalesced_frames;
-	coal->tx_max_coalesced_frames = priv->ethtool.max_coalesced_frames;
 
 	return 0;
 }
@@ -69,10 +67,8 @@
 	int ret;
 
 	/*
-	 * Since IPoIB uses a single CQ for both rx and tx, we assume
-	 * that rx params dictate the configuration.  These values are
-	 * saved in the private data and returned when ipoib_get_coalesce()
-	 * is called.
+	 * These values are saved in the private data and returned
+	 * when ipoib_get_coalesce() is called
 	 */
 	if (coal->rx_coalesce_usecs       > 0xffff ||
 	    coal->rx_max_coalesced_frames > 0xffff)
@@ -85,8 +81,6 @@
 		return ret;
 	}
 
-	coal->tx_coalesce_usecs       = coal->rx_coalesce_usecs;
-	coal->tx_max_coalesced_frames = coal->rx_max_coalesced_frames;
 	priv->ethtool.coalesce_usecs       = coal->rx_coalesce_usecs;
 	priv->ethtool.max_coalesced_frames = coal->rx_max_coalesced_frames;
 
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 5f7a6fc..71237f8f 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -128,6 +128,28 @@
 	return 0;
 }
 
+int iser_initialize_task_headers(struct iscsi_task *task,
+						struct iser_tx_desc *tx_desc)
+{
+	struct iscsi_iser_conn *iser_conn = task->conn->dd_data;
+	struct iser_device     *device    = iser_conn->ib_conn->device;
+	struct iscsi_iser_task *iser_task = task->dd_data;
+	u64 dma_addr;
+
+	dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc,
+				ISER_HEADERS_LEN, DMA_TO_DEVICE);
+	if (ib_dma_mapping_error(device->ib_device, dma_addr))
+		return -ENOMEM;
+
+	tx_desc->dma_addr = dma_addr;
+	tx_desc->tx_sg[0].addr   = tx_desc->dma_addr;
+	tx_desc->tx_sg[0].length = ISER_HEADERS_LEN;
+	tx_desc->tx_sg[0].lkey   = device->mr->lkey;
+
+	iser_task->headers_initialized	= 1;
+	iser_task->iser_conn		= iser_conn;
+	return 0;
+}
 /**
  * iscsi_iser_task_init - Initialize task
  * @task: iscsi task
@@ -137,17 +159,17 @@
 static int
 iscsi_iser_task_init(struct iscsi_task *task)
 {
-	struct iscsi_iser_conn *iser_conn  = task->conn->dd_data;
 	struct iscsi_iser_task *iser_task = task->dd_data;
 
+	if (!iser_task->headers_initialized)
+		if (iser_initialize_task_headers(task, &iser_task->desc))
+			return -ENOMEM;
+
 	/* mgmt task */
-	if (!task->sc) {
-		iser_task->desc.data = task->data;
+	if (!task->sc)
 		return 0;
-	}
 
 	iser_task->command_sent = 0;
-	iser_task->iser_conn    = iser_conn;
 	iser_task_rdma_init(iser_task);
 	return 0;
 }
@@ -168,7 +190,7 @@
 {
 	int error = 0;
 
-	iser_dbg("task deq [cid %d itt 0x%x]\n", conn->id, task->itt);
+	iser_dbg("mtask xmit [cid %d itt 0x%x]\n", conn->id, task->itt);
 
 	error = iser_send_control(conn, task);
 
@@ -178,9 +200,6 @@
 	 * - if yes, the task is recycled at iscsi_complete_pdu
 	 * - if no,  the task is recycled at iser_snd_completion
 	 */
-	if (error && error != -ENOBUFS)
-		iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED);
-
 	return error;
 }
 
@@ -232,7 +251,7 @@
 			   task->imm_count, task->unsol_r2t.data_length);
 	}
 
-	iser_dbg("task deq [cid %d itt 0x%x]\n",
+	iser_dbg("ctask xmit [cid %d itt 0x%x]\n",
 		   conn->id, task->itt);
 
 	/* Send the cmd PDU */
@@ -248,8 +267,6 @@
 		error = iscsi_iser_task_xmit_unsol_data(conn, task);
 
  iscsi_iser_task_xmit_exit:
-	if (error && error != -ENOBUFS)
-		iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED);
 	return error;
 }
 
@@ -283,7 +300,7 @@
 	 * due to issues with the login code re iser semantics
 	 * this is not set in iscsi_conn_setup - FIXME
 	 */
-	conn->max_recv_dlength = 128;
+	conn->max_recv_dlength = ISER_RECV_DATA_SEG_LEN;
 
 	iser_conn = conn->dd_data;
 	conn->dd_data = iser_conn;
@@ -401,7 +418,7 @@
 	struct Scsi_Host *shost;
 	struct iser_conn *ib_conn;
 
-	shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 1);
+	shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0);
 	if (!shost)
 		return NULL;
 	shost->transportt = iscsi_iser_scsi_transport;
@@ -675,7 +692,7 @@
 	memset(&ig, 0, sizeof(struct iser_global));
 
 	ig.desc_cache = kmem_cache_create("iser_descriptors",
-					  sizeof (struct iser_desc),
+					  sizeof(struct iser_tx_desc),
 					  0, SLAB_HWCACHE_ALIGN,
 					  NULL);
 	if (ig.desc_cache == NULL)
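Setting conn->max_recv_dlength to ISER_RECV_DATA_SEG_LEN above ties the advertised MaxRecvDataSegmentLength to the fixed data[] array carried by each pre-posted receive descriptor (struct iser_rx_desc in iscsi_iser.h below), so a well-behaved target never sends more payload than a posted buffer can hold. A hedged compile-time sketch of that layout assumption, not part of the patch and assuming it were dropped into some init path:

/*
 * Hypothetical sanity checks, not in the patch: the rx path maps exactly
 * ISER_RX_PAYLOAD_SIZE bytes of each descriptor, so the headers plus the
 * data[] array must end where dma_addr begins, and the padded descriptor
 * is expected to pack out to 256 bytes.
 */
#include <linux/kernel.h>
#include <linux/bug.h>
#include "iscsi_iser.h"

static inline void iser_rx_desc_layout_check(void)
{
	BUILD_BUG_ON(offsetof(struct iser_rx_desc, dma_addr) !=
		     ISER_RX_PAYLOAD_SIZE);
	BUILD_BUG_ON(sizeof(struct iser_rx_desc) != 256);
}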
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index 9d529ca..036934c 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -102,9 +102,9 @@
 #define ISER_MAX_TX_MISC_PDUS		6 /* NOOP_OUT(2), TEXT(1),         *
 					   * SCSI_TMFUNC(2), LOGOUT(1) */
 
-#define ISER_QP_MAX_RECV_DTOS		(ISCSI_DEF_XMIT_CMDS_MAX + \
-					ISER_MAX_RX_MISC_PDUS    +  \
-					ISER_MAX_TX_MISC_PDUS)
+#define ISER_QP_MAX_RECV_DTOS		(ISCSI_DEF_XMIT_CMDS_MAX)
+
+#define ISER_MIN_POSTED_RX		(ISCSI_DEF_XMIT_CMDS_MAX >> 2)
 
 /* the max TX (send) WR supported by the iSER QP is defined by                 *
  * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect   *
@@ -132,6 +132,12 @@
 	__be64  read_va;
 } __attribute__((packed));
 
+/* Constant PDU lengths calculations */
+#define ISER_HEADERS_LEN  (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr))
+
+#define ISER_RECV_DATA_SEG_LEN	128
+#define ISER_RX_PAYLOAD_SIZE	(ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)
+#define ISER_RX_LOGIN_SIZE	(ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN)
 
 /* Length of an object name string */
 #define ISER_OBJECT_NAME_SIZE		    64
@@ -187,51 +193,43 @@
 	struct iser_mem_reg     reg;        /* memory registration info        */
 	void                    *virt_addr;
 	struct iser_device      *device;    /* device->device for dma_unmap    */
-	u64                     dma_addr;   /* if non zero, addr for dma_unmap */
 	enum dma_data_direction direction;  /* direction for dma_unmap	       */
 	unsigned int            data_size;
-	atomic_t                ref_count;  /* refcount, freed when dec to 0   */
-};
-
-#define MAX_REGD_BUF_VECTOR_LEN	2
-
-struct iser_dto {
-	struct iscsi_iser_task *task;
-	struct iser_conn *ib_conn;
-	int                        notify_enable;
-
-	/* vector of registered buffers */
-	unsigned int               regd_vector_len;
-	struct iser_regd_buf       *regd[MAX_REGD_BUF_VECTOR_LEN];
-
-	/* offset into the registered buffer may be specified */
-	unsigned int               offset[MAX_REGD_BUF_VECTOR_LEN];
-
-	/* a smaller size may be specified, if 0, then full size is used */
-	unsigned int               used_sz[MAX_REGD_BUF_VECTOR_LEN];
 };
 
 enum iser_desc_type {
-	ISCSI_RX,
 	ISCSI_TX_CONTROL ,
 	ISCSI_TX_SCSI_COMMAND,
 	ISCSI_TX_DATAOUT
 };
 
-struct iser_desc {
+struct iser_tx_desc {
 	struct iser_hdr              iser_header;
 	struct iscsi_hdr             iscsi_header;
-	struct iser_regd_buf         hdr_regd_buf;
-	void                         *data;         /* used by RX & TX_CONTROL */
-	struct iser_regd_buf         data_regd_buf; /* used by RX & TX_CONTROL */
 	enum   iser_desc_type        type;
-	struct iser_dto              dto;
+	u64		             dma_addr;
+	/* sg[0] points to the iser/iscsi headers; sg[1] optionally points to
+	   immediate data, an unsolicited data-out or control (login, text) */
+	struct ib_sge		     tx_sg[2];
+	int                          num_sge;
 };
 
+#define ISER_RX_PAD_SIZE	(256 - (ISER_RX_PAYLOAD_SIZE + \
+					sizeof(u64) + sizeof(struct ib_sge)))
+struct iser_rx_desc {
+	struct iser_hdr              iser_header;
+	struct iscsi_hdr             iscsi_header;
+	char		             data[ISER_RECV_DATA_SEG_LEN];
+	u64		             dma_addr;
+	struct ib_sge		     rx_sg;
+	char		             pad[ISER_RX_PAD_SIZE];
+} __attribute__((packed));
+
 struct iser_device {
 	struct ib_device             *ib_device;
 	struct ib_pd	             *pd;
-	struct ib_cq	             *cq;
+	struct ib_cq	             *rx_cq;
+	struct ib_cq	             *tx_cq;
 	struct ib_mr	             *mr;
 	struct tasklet_struct	     cq_tasklet;
 	struct list_head             ig_list; /* entry in ig devices list */
@@ -250,15 +248,18 @@
 	struct ib_fmr_pool           *fmr_pool;     /* pool of IB FMRs         */
 	int                          disc_evt_flag; /* disconn event delivered */
 	wait_queue_head_t	     wait;          /* waitq for conn/disconn  */
-	atomic_t                     post_recv_buf_count; /* posted rx count   */
+	int                          post_recv_buf_count; /* posted rx count  */
 	atomic_t                     post_send_buf_count; /* posted tx count   */
-	atomic_t                     unexpected_pdu_count;/* count of received *
-							   * unexpected pdus   *
-							   * not yet retired   */
 	char 			     name[ISER_OBJECT_NAME_SIZE];
 	struct iser_page_vec         *page_vec;     /* represents SG to fmr maps*
 						     * maps serialized as tx is*/
 	struct list_head	     conn_list;       /* entry in ig conn list */
+
+	char  			     *login_buf;
+	u64 			     login_dma;
+	unsigned int 		     rx_desc_head;
+	struct iser_rx_desc	     *rx_descs;
+	struct ib_recv_wr	     rx_wr[ISER_MIN_POSTED_RX];
 };
 
 struct iscsi_iser_conn {
@@ -267,7 +268,7 @@
 };
 
 struct iscsi_iser_task {
-	struct iser_desc             desc;
+	struct iser_tx_desc          desc;
 	struct iscsi_iser_conn	     *iser_conn;
 	enum iser_task_status 	     status;
 	int                          command_sent;  /* set if command  sent  */
@@ -275,6 +276,7 @@
 	struct iser_regd_buf         rdma_regd[ISER_DIRS_NUM];/* regd rdma buf */
 	struct iser_data_buf         data[ISER_DIRS_NUM];     /* orig. data des*/
 	struct iser_data_buf         data_copy[ISER_DIRS_NUM];/* contig. copy  */
+	int                          headers_initialized;
 };
 
 struct iser_page_vec {
@@ -322,22 +324,17 @@
 
 void iser_conn_terminate(struct iser_conn *ib_conn);
 
-void iser_rcv_completion(struct iser_desc *desc,
-			 unsigned long    dto_xfer_len);
+void iser_rcv_completion(struct iser_rx_desc *desc,
+			 unsigned long    dto_xfer_len,
+			struct iser_conn *ib_conn);
 
-void iser_snd_completion(struct iser_desc *desc);
+void iser_snd_completion(struct iser_tx_desc *desc, struct iser_conn *ib_conn);
 
 void iser_task_rdma_init(struct iscsi_iser_task *task);
 
 void iser_task_rdma_finalize(struct iscsi_iser_task *task);
 
-void iser_dto_buffs_release(struct iser_dto *dto);
-
-int  iser_regd_buff_release(struct iser_regd_buf *regd_buf);
-
-void iser_reg_single(struct iser_device      *device,
-		     struct iser_regd_buf    *regd_buf,
-		     enum dma_data_direction direction);
+void iser_free_rx_descriptors(struct iser_conn *ib_conn);
 
 void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *task,
 				     enum iser_data_dir         cmd_dir);
@@ -356,11 +353,9 @@
 
 void iser_unreg_mem(struct iser_mem_reg *mem_reg);
 
-int  iser_post_recv(struct iser_desc *rx_desc);
-int  iser_post_send(struct iser_desc *tx_desc);
-
-int iser_conn_state_comp(struct iser_conn *ib_conn,
-			 enum iser_ib_conn_state comp);
+int  iser_post_recvl(struct iser_conn *ib_conn);
+int  iser_post_recvm(struct iser_conn *ib_conn, int count);
+int  iser_post_send(struct iser_conn *ib_conn, struct iser_tx_desc *tx_desc);
 
 int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,
 			    struct iser_data_buf       *data,
@@ -368,4 +363,6 @@
 			    enum   dma_data_direction  dma_dir);
 
 void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task);
+int  iser_initialize_task_headers(struct iscsi_task *task,
+			struct iser_tx_desc *tx_desc);
 #endif
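One property these constants rely on, stated here as an assumption rather than something the patch asserts: iser_post_recvm() (in iser_verbs.c further down) wraps the descriptor index with "& (ISER_QP_MAX_RECV_DTOS - 1)", which is only correct while the ring depth is a power of two; with the default ISCSI_DEF_XMIT_CMDS_MAX of 64 that holds, and ISER_MIN_POSTED_RX then works out to a replenish batch of 16. A minimal guard one could add (hypothetical, not in the patch):

/*
 * Hypothetical guard, not part of the patch: the rx descriptor ring is
 * indexed with a power-of-two mask in iser_post_recvm(), so the depth
 * must stay a power of two if ISER_QP_MAX_RECV_DTOS is ever retuned.
 */
#include <linux/bug.h>
#include "iscsi_iser.h"

static inline void iser_rx_ring_depth_check(void)
{
	BUILD_BUG_ON(ISER_QP_MAX_RECV_DTOS & (ISER_QP_MAX_RECV_DTOS - 1));
}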
diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
index 9de6402..0b9ef07 100644
--- a/drivers/infiniband/ulp/iser/iser_initiator.c
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -39,29 +39,6 @@
 
 #include "iscsi_iser.h"
 
-/* Constant PDU lengths calculations */
-#define ISER_TOTAL_HEADERS_LEN  (sizeof (struct iser_hdr) + \
-				 sizeof (struct iscsi_hdr))
-
-/* iser_dto_add_regd_buff - increments the reference count for *
- * the registered buffer & adds it to the DTO object           */
-static void iser_dto_add_regd_buff(struct iser_dto *dto,
-				   struct iser_regd_buf *regd_buf,
-				   unsigned long use_offset,
-				   unsigned long use_size)
-{
-	int add_idx;
-
-	atomic_inc(&regd_buf->ref_count);
-
-	add_idx = dto->regd_vector_len;
-	dto->regd[add_idx] = regd_buf;
-	dto->used_sz[add_idx] = use_size;
-	dto->offset[add_idx] = use_offset;
-
-	dto->regd_vector_len++;
-}
-
 /* Register user buffer memory and initialize passive rdma
  *  dto descriptor. Total data size is stored in
  *  iser_task->data[ISER_DIR_IN].data_len
@@ -122,9 +99,9 @@
 	struct iscsi_iser_task *iser_task = task->dd_data;
 	struct iser_regd_buf *regd_buf;
 	int err;
-	struct iser_dto *send_dto = &iser_task->desc.dto;
 	struct iser_hdr *hdr = &iser_task->desc.iser_header;
 	struct iser_data_buf *buf_out = &iser_task->data[ISER_DIR_OUT];
+	struct ib_sge *tx_dsg = &iser_task->desc.tx_sg[1];
 
 	err = iser_dma_map_task_data(iser_task,
 				     buf_out,
@@ -163,135 +140,100 @@
 	if (imm_sz > 0) {
 		iser_dbg("Cmd itt:%d, WRITE, adding imm.data sz: %d\n",
 			 task->itt, imm_sz);
-		iser_dto_add_regd_buff(send_dto,
-				       regd_buf,
-				       0,
-				       imm_sz);
+		tx_dsg->addr   = regd_buf->reg.va;
+		tx_dsg->length = imm_sz;
+		tx_dsg->lkey   = regd_buf->reg.lkey;
+		iser_task->desc.num_sge = 2;
 	}
 
 	return 0;
 }
 
-/**
- * iser_post_receive_control - allocates, initializes and posts receive DTO.
- */
-static int iser_post_receive_control(struct iscsi_conn *conn)
-{
-	struct iscsi_iser_conn *iser_conn = conn->dd_data;
-	struct iser_desc     *rx_desc;
-	struct iser_regd_buf *regd_hdr;
-	struct iser_regd_buf *regd_data;
-	struct iser_dto      *recv_dto = NULL;
-	struct iser_device  *device = iser_conn->ib_conn->device;
-	int rx_data_size, err;
-	int posts, outstanding_unexp_pdus;
-
-	/* for the login sequence we must support rx of upto 8K; login is done
-	 * after conn create/bind (connect) and conn stop/bind (reconnect),
-	 * what's common for both schemes is that the connection is not started
-	 */
-	if (conn->c_stage != ISCSI_CONN_STARTED)
-		rx_data_size = ISCSI_DEF_MAX_RECV_SEG_LEN;
-	else /* FIXME till user space sets conn->max_recv_dlength correctly */
-		rx_data_size = 128;
-
-	outstanding_unexp_pdus =
-		atomic_xchg(&iser_conn->ib_conn->unexpected_pdu_count, 0);
-
-	/*
-	 * in addition to the response buffer, replace those consumed by
-	 * unexpected pdus.
-	 */
-	for (posts = 0; posts < 1 + outstanding_unexp_pdus; posts++) {
-		rx_desc = kmem_cache_alloc(ig.desc_cache, GFP_NOIO);
-		if (rx_desc == NULL) {
-			iser_err("Failed to alloc desc for post recv %d\n",
-				 posts);
-			err = -ENOMEM;
-			goto post_rx_cache_alloc_failure;
-		}
-		rx_desc->type = ISCSI_RX;
-		rx_desc->data = kmalloc(rx_data_size, GFP_NOIO);
-		if (rx_desc->data == NULL) {
-			iser_err("Failed to alloc data buf for post recv %d\n",
-				 posts);
-			err = -ENOMEM;
-			goto post_rx_kmalloc_failure;
-		}
-
-		recv_dto = &rx_desc->dto;
-		recv_dto->ib_conn = iser_conn->ib_conn;
-		recv_dto->regd_vector_len = 0;
-
-		regd_hdr = &rx_desc->hdr_regd_buf;
-		memset(regd_hdr, 0, sizeof(struct iser_regd_buf));
-		regd_hdr->device  = device;
-		regd_hdr->virt_addr  = rx_desc; /* == &rx_desc->iser_header */
-		regd_hdr->data_size  = ISER_TOTAL_HEADERS_LEN;
-
-		iser_reg_single(device, regd_hdr, DMA_FROM_DEVICE);
-
-		iser_dto_add_regd_buff(recv_dto, regd_hdr, 0, 0);
-
-		regd_data = &rx_desc->data_regd_buf;
-		memset(regd_data, 0, sizeof(struct iser_regd_buf));
-		regd_data->device  = device;
-		regd_data->virt_addr  = rx_desc->data;
-		regd_data->data_size  = rx_data_size;
-
-		iser_reg_single(device, regd_data, DMA_FROM_DEVICE);
-
-		iser_dto_add_regd_buff(recv_dto, regd_data, 0, 0);
-
-		err = iser_post_recv(rx_desc);
-		if (err) {
-			iser_err("Failed iser_post_recv for post %d\n", posts);
-			goto post_rx_post_recv_failure;
-		}
-	}
-	/* all posts successful */
-	return 0;
-
-post_rx_post_recv_failure:
-	iser_dto_buffs_release(recv_dto);
-	kfree(rx_desc->data);
-post_rx_kmalloc_failure:
-	kmem_cache_free(ig.desc_cache, rx_desc);
-post_rx_cache_alloc_failure:
-	if (posts > 0) {
-		/*
-		 * response buffer posted, but did not replace all unexpected
-		 * pdu recv bufs. Ignore error, retry occurs next send
-		 */
-		outstanding_unexp_pdus -= (posts - 1);
-		err = 0;
-	}
-	atomic_add(outstanding_unexp_pdus,
-		   &iser_conn->ib_conn->unexpected_pdu_count);
-
-	return err;
-}
-
 /* creates a new tx descriptor and adds header regd buffer */
-static void iser_create_send_desc(struct iscsi_iser_conn *iser_conn,
-				  struct iser_desc       *tx_desc)
+static void iser_create_send_desc(struct iser_conn	*ib_conn,
+				  struct iser_tx_desc	*tx_desc)
 {
-	struct iser_regd_buf *regd_hdr = &tx_desc->hdr_regd_buf;
-	struct iser_dto      *send_dto = &tx_desc->dto;
+	struct iser_device *device = ib_conn->device;
 
-	memset(regd_hdr, 0, sizeof(struct iser_regd_buf));
-	regd_hdr->device  = iser_conn->ib_conn->device;
-	regd_hdr->virt_addr  = tx_desc; /* == &tx_desc->iser_header */
-	regd_hdr->data_size  = ISER_TOTAL_HEADERS_LEN;
-
-	send_dto->ib_conn         = iser_conn->ib_conn;
-	send_dto->notify_enable   = 1;
-	send_dto->regd_vector_len = 0;
+	ib_dma_sync_single_for_cpu(device->ib_device,
+		tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE);
 
 	memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr));
 	tx_desc->iser_header.flags = ISER_VER;
 
-	iser_dto_add_regd_buff(send_dto, regd_hdr, 0, 0);
+	tx_desc->num_sge = 1;
+
+	if (tx_desc->tx_sg[0].lkey != device->mr->lkey) {
+		tx_desc->tx_sg[0].lkey = device->mr->lkey;
+		iser_dbg("sdesc %p lkey mismatch, fixing\n", tx_desc);
+	}
+}
+
+
+int iser_alloc_rx_descriptors(struct iser_conn *ib_conn)
+{
+	int i, j;
+	u64 dma_addr;
+	struct iser_rx_desc *rx_desc;
+	struct ib_sge       *rx_sg;
+	struct iser_device  *device = ib_conn->device;
+
+	ib_conn->rx_descs = kmalloc(ISER_QP_MAX_RECV_DTOS *
+				sizeof(struct iser_rx_desc), GFP_KERNEL);
+	if (!ib_conn->rx_descs)
+		goto rx_desc_alloc_fail;
+
+	rx_desc = ib_conn->rx_descs;
+
+	for (i = 0; i < ISER_QP_MAX_RECV_DTOS; i++, rx_desc++)  {
+		dma_addr = ib_dma_map_single(device->ib_device, (void *)rx_desc,
+					ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+		if (ib_dma_mapping_error(device->ib_device, dma_addr))
+			goto rx_desc_dma_map_failed;
+
+		rx_desc->dma_addr = dma_addr;
+
+		rx_sg = &rx_desc->rx_sg;
+		rx_sg->addr   = rx_desc->dma_addr;
+		rx_sg->length = ISER_RX_PAYLOAD_SIZE;
+		rx_sg->lkey   = device->mr->lkey;
+	}
+
+	ib_conn->rx_desc_head = 0;
+	return 0;
+
+rx_desc_dma_map_failed:
+	rx_desc = ib_conn->rx_descs;
+	for (j = 0; j < i; j++, rx_desc++)
+		ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr,
+			ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+	kfree(ib_conn->rx_descs);
+	ib_conn->rx_descs = NULL;
+rx_desc_alloc_fail:
+	iser_err("failed allocating rx descriptors / data buffers\n");
+	return -ENOMEM;
+}
+
+void iser_free_rx_descriptors(struct iser_conn *ib_conn)
+{
+	int i;
+	struct iser_rx_desc *rx_desc;
+	struct iser_device *device = ib_conn->device;
+
+	if (ib_conn->login_buf) {
+		ib_dma_unmap_single(device->ib_device, ib_conn->login_dma,
+			ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE);
+		kfree(ib_conn->login_buf);
+	}
+
+	if (!ib_conn->rx_descs)
+		return;
+
+	rx_desc = ib_conn->rx_descs;
+	for (i = 0; i < ISER_QP_MAX_RECV_DTOS; i++, rx_desc++)
+		ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr,
+			ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+	kfree(ib_conn->rx_descs);
 }
 
 /**
@@ -301,46 +243,23 @@
 {
 	struct iscsi_iser_conn *iser_conn = conn->dd_data;
 
-	int i;
-	/*
-	 * FIXME this value should be declared to the target during login with
-	 * the MaxOutstandingUnexpectedPDUs key when supported
-	 */
-	int initial_post_recv_bufs_num = ISER_MAX_RX_MISC_PDUS;
-
-	iser_dbg("Initially post: %d\n", initial_post_recv_bufs_num);
+	iser_dbg("Initially post: %d\n", ISER_MIN_POSTED_RX);
 
 	/* Check that there is no posted recv or send buffers left - */
 	/* they must be consumed during the login phase */
-	BUG_ON(atomic_read(&iser_conn->ib_conn->post_recv_buf_count) != 0);
+	BUG_ON(iser_conn->ib_conn->post_recv_buf_count != 0);
 	BUG_ON(atomic_read(&iser_conn->ib_conn->post_send_buf_count) != 0);
 
+	if (iser_alloc_rx_descriptors(iser_conn->ib_conn))
+		return -ENOMEM;
+
 	/* Initial post receive buffers */
-	for (i = 0; i < initial_post_recv_bufs_num; i++) {
-		if (iser_post_receive_control(conn) != 0) {
-			iser_err("Failed to post recv bufs at:%d conn:0x%p\n",
-				 i, conn);
-			return -ENOMEM;
-		}
-	}
-	iser_dbg("Posted %d post recv bufs, conn:0x%p\n", i, conn);
+	if (iser_post_recvm(iser_conn->ib_conn, ISER_MIN_POSTED_RX))
+		return -ENOMEM;
+
 	return 0;
 }
 
-static int
-iser_check_xmit(struct iscsi_conn *conn, void *task)
-{
-	struct iscsi_iser_conn *iser_conn = conn->dd_data;
-
-	if (atomic_read(&iser_conn->ib_conn->post_send_buf_count) ==
-	    ISER_QP_MAX_REQ_DTOS) {
-		iser_dbg("%ld can't xmit task %p\n",jiffies,task);
-		return -ENOBUFS;
-	}
-	return 0;
-}
-
-
 /**
  * iser_send_command - send command PDU
  */
@@ -349,27 +268,18 @@
 {
 	struct iscsi_iser_conn *iser_conn = conn->dd_data;
 	struct iscsi_iser_task *iser_task = task->dd_data;
-	struct iser_dto *send_dto = NULL;
 	unsigned long edtl;
-	int err = 0;
+	int err;
 	struct iser_data_buf *data_buf;
 	struct iscsi_cmd *hdr =  (struct iscsi_cmd *)task->hdr;
 	struct scsi_cmnd *sc  =  task->sc;
-
-	if (!iser_conn_state_comp(iser_conn->ib_conn, ISER_CONN_UP)) {
-		iser_err("Failed to send, conn: 0x%p is not up\n", iser_conn->ib_conn);
-		return -EPERM;
-	}
-	if (iser_check_xmit(conn, task))
-		return -ENOBUFS;
+	struct iser_tx_desc *tx_desc = &iser_task->desc;
 
 	edtl = ntohl(hdr->data_length);
 
 	/* build the tx desc regd header and add it to the tx desc dto */
-	iser_task->desc.type = ISCSI_TX_SCSI_COMMAND;
-	send_dto = &iser_task->desc.dto;
-	send_dto->task = iser_task;
-	iser_create_send_desc(iser_conn, &iser_task->desc);
+	tx_desc->type = ISCSI_TX_SCSI_COMMAND;
+	iser_create_send_desc(iser_conn->ib_conn, tx_desc);
 
 	if (hdr->flags & ISCSI_FLAG_CMD_READ)
 		data_buf = &iser_task->data[ISER_DIR_IN];
@@ -398,23 +308,13 @@
 			goto send_command_error;
 	}
 
-	iser_reg_single(iser_conn->ib_conn->device,
-			send_dto->regd[0], DMA_TO_DEVICE);
-
-	if (iser_post_receive_control(conn) != 0) {
-		iser_err("post_recv failed!\n");
-		err = -ENOMEM;
-		goto send_command_error;
-	}
-
 	iser_task->status = ISER_TASK_STATUS_STARTED;
 
-	err = iser_post_send(&iser_task->desc);
+	err = iser_post_send(iser_conn->ib_conn, tx_desc);
 	if (!err)
 		return 0;
 
 send_command_error:
-	iser_dto_buffs_release(send_dto);
 	iser_err("conn %p failed task->itt %d err %d\n",conn, task->itt, err);
 	return err;
 }
@@ -428,20 +328,13 @@
 {
 	struct iscsi_iser_conn *iser_conn = conn->dd_data;
 	struct iscsi_iser_task *iser_task = task->dd_data;
-	struct iser_desc *tx_desc = NULL;
-	struct iser_dto *send_dto = NULL;
+	struct iser_tx_desc *tx_desc = NULL;
+	struct iser_regd_buf *regd_buf;
 	unsigned long buf_offset;
 	unsigned long data_seg_len;
 	uint32_t itt;
 	int err = 0;
-
-	if (!iser_conn_state_comp(iser_conn->ib_conn, ISER_CONN_UP)) {
-		iser_err("Failed to send, conn: 0x%p is not up\n", iser_conn->ib_conn);
-		return -EPERM;
-	}
-
-	if (iser_check_xmit(conn, task))
-		return -ENOBUFS;
+	struct ib_sge *tx_dsg;
 
 	itt = (__force uint32_t)hdr->itt;
 	data_seg_len = ntoh24(hdr->dlength);
@@ -450,28 +343,25 @@
 	iser_dbg("%s itt %d dseg_len %d offset %d\n",
 		 __func__,(int)itt,(int)data_seg_len,(int)buf_offset);
 
-	tx_desc = kmem_cache_alloc(ig.desc_cache, GFP_NOIO);
+	tx_desc = kmem_cache_zalloc(ig.desc_cache, GFP_ATOMIC);
 	if (tx_desc == NULL) {
 		iser_err("Failed to alloc desc for post dataout\n");
 		return -ENOMEM;
 	}
 
 	tx_desc->type = ISCSI_TX_DATAOUT;
+	tx_desc->iser_header.flags = ISER_VER;
 	memcpy(&tx_desc->iscsi_header, hdr, sizeof(struct iscsi_hdr));
 
-	/* build the tx desc regd header and add it to the tx desc dto */
-	send_dto = &tx_desc->dto;
-	send_dto->task = iser_task;
-	iser_create_send_desc(iser_conn, tx_desc);
+	/* build the tx desc */
+	iser_initialize_task_headers(task, tx_desc);
 
-	iser_reg_single(iser_conn->ib_conn->device,
-			send_dto->regd[0], DMA_TO_DEVICE);
-
-	/* all data was registered for RDMA, we can use the lkey */
-	iser_dto_add_regd_buff(send_dto,
-			       &iser_task->rdma_regd[ISER_DIR_OUT],
-			       buf_offset,
-			       data_seg_len);
+	regd_buf = &iser_task->rdma_regd[ISER_DIR_OUT];
+	tx_dsg = &tx_desc->tx_sg[1];
+	tx_dsg->addr    = regd_buf->reg.va + buf_offset;
+	tx_dsg->length  = data_seg_len;
+	tx_dsg->lkey    = regd_buf->reg.lkey;
+	tx_desc->num_sge = 2;
 
 	if (buf_offset + data_seg_len > iser_task->data[ISER_DIR_OUT].data_len) {
 		iser_err("Offset:%ld & DSL:%ld in Data-Out "
@@ -485,12 +375,11 @@
 		 itt, buf_offset, data_seg_len);
 
 
-	err = iser_post_send(tx_desc);
+	err = iser_post_send(iser_conn->ib_conn, tx_desc);
 	if (!err)
 		return 0;
 
 send_data_out_error:
-	iser_dto_buffs_release(send_dto);
 	kmem_cache_free(ig.desc_cache, tx_desc);
 	iser_err("conn %p failed err %d\n",conn, err);
 	return err;
@@ -501,64 +390,44 @@
 {
 	struct iscsi_iser_conn *iser_conn = conn->dd_data;
 	struct iscsi_iser_task *iser_task = task->dd_data;
-	struct iser_desc *mdesc = &iser_task->desc;
-	struct iser_dto *send_dto = NULL;
+	struct iser_tx_desc *mdesc = &iser_task->desc;
 	unsigned long data_seg_len;
 	int err = 0;
-	struct iser_regd_buf *regd_buf;
 	struct iser_device *device;
-	unsigned char opcode;
-
-	if (!iser_conn_state_comp(iser_conn->ib_conn, ISER_CONN_UP)) {
-		iser_err("Failed to send, conn: 0x%p is not up\n", iser_conn->ib_conn);
-		return -EPERM;
-	}
-
-	if (iser_check_xmit(conn, task))
-		return -ENOBUFS;
 
 	/* build the tx desc regd header and add it to the tx desc dto */
 	mdesc->type = ISCSI_TX_CONTROL;
-	send_dto = &mdesc->dto;
-	send_dto->task = NULL;
-	iser_create_send_desc(iser_conn, mdesc);
+	iser_create_send_desc(iser_conn->ib_conn, mdesc);
 
 	device = iser_conn->ib_conn->device;
 
-	iser_reg_single(device, send_dto->regd[0], DMA_TO_DEVICE);
-
 	data_seg_len = ntoh24(task->hdr->dlength);
 
 	if (data_seg_len > 0) {
-		regd_buf = &mdesc->data_regd_buf;
-		memset(regd_buf, 0, sizeof(struct iser_regd_buf));
-		regd_buf->device = device;
-		regd_buf->virt_addr = task->data;
-		regd_buf->data_size = task->data_count;
-		iser_reg_single(device, regd_buf,
-				DMA_TO_DEVICE);
-		iser_dto_add_regd_buff(send_dto, regd_buf,
-				       0,
-				       data_seg_len);
-	}
-
-	opcode = task->hdr->opcode & ISCSI_OPCODE_MASK;
-
-	/* post recv buffer for response if one is expected */
-	if (!(opcode == ISCSI_OP_NOOP_OUT && task->hdr->itt == RESERVED_ITT)) {
-		if (iser_post_receive_control(conn) != 0) {
-			iser_err("post_rcv_buff failed!\n");
-			err = -ENOMEM;
+		struct ib_sge *tx_dsg = &mdesc->tx_sg[1];
+		if (task != conn->login_task) {
+			iser_err("data present on non login task!!!\n");
 			goto send_control_error;
 		}
+		memcpy(iser_conn->ib_conn->login_buf, task->data,
+							task->data_count);
+		tx_dsg->addr    = iser_conn->ib_conn->login_dma;
+		tx_dsg->length  = data_seg_len;
+		tx_dsg->lkey    = device->mr->lkey;
+		mdesc->num_sge = 2;
 	}
 
-	err = iser_post_send(mdesc);
+	if (task == conn->login_task) {
+		err = iser_post_recvl(iser_conn->ib_conn);
+		if (err)
+			goto send_control_error;
+	}
+
+	err = iser_post_send(iser_conn->ib_conn, mdesc);
 	if (!err)
 		return 0;
 
 send_control_error:
-	iser_dto_buffs_release(send_dto);
 	iser_err("conn %p failed err %d\n",conn, err);
 	return err;
 }
@@ -566,104 +435,71 @@
 /**
  * iser_rcv_dto_completion - recv DTO completion
  */
-void iser_rcv_completion(struct iser_desc *rx_desc,
-			 unsigned long dto_xfer_len)
+void iser_rcv_completion(struct iser_rx_desc *rx_desc,
+			 unsigned long rx_xfer_len,
+			 struct iser_conn *ib_conn)
 {
-	struct iser_dto *dto = &rx_desc->dto;
-	struct iscsi_iser_conn *conn = dto->ib_conn->iser_conn;
-	struct iscsi_task *task;
-	struct iscsi_iser_task *iser_task;
+	struct iscsi_iser_conn *conn = ib_conn->iser_conn;
 	struct iscsi_hdr *hdr;
-	char   *rx_data = NULL;
-	int     rx_data_len = 0;
-	unsigned char opcode;
+	u64 rx_dma;
+	int rx_buflen, outstanding, count, err;
+
+	/* differentiate the login buffer from regular rx descriptors */
+	if ((char *)rx_desc == ib_conn->login_buf) {
+		rx_dma = ib_conn->login_dma;
+		rx_buflen = ISER_RX_LOGIN_SIZE;
+	} else {
+		rx_dma = rx_desc->dma_addr;
+		rx_buflen = ISER_RX_PAYLOAD_SIZE;
+	}
+
+	ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, rx_dma,
+			rx_buflen, DMA_FROM_DEVICE);
 
 	hdr = &rx_desc->iscsi_header;
 
-	iser_dbg("op 0x%x itt 0x%x\n", hdr->opcode,hdr->itt);
+	iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode,
+			hdr->itt, (int)(rx_xfer_len - ISER_HEADERS_LEN));
 
-	if (dto_xfer_len > ISER_TOTAL_HEADERS_LEN) { /* we have data */
-		rx_data_len = dto_xfer_len - ISER_TOTAL_HEADERS_LEN;
-		rx_data     = dto->regd[1]->virt_addr;
-		rx_data    += dto->offset[1];
-	}
+	iscsi_iser_recv(conn->iscsi_conn, hdr,
+		rx_desc->data, rx_xfer_len - ISER_HEADERS_LEN);
 
-	opcode = hdr->opcode & ISCSI_OPCODE_MASK;
-
-	if (opcode == ISCSI_OP_SCSI_CMD_RSP) {
-		spin_lock(&conn->iscsi_conn->session->lock);
-		task = iscsi_itt_to_ctask(conn->iscsi_conn, hdr->itt);
-		if (task)
-			__iscsi_get_task(task);
-		spin_unlock(&conn->iscsi_conn->session->lock);
-
-		if (!task)
-			iser_err("itt can't be matched to task!!! "
-				 "conn %p opcode %d itt %d\n",
-				 conn->iscsi_conn, opcode, hdr->itt);
-		else {
-			iser_task = task->dd_data;
-			iser_dbg("itt %d task %p\n",hdr->itt, task);
-			iser_task->status = ISER_TASK_STATUS_COMPLETED;
-			iser_task_rdma_finalize(iser_task);
-			iscsi_put_task(task);
-		}
-	}
-	iser_dto_buffs_release(dto);
-
-	iscsi_iser_recv(conn->iscsi_conn, hdr, rx_data, rx_data_len);
-
-	kfree(rx_desc->data);
-	kmem_cache_free(ig.desc_cache, rx_desc);
+	ib_dma_sync_single_for_device(ib_conn->device->ib_device, rx_dma,
+			rx_buflen, DMA_FROM_DEVICE);
 
 	/* decrementing conn->post_recv_buf_count only --after-- freeing the   *
 	 * task eliminates the need to worry on tasks which are completed in   *
 	 * parallel to the execution of iser_conn_term. So the code that waits *
 	 * for the posted rx bufs refcount to become zero handles everything   */
-	atomic_dec(&conn->ib_conn->post_recv_buf_count);
+	conn->ib_conn->post_recv_buf_count--;
 
-	/*
-	 * if an unexpected PDU was received then the recv wr consumed must
-	 * be replaced, this is done in the next send of a control-type PDU
-	 */
-	if (opcode == ISCSI_OP_NOOP_IN && hdr->itt == RESERVED_ITT) {
-		/* nop-in with itt = 0xffffffff */
-		atomic_inc(&conn->ib_conn->unexpected_pdu_count);
+	if (rx_dma == ib_conn->login_dma)
+		return;
+
+	outstanding = ib_conn->post_recv_buf_count;
+	if (outstanding + ISER_MIN_POSTED_RX <= ISER_QP_MAX_RECV_DTOS) {
+		count = min(ISER_QP_MAX_RECV_DTOS - outstanding,
+						ISER_MIN_POSTED_RX);
+		err = iser_post_recvm(ib_conn, count);
+		if (err)
+			iser_err("posting %d rx bufs err %d\n", count, err);
 	}
-	else if (opcode == ISCSI_OP_ASYNC_EVENT) {
-		/* asyncronous message */
-		atomic_inc(&conn->ib_conn->unexpected_pdu_count);
-	}
-	/* a reject PDU consumes the recv buf posted for the response */
 }
 
-void iser_snd_completion(struct iser_desc *tx_desc)
+void iser_snd_completion(struct iser_tx_desc *tx_desc,
+			struct iser_conn *ib_conn)
 {
-	struct iser_dto        *dto = &tx_desc->dto;
-	struct iser_conn       *ib_conn = dto->ib_conn;
-	struct iscsi_iser_conn *iser_conn = ib_conn->iser_conn;
-	struct iscsi_conn      *conn = iser_conn->iscsi_conn;
 	struct iscsi_task *task;
-	int resume_tx = 0;
+	struct iser_device *device = ib_conn->device;
 
-	iser_dbg("Initiator, Data sent dto=0x%p\n", dto);
-
-	iser_dto_buffs_release(dto);
-
-	if (tx_desc->type == ISCSI_TX_DATAOUT)
+	if (tx_desc->type == ISCSI_TX_DATAOUT) {
+		ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr,
+					ISER_HEADERS_LEN, DMA_TO_DEVICE);
 		kmem_cache_free(ig.desc_cache, tx_desc);
-
-	if (atomic_read(&iser_conn->ib_conn->post_send_buf_count) ==
-	    ISER_QP_MAX_REQ_DTOS)
-		resume_tx = 1;
+	}
 
 	atomic_dec(&ib_conn->post_send_buf_count);
 
-	if (resume_tx) {
-		iser_dbg("%ld resuming tx\n",jiffies);
-		iscsi_conn_queue_work(conn);
-	}
-
 	if (tx_desc->type == ISCSI_TX_CONTROL) {
 		/* this arithmetic is legal by libiscsi dd_data allocation */
 		task = (void *) ((long)(void *)tx_desc -
@@ -692,7 +528,6 @@
 
 void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task)
 {
-	int deferred;
 	int is_rdma_aligned = 1;
 	struct iser_regd_buf *regd;
 
@@ -710,32 +545,17 @@
 
 	if (iser_task->dir[ISER_DIR_IN]) {
 		regd = &iser_task->rdma_regd[ISER_DIR_IN];
-		deferred = iser_regd_buff_release(regd);
-		if (deferred) {
-			iser_err("%d references remain for BUF-IN rdma reg\n",
-				 atomic_read(&regd->ref_count));
-		}
+		if (regd->reg.is_fmr)
+			iser_unreg_mem(&regd->reg);
 	}
 
 	if (iser_task->dir[ISER_DIR_OUT]) {
 		regd = &iser_task->rdma_regd[ISER_DIR_OUT];
-		deferred = iser_regd_buff_release(regd);
-		if (deferred) {
-			iser_err("%d references remain for BUF-OUT rdma reg\n",
-				 atomic_read(&regd->ref_count));
-		}
+		if (regd->reg.is_fmr)
+			iser_unreg_mem(&regd->reg);
 	}
 
        /* if the data was unaligned, it was already unmapped and then copied */
        if (is_rdma_aligned)
 		iser_dma_unmap_task_data(iser_task);
 }
-
-void iser_dto_buffs_release(struct iser_dto *dto)
-{
-	int i;
-
-	for (i = 0; i < dto->regd_vector_len; i++)
-		iser_regd_buff_release(dto->regd[i]);
-}
-
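The replenish logic in iser_rcv_completion() above posts receive buffers in batches: nothing is reposted until at least ISER_MIN_POSTED_RX slots have drained, and at most ISER_MIN_POSTED_RX are posted per call, which amortizes ib_post_recv() over many completions. A small standalone sketch of that rule with illustrative numbers (64-deep ring, batch of 16, assuming the default ISCSI_DEF_XMIT_CMDS_MAX); not driver code:

/*
 * Standalone sketch of the rx replenish rule above; MAX_DTOS and
 * MIN_POSTED mirror ISER_QP_MAX_RECV_DTOS and ISER_MIN_POSTED_RX under
 * the default ISCSI_DEF_XMIT_CMDS_MAX of 64.
 */
#include <stdio.h>

#define MAX_DTOS	64
#define MIN_POSTED	16

static int replenish_count(int outstanding)
{
	int free_slots = MAX_DTOS - outstanding;

	/* Repost only once a full batch of MIN_POSTED slots has drained. */
	if (outstanding + MIN_POSTED > MAX_DTOS)
		return 0;

	/* Mirrors the driver's min(); in this branch free_slots is always
	 * at least MIN_POSTED, so a call never posts more than one batch. */
	return free_slots < MIN_POSTED ? free_slots : MIN_POSTED;
}

int main(void)
{
	printf("%d\n", replenish_count(60));	/* 0: fewer than 16 drained */
	printf("%d\n", replenish_count(48));	/* 16: exactly one batch    */
	printf("%d\n", replenish_count(30));	/* 16: still one batch      */
	return 0;
}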
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index 274c883..fb88d68 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -41,62 +41,6 @@
 #define ISER_KMALLOC_THRESHOLD 0x20000 /* 128K - kmalloc limit */
 
 /**
- * Decrements the reference count for the
- * registered buffer & releases it
- *
- * returns 0 if released, 1 if deferred
- */
-int iser_regd_buff_release(struct iser_regd_buf *regd_buf)
-{
-	struct ib_device *dev;
-
-	if ((atomic_read(&regd_buf->ref_count) == 0) ||
-	    atomic_dec_and_test(&regd_buf->ref_count)) {
-		/* if we used the dma mr, unreg is just NOP */
-		if (regd_buf->reg.is_fmr)
-			iser_unreg_mem(&regd_buf->reg);
-
-		if (regd_buf->dma_addr) {
-			dev = regd_buf->device->ib_device;
-			ib_dma_unmap_single(dev,
-					 regd_buf->dma_addr,
-					 regd_buf->data_size,
-					 regd_buf->direction);
-		}
-		/* else this regd buf is associated with task which we */
-		/* dma_unmap_single/sg later */
-		return 0;
-	} else {
-		iser_dbg("Release deferred, regd.buff: 0x%p\n", regd_buf);
-		return 1;
-	}
-}
-
-/**
- * iser_reg_single - fills registered buffer descriptor with
- *		     registration information
- */
-void iser_reg_single(struct iser_device *device,
-		     struct iser_regd_buf *regd_buf,
-		     enum dma_data_direction direction)
-{
-	u64 dma_addr;
-
-	dma_addr = ib_dma_map_single(device->ib_device,
-				     regd_buf->virt_addr,
-				     regd_buf->data_size, direction);
-	BUG_ON(ib_dma_mapping_error(device->ib_device, dma_addr));
-
-	regd_buf->reg.lkey = device->mr->lkey;
-	regd_buf->reg.len  = regd_buf->data_size;
-	regd_buf->reg.va   = dma_addr;
-	regd_buf->reg.is_fmr = 0;
-
-	regd_buf->dma_addr  = dma_addr;
-	regd_buf->direction = direction;
-}
-
-/**
  * iser_start_rdma_unaligned_sg
  */
 static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
@@ -109,10 +53,10 @@
 	unsigned long  cmd_data_len = data->data_len;
 
 	if (cmd_data_len > ISER_KMALLOC_THRESHOLD)
-		mem = (void *)__get_free_pages(GFP_NOIO,
+		mem = (void *)__get_free_pages(GFP_ATOMIC,
 		      ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT);
 	else
-		mem = kmalloc(cmd_data_len, GFP_NOIO);
+		mem = kmalloc(cmd_data_len, GFP_ATOMIC);
 
 	if (mem == NULL) {
 		iser_err("Failed to allocate mem size %d %d for copying sglist\n",
@@ -474,9 +418,5 @@
 			return err;
 		}
 	}
-
-	/* take a reference on this regd buf such that it will not be released *
-	 * (eg in send dto completion) before we get the scsi response         */
-	atomic_inc(&regd_buf->ref_count);
 	return 0;
 }
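For unaligned SG lists larger than ISER_KMALLOC_THRESHOLD, the bounce buffer above is sized by rounding the transfer length up to a power of two and converting that to a page order. A worked example of the arithmetic (illustrative length, 4 KiB pages assumed):

#include <linux/gfp.h>
#include <linux/log2.h>

/*
 * Illustrative only, not part of the patch: the page-order math used for
 * large bounce buffers, shown with a concrete length and PAGE_SHIFT == 12.
 */
static void *iser_bounce_alloc_example(void)
{
	unsigned long cmd_data_len = 200 * 1024;	/* 200 KiB transfer */
	unsigned int order;

	/* roundup_pow_of_two(200 KiB) = 256 KiB = 2^18, so the order is
	 * 18 - 12 = 6, i.e. 2^6 contiguous pages = 256 KiB. */
	order = ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT;

	return (void *)__get_free_pages(GFP_ATOMIC, order);
}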
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 8579f32..308d17b 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -37,9 +37,8 @@
 #include "iscsi_iser.h"
 
 #define ISCSI_ISER_MAX_CONN	8
-#define ISER_MAX_CQ_LEN		((ISER_QP_MAX_RECV_DTOS + \
-				ISER_QP_MAX_REQ_DTOS) *   \
-				 ISCSI_ISER_MAX_CONN)
+#define ISER_MAX_RX_CQ_LEN	(ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN)
+#define ISER_MAX_TX_CQ_LEN	(ISER_QP_MAX_REQ_DTOS  * ISCSI_ISER_MAX_CONN)
 
 static void iser_cq_tasklet_fn(unsigned long data);
 static void iser_cq_callback(struct ib_cq *cq, void *cq_context);
@@ -67,15 +66,23 @@
 	if (IS_ERR(device->pd))
 		goto pd_err;
 
-	device->cq = ib_create_cq(device->ib_device,
+	device->rx_cq = ib_create_cq(device->ib_device,
 				  iser_cq_callback,
 				  iser_cq_event_callback,
 				  (void *)device,
-				  ISER_MAX_CQ_LEN, 0);
-	if (IS_ERR(device->cq))
-		goto cq_err;
+				  ISER_MAX_RX_CQ_LEN, 0);
+	if (IS_ERR(device->rx_cq))
+		goto rx_cq_err;
 
-	if (ib_req_notify_cq(device->cq, IB_CQ_NEXT_COMP))
+	device->tx_cq = ib_create_cq(device->ib_device,
+				  NULL, iser_cq_event_callback,
+				  (void *)device,
+				  ISER_MAX_TX_CQ_LEN, 0);
+
+	if (IS_ERR(device->tx_cq))
+		goto tx_cq_err;
+
+	if (ib_req_notify_cq(device->rx_cq, IB_CQ_NEXT_COMP))
 		goto cq_arm_err;
 
 	tasklet_init(&device->cq_tasklet,
@@ -93,8 +100,10 @@
 dma_mr_err:
 	tasklet_kill(&device->cq_tasklet);
 cq_arm_err:
-	ib_destroy_cq(device->cq);
-cq_err:
+	ib_destroy_cq(device->tx_cq);
+tx_cq_err:
+	ib_destroy_cq(device->rx_cq);
+rx_cq_err:
 	ib_dealloc_pd(device->pd);
 pd_err:
 	iser_err("failed to allocate an IB resource\n");
@@ -112,11 +121,13 @@
 	tasklet_kill(&device->cq_tasklet);
 
 	(void)ib_dereg_mr(device->mr);
-	(void)ib_destroy_cq(device->cq);
+	(void)ib_destroy_cq(device->tx_cq);
+	(void)ib_destroy_cq(device->rx_cq);
 	(void)ib_dealloc_pd(device->pd);
 
 	device->mr = NULL;
-	device->cq = NULL;
+	device->tx_cq = NULL;
+	device->rx_cq = NULL;
 	device->pd = NULL;
 }
 
@@ -129,13 +140,23 @@
 {
 	struct iser_device	*device;
 	struct ib_qp_init_attr	init_attr;
-	int			ret;
+	int			ret = -ENOMEM;
 	struct ib_fmr_pool_param params;
 
 	BUG_ON(ib_conn->device == NULL);
 
 	device = ib_conn->device;
 
+	ib_conn->login_buf = kmalloc(ISER_RX_LOGIN_SIZE, GFP_KERNEL);
+	if (!ib_conn->login_buf) {
+		goto alloc_err;
+		ret = -ENOMEM;
+		goto alloc_err;
+
+	ib_conn->login_dma = ib_dma_map_single(ib_conn->device->ib_device,
+				(void *)ib_conn->login_buf, ISER_RX_LOGIN_SIZE,
+				DMA_FROM_DEVICE);
+
 	ib_conn->page_vec = kmalloc(sizeof(struct iser_page_vec) +
 				    (sizeof(u64) * (ISCSI_ISER_SG_TABLESIZE +1)),
 				    GFP_KERNEL);
@@ -169,12 +190,12 @@
 
 	init_attr.event_handler = iser_qp_event_callback;
 	init_attr.qp_context	= (void *)ib_conn;
-	init_attr.send_cq	= device->cq;
-	init_attr.recv_cq	= device->cq;
+	init_attr.send_cq	= device->tx_cq;
+	init_attr.recv_cq	= device->rx_cq;
 	init_attr.cap.max_send_wr  = ISER_QP_MAX_REQ_DTOS;
 	init_attr.cap.max_recv_wr  = ISER_QP_MAX_RECV_DTOS;
-	init_attr.cap.max_send_sge = MAX_REGD_BUF_VECTOR_LEN;
-	init_attr.cap.max_recv_sge = 2;
+	init_attr.cap.max_send_sge = 2;
+	init_attr.cap.max_recv_sge = 1;
 	init_attr.sq_sig_type	= IB_SIGNAL_REQ_WR;
 	init_attr.qp_type	= IB_QPT_RC;
 
@@ -192,6 +213,7 @@
 	(void)ib_destroy_fmr_pool(ib_conn->fmr_pool);
 fmr_pool_err:
 	kfree(ib_conn->page_vec);
+	kfree(ib_conn->login_buf);
 alloc_err:
 	iser_err("unable to alloc mem or create resource, err %d\n", ret);
 	return ret;
@@ -278,17 +300,6 @@
 	mutex_unlock(&ig.device_list_mutex);
 }
 
-int iser_conn_state_comp(struct iser_conn *ib_conn,
-			enum iser_ib_conn_state comp)
-{
-	int ret;
-
-	spin_lock_bh(&ib_conn->lock);
-	ret = (ib_conn->state == comp);
-	spin_unlock_bh(&ib_conn->lock);
-	return ret;
-}
-
 static int iser_conn_state_comp_exch(struct iser_conn *ib_conn,
 				     enum iser_ib_conn_state comp,
 				     enum iser_ib_conn_state exch)
@@ -314,7 +325,7 @@
 	mutex_lock(&ig.connlist_mutex);
 	list_del(&ib_conn->conn_list);
 	mutex_unlock(&ig.connlist_mutex);
-
+	iser_free_rx_descriptors(ib_conn);
 	iser_free_ib_conn_res(ib_conn);
 	ib_conn->device = NULL;
 	/* on EVENT_ADDR_ERROR there's no device yet for this conn */
@@ -442,7 +453,7 @@
 				   ISCSI_ERR_CONN_FAILED);
 
 	/* Complete the termination process if no posts are pending */
-	if ((atomic_read(&ib_conn->post_recv_buf_count) == 0) &&
+	if (ib_conn->post_recv_buf_count == 0 &&
 	    (atomic_read(&ib_conn->post_send_buf_count) == 0)) {
 		ib_conn->state = ISER_CONN_DOWN;
 		wake_up_interruptible(&ib_conn->wait);
@@ -489,9 +500,8 @@
 {
 	ib_conn->state = ISER_CONN_INIT;
 	init_waitqueue_head(&ib_conn->wait);
-	atomic_set(&ib_conn->post_recv_buf_count, 0);
+	ib_conn->post_recv_buf_count = 0;
 	atomic_set(&ib_conn->post_send_buf_count, 0);
-	atomic_set(&ib_conn->unexpected_pdu_count, 0);
 	atomic_set(&ib_conn->refcount, 1);
 	INIT_LIST_HEAD(&ib_conn->conn_list);
 	spin_lock_init(&ib_conn->lock);
@@ -626,136 +636,97 @@
 	reg->mem_h = NULL;
 }
 
-/**
- * iser_dto_to_iov - builds IOV from a dto descriptor
- */
-static void iser_dto_to_iov(struct iser_dto *dto, struct ib_sge *iov, int iov_len)
+int iser_post_recvl(struct iser_conn *ib_conn)
 {
-	int		     i;
-	struct ib_sge	     *sge;
-	struct iser_regd_buf *regd_buf;
+	struct ib_recv_wr rx_wr, *rx_wr_failed;
+	struct ib_sge	  sge;
+	int ib_ret;
 
-	if (dto->regd_vector_len > iov_len) {
-		iser_err("iov size %d too small for posting dto of len %d\n",
-			 iov_len, dto->regd_vector_len);
-		BUG();
-	}
+	sge.addr   = ib_conn->login_dma;
+	sge.length = ISER_RX_LOGIN_SIZE;
+	sge.lkey   = ib_conn->device->mr->lkey;
 
-	for (i = 0; i < dto->regd_vector_len; i++) {
-		sge	    = &iov[i];
-		regd_buf  = dto->regd[i];
+	rx_wr.wr_id   = (unsigned long)ib_conn->login_buf;
+	rx_wr.sg_list = &sge;
+	rx_wr.num_sge = 1;
+	rx_wr.next    = NULL;
 
-		sge->addr   = regd_buf->reg.va;
-		sge->length = regd_buf->reg.len;
-		sge->lkey   = regd_buf->reg.lkey;
-
-		if (dto->used_sz[i] > 0)  /* Adjust size */
-			sge->length = dto->used_sz[i];
-
-		/* offset and length should not exceed the regd buf length */
-		if (sge->length + dto->offset[i] > regd_buf->reg.len) {
-			iser_err("Used len:%ld + offset:%d, exceed reg.buf.len:"
-				 "%ld in dto:0x%p [%d], va:0x%08lX\n",
-				 (unsigned long)sge->length, dto->offset[i],
-				 (unsigned long)regd_buf->reg.len, dto, i,
-				 (unsigned long)sge->addr);
-			BUG();
-		}
-
-		sge->addr += dto->offset[i]; /* Adjust offset */
-	}
-}
-
-/**
- * iser_post_recv - Posts a receive buffer.
- *
- * returns 0 on success, -1 on failure
- */
-int iser_post_recv(struct iser_desc *rx_desc)
-{
-	int		  ib_ret, ret_val = 0;
-	struct ib_recv_wr recv_wr, *recv_wr_failed;
-	struct ib_sge	  iov[2];
-	struct iser_conn  *ib_conn;
-	struct iser_dto   *recv_dto = &rx_desc->dto;
-
-	/* Retrieve conn */
-	ib_conn = recv_dto->ib_conn;
-
-	iser_dto_to_iov(recv_dto, iov, 2);
-
-	recv_wr.next	= NULL;
-	recv_wr.sg_list = iov;
-	recv_wr.num_sge = recv_dto->regd_vector_len;
-	recv_wr.wr_id	= (unsigned long)rx_desc;
-
-	atomic_inc(&ib_conn->post_recv_buf_count);
-	ib_ret	= ib_post_recv(ib_conn->qp, &recv_wr, &recv_wr_failed);
+	ib_conn->post_recv_buf_count++;
+	ib_ret	= ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed);
 	if (ib_ret) {
 		iser_err("ib_post_recv failed ret=%d\n", ib_ret);
-		atomic_dec(&ib_conn->post_recv_buf_count);
-		ret_val = -1;
+		ib_conn->post_recv_buf_count--;
+	}
+	return ib_ret;
+}
+
+int iser_post_recvm(struct iser_conn *ib_conn, int count)
+{
+	struct ib_recv_wr *rx_wr, *rx_wr_failed;
+	int i, ib_ret;
+	unsigned int my_rx_head = ib_conn->rx_desc_head;
+	struct iser_rx_desc *rx_desc;
+
+	for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) {
+		rx_desc		= &ib_conn->rx_descs[my_rx_head];
+		rx_wr->wr_id	= (unsigned long)rx_desc;
+		rx_wr->sg_list	= &rx_desc->rx_sg;
+		rx_wr->num_sge	= 1;
+		rx_wr->next	= rx_wr + 1;
+		my_rx_head = (my_rx_head + 1) & (ISER_QP_MAX_RECV_DTOS - 1);
 	}
 
-	return ret_val;
+	rx_wr--;
+	rx_wr->next = NULL; /* mark end of work requests list */
+
+	ib_conn->post_recv_buf_count += count;
+	ib_ret	= ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed);
+	if (ib_ret) {
+		iser_err("ib_post_recv failed ret=%d\n", ib_ret);
+		ib_conn->post_recv_buf_count -= count;
+	} else
+		ib_conn->rx_desc_head = my_rx_head;
+	return ib_ret;
 }
 
+
 /**
  * iser_start_send - Initiate a Send DTO operation
  *
  * returns 0 on success, -1 on failure
  */
-int iser_post_send(struct iser_desc *tx_desc)
+int iser_post_send(struct iser_conn *ib_conn, struct iser_tx_desc *tx_desc)
 {
-	int		  ib_ret, ret_val = 0;
+	int		  ib_ret;
 	struct ib_send_wr send_wr, *send_wr_failed;
-	struct ib_sge	  iov[MAX_REGD_BUF_VECTOR_LEN];
-	struct iser_conn  *ib_conn;
-	struct iser_dto   *dto = &tx_desc->dto;
 
-	ib_conn = dto->ib_conn;
-
-	iser_dto_to_iov(dto, iov, MAX_REGD_BUF_VECTOR_LEN);
+	ib_dma_sync_single_for_device(ib_conn->device->ib_device,
+		tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE);
 
 	send_wr.next	   = NULL;
 	send_wr.wr_id	   = (unsigned long)tx_desc;
-	send_wr.sg_list	   = iov;
-	send_wr.num_sge	   = dto->regd_vector_len;
+	send_wr.sg_list	   = tx_desc->tx_sg;
+	send_wr.num_sge	   = tx_desc->num_sge;
 	send_wr.opcode	   = IB_WR_SEND;
-	send_wr.send_flags = dto->notify_enable ? IB_SEND_SIGNALED : 0;
+	send_wr.send_flags = IB_SEND_SIGNALED;
 
 	atomic_inc(&ib_conn->post_send_buf_count);
 
 	ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed);
 	if (ib_ret) {
-		iser_err("Failed to start SEND DTO, dto: 0x%p, IOV len: %d\n",
-			 dto, dto->regd_vector_len);
 		iser_err("ib_post_send failed, ret:%d\n", ib_ret);
 		atomic_dec(&ib_conn->post_send_buf_count);
-		ret_val = -1;
 	}
-
-	return ret_val;
+	return ib_ret;
 }
 
-static void iser_handle_comp_error(struct iser_desc *desc)
+static void iser_handle_comp_error(struct iser_tx_desc *desc,
+				struct iser_conn *ib_conn)
 {
-	struct iser_dto  *dto     = &desc->dto;
-	struct iser_conn *ib_conn = dto->ib_conn;
-
-	iser_dto_buffs_release(dto);
-
-	if (desc->type == ISCSI_RX) {
-		kfree(desc->data);
+	if (desc && desc->type == ISCSI_TX_DATAOUT)
 		kmem_cache_free(ig.desc_cache, desc);
-		atomic_dec(&ib_conn->post_recv_buf_count);
-	} else { /* type is TX control/command/dataout */
-		if (desc->type == ISCSI_TX_DATAOUT)
-			kmem_cache_free(ig.desc_cache, desc);
-		atomic_dec(&ib_conn->post_send_buf_count);
-	}
 
-	if (atomic_read(&ib_conn->post_recv_buf_count) == 0 &&
+	if (ib_conn->post_recv_buf_count == 0 &&
 	    atomic_read(&ib_conn->post_send_buf_count) == 0) {
 		/* getting here when the state is UP means that the conn is *
 		 * being terminated asynchronously from the iSCSI layer's   *
@@ -774,32 +745,74 @@
 	}
 }
 
+static int iser_drain_tx_cq(struct iser_device  *device)
+{
+	struct ib_cq  *cq = device->tx_cq;
+	struct ib_wc  wc;
+	struct iser_tx_desc *tx_desc;
+	struct iser_conn *ib_conn;
+	int completed_tx = 0;
+
+	while (ib_poll_cq(cq, 1, &wc) == 1) {
+		tx_desc	= (struct iser_tx_desc *) (unsigned long) wc.wr_id;
+		ib_conn = wc.qp->qp_context;
+		if (wc.status == IB_WC_SUCCESS) {
+			if (wc.opcode == IB_WC_SEND)
+				iser_snd_completion(tx_desc, ib_conn);
+			else
+				iser_err("expected opcode %d got %d\n",
+					IB_WC_SEND, wc.opcode);
+		} else {
+			iser_err("tx id %llx status %d vend_err %x\n",
+				wc.wr_id, wc.status, wc.vendor_err);
+			atomic_dec(&ib_conn->post_send_buf_count);
+			iser_handle_comp_error(tx_desc, ib_conn);
+		}
+		completed_tx++;
+	}
+	return completed_tx;
+}
+
+
 static void iser_cq_tasklet_fn(unsigned long data)
 {
 	 struct iser_device  *device = (struct iser_device *)data;
-	 struct ib_cq	     *cq = device->cq;
+	 struct ib_cq	     *cq = device->rx_cq;
 	 struct ib_wc	     wc;
-	 struct iser_desc    *desc;
+	 struct iser_rx_desc *desc;
 	 unsigned long	     xfer_len;
+	 struct iser_conn    *ib_conn;
+	 int		     completed_tx = 0;
+	 int		     completed_rx = 0;
 
 	while (ib_poll_cq(cq, 1, &wc) == 1) {
-		desc	 = (struct iser_desc *) (unsigned long) wc.wr_id;
+		desc	 = (struct iser_rx_desc *) (unsigned long) wc.wr_id;
 		BUG_ON(desc == NULL);
-
+		ib_conn = wc.qp->qp_context;
 		if (wc.status == IB_WC_SUCCESS) {
-			if (desc->type == ISCSI_RX) {
+			if (wc.opcode == IB_WC_RECV) {
 				xfer_len = (unsigned long)wc.byte_len;
-				iser_rcv_completion(desc, xfer_len);
-			} else /* type == ISCSI_TX_CONTROL/SCSI_CMD/DOUT */
-				iser_snd_completion(desc);
+				iser_rcv_completion(desc, xfer_len, ib_conn);
+			} else
+				iser_err("expected opcode %d got %d\n",
+					IB_WC_RECV, wc.opcode);
 		} else {
-			iser_err("comp w. error op %d status %d\n",desc->type,wc.status);
-			iser_handle_comp_error(desc);
+			if (wc.status != IB_WC_WR_FLUSH_ERR)
+				iser_err("rx id %llx status %d vend_err %x\n",
+					wc.wr_id, wc.status, wc.vendor_err);
+			ib_conn->post_recv_buf_count--;
+			iser_handle_comp_error(NULL, ib_conn);
 		}
+		completed_rx++;
+		if (!(completed_rx & 63))
+			completed_tx += iser_drain_tx_cq(device);
 	}
-	/* #warning "it is assumed here that arming CQ only once its empty" *
-	 * " would not cause interrupts to be missed"                       */
+	/* #warning "it is assumed here that arming CQ only once it's empty" *
+	 * " would not cause interrupts to be missed"                        */
 	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+
+	completed_tx += iser_drain_tx_cq(device);
+	iser_dbg("got %d rx %d tx completions\n", completed_rx, completed_tx);
 }
 
 static void iser_cq_callback(struct ib_cq *cq, void *cq_context)
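iser_post_recvm() above treats ib_conn->rx_descs as a power-of-two ring: the local copy of the head wraps with "& (ISER_QP_MAX_RECV_DTOS - 1)", post_recv_buf_count is charged before ib_post_recv() and rolled back on failure, and rx_desc_head is committed only when the post succeeds. Below is a minimal userspace sketch of that bookkeeping; RING_SIZE, post_recvm() and post_ok() are illustrative stand-ins, and the ring size is only assumed to be a power of two, as the masking requires.

#include <stdio.h>

#define RING_SIZE 512			/* stand-in for ISER_QP_MAX_RECV_DTOS; assumed power of two */

static unsigned int rx_desc_head;	/* next free slot in the ring */
static int post_recv_buf_count;		/* buffers currently owned by the HCA */

/* post_fn stands in for ib_post_recv(); returns 0 on success */
static int post_recvm(int count, int (*post_fn)(void))
{
	unsigned int head = rx_desc_head;
	int i, ret;

	for (i = 0; i < count; i++)
		head = (head + 1) & (RING_SIZE - 1);	/* wrap exactly like the driver */

	post_recv_buf_count += count;			/* charge up front ...             */
	ret = post_fn();
	if (ret)
		post_recv_buf_count -= count;		/* ... roll back if the post failed */
	else
		rx_desc_head = head;			/* commit the head only on success  */
	return ret;
}

static int post_ok(void) { return 0; }

int main(void)
{
	rx_desc_head = RING_SIZE - 2;			/* force a wrap across the ring end */
	post_recvm(5, post_ok);
	printf("head=%u posted=%d\n", rx_desc_head, post_recv_buf_count);	/* head=3 posted=5 */
	return 0;
}
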
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 54c8fe2..ed3f9eb 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -80,7 +80,8 @@
 
 static void srp_add_one(struct ib_device *device);
 static void srp_remove_one(struct ib_device *device);
-static void srp_completion(struct ib_cq *cq, void *target_ptr);
+static void srp_recv_completion(struct ib_cq *cq, void *target_ptr);
+static void srp_send_completion(struct ib_cq *cq, void *target_ptr);
 static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event);
 
 static struct scsi_transport_template *ib_srp_transport_template;
@@ -227,14 +228,21 @@
 	if (!init_attr)
 		return -ENOMEM;
 
-	target->cq = ib_create_cq(target->srp_host->srp_dev->dev,
-				  srp_completion, NULL, target, SRP_CQ_SIZE, 0);
-	if (IS_ERR(target->cq)) {
-		ret = PTR_ERR(target->cq);
-		goto out;
+	target->recv_cq = ib_create_cq(target->srp_host->srp_dev->dev,
+				       srp_recv_completion, NULL, target, SRP_RQ_SIZE, 0);
+	if (IS_ERR(target->recv_cq)) {
+		ret = PTR_ERR(target->recv_cq);
+		goto err;
 	}
 
-	ib_req_notify_cq(target->cq, IB_CQ_NEXT_COMP);
+	target->send_cq = ib_create_cq(target->srp_host->srp_dev->dev,
+				       srp_send_completion, NULL, target, SRP_SQ_SIZE, 0);
+	if (IS_ERR(target->send_cq)) {
+		ret = PTR_ERR(target->send_cq);
+		goto err_recv_cq;
+	}
+
+	ib_req_notify_cq(target->recv_cq, IB_CQ_NEXT_COMP);
 
 	init_attr->event_handler       = srp_qp_event;
 	init_attr->cap.max_send_wr     = SRP_SQ_SIZE;
@@ -243,24 +251,32 @@
 	init_attr->cap.max_send_sge    = 1;
 	init_attr->sq_sig_type         = IB_SIGNAL_ALL_WR;
 	init_attr->qp_type             = IB_QPT_RC;
-	init_attr->send_cq             = target->cq;
-	init_attr->recv_cq             = target->cq;
+	init_attr->send_cq             = target->send_cq;
+	init_attr->recv_cq             = target->recv_cq;
 
 	target->qp = ib_create_qp(target->srp_host->srp_dev->pd, init_attr);
 	if (IS_ERR(target->qp)) {
 		ret = PTR_ERR(target->qp);
-		ib_destroy_cq(target->cq);
-		goto out;
+		goto err_send_cq;
 	}
 
 	ret = srp_init_qp(target, target->qp);
-	if (ret) {
-		ib_destroy_qp(target->qp);
-		ib_destroy_cq(target->cq);
-		goto out;
-	}
+	if (ret)
+		goto err_qp;
 
-out:
+	kfree(init_attr);
+	return 0;
+
+err_qp:
+	ib_destroy_qp(target->qp);
+
+err_send_cq:
+	ib_destroy_cq(target->send_cq);
+
+err_recv_cq:
+	ib_destroy_cq(target->recv_cq);
+
+err:
 	kfree(init_attr);
 	return ret;
 }
@@ -270,7 +286,8 @@
 	int i;
 
 	ib_destroy_qp(target->qp);
-	ib_destroy_cq(target->cq);
+	ib_destroy_cq(target->send_cq);
+	ib_destroy_cq(target->recv_cq);
 
 	for (i = 0; i < SRP_RQ_SIZE; ++i)
 		srp_free_iu(target->srp_host, target->rx_ring[i]);
@@ -568,7 +585,9 @@
 	if (ret)
 		goto err;
 
-	while (ib_poll_cq(target->cq, 1, &wc) > 0)
+	while (ib_poll_cq(target->recv_cq, 1, &wc) > 0)
+		; /* nothing */
+	while (ib_poll_cq(target->send_cq, 1, &wc) > 0)
 		; /* nothing */
 
 	spin_lock_irq(target->scsi_host->host_lock);
@@ -851,7 +870,7 @@
 	struct srp_iu *iu;
 	u8 opcode;
 
-	iu = target->rx_ring[wc->wr_id & ~SRP_OP_RECV];
+	iu = target->rx_ring[wc->wr_id];
 
 	dev = target->srp_host->srp_dev->dev;
 	ib_dma_sync_single_for_cpu(dev, iu->dma, target->max_ti_iu_len,
@@ -898,7 +917,7 @@
 				      DMA_FROM_DEVICE);
 }
 
-static void srp_completion(struct ib_cq *cq, void *target_ptr)
+static void srp_recv_completion(struct ib_cq *cq, void *target_ptr)
 {
 	struct srp_target_port *target = target_ptr;
 	struct ib_wc wc;
@@ -907,17 +926,31 @@
 	while (ib_poll_cq(cq, 1, &wc) > 0) {
 		if (wc.status) {
 			shost_printk(KERN_ERR, target->scsi_host,
-				     PFX "failed %s status %d\n",
-				     wc.wr_id & SRP_OP_RECV ? "receive" : "send",
+				     PFX "failed receive status %d\n",
 				     wc.status);
 			target->qp_in_error = 1;
 			break;
 		}
 
-		if (wc.wr_id & SRP_OP_RECV)
-			srp_handle_recv(target, &wc);
-		else
-			++target->tx_tail;
+		srp_handle_recv(target, &wc);
+	}
+}
+
+static void srp_send_completion(struct ib_cq *cq, void *target_ptr)
+{
+	struct srp_target_port *target = target_ptr;
+	struct ib_wc wc;
+
+	while (ib_poll_cq(cq, 1, &wc) > 0) {
+		if (wc.status) {
+			shost_printk(KERN_ERR, target->scsi_host,
+				     PFX "failed send status %d\n",
+				     wc.status);
+			target->qp_in_error = 1;
+			break;
+		}
+
+		++target->tx_tail;
 	}
 }
 
@@ -930,7 +963,7 @@
 	int ret;
 
 	next 	 = target->rx_head & (SRP_RQ_SIZE - 1);
-	wr.wr_id = next | SRP_OP_RECV;
+	wr.wr_id = next;
 	iu 	 = target->rx_ring[next];
 
 	list.addr   = iu->dma;
@@ -970,6 +1003,8 @@
 {
 	s32 min = (req_type == SRP_REQ_TASK_MGMT) ? 1 : 2;
 
+	srp_send_completion(target->send_cq, target);
+
 	if (target->tx_head - target->tx_tail >= SRP_SQ_SIZE)
 		return NULL;
 
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index e185b90..5a80eac 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -60,7 +60,6 @@
 	SRP_RQ_SHIFT    	= 6,
 	SRP_RQ_SIZE		= 1 << SRP_RQ_SHIFT,
 	SRP_SQ_SIZE		= SRP_RQ_SIZE - 1,
-	SRP_CQ_SIZE		= SRP_SQ_SIZE + SRP_RQ_SIZE,
 
 	SRP_TAG_TSK_MGMT	= 1 << (SRP_RQ_SHIFT + 1),
 
@@ -69,8 +68,6 @@
 	SRP_FMR_DIRTY_SIZE	= SRP_FMR_POOL_SIZE / 4
 };
 
-#define SRP_OP_RECV		(1 << 31)
-
 enum srp_target_state {
 	SRP_TARGET_LIVE,
 	SRP_TARGET_CONNECTING,
@@ -133,7 +130,8 @@
 	int			path_query_id;
 
 	struct ib_cm_id	       *cm_id;
-	struct ib_cq	       *cq;
+	struct ib_cq	       *recv_cq;
+	struct ib_cq	       *send_cq;
 	struct ib_qp	       *qp;
 
 	int			max_ti_iu_len;
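With send and receive completions split onto separate CQs, the SRP initiator no longer tags receive work requests with SRP_OP_RECV; send-queue space is tracked with the free-running tx_head/tx_tail counters, and the allocation path in the last ib_srp.c hunk above first reaps completions via srp_send_completion() on the send CQ, then tests "tx_head - tx_tail >= SRP_SQ_SIZE". The sketch below only illustrates that counter arithmetic, which stays correct across unsigned wrap-around; QUEUE_SIZE and credits_available() are made-up names for the example.

#include <stdio.h>
#include <limits.h>

#define QUEUE_SIZE 63U	/* SRP_SQ_SIZE from ib_srp.h above */

static unsigned int tx_head, tx_tail;

static int credits_available(void)
{
	/* inverse of the driver's "tx_head - tx_tail >= SRP_SQ_SIZE" test */
	return tx_head - tx_tail < QUEUE_SIZE;
}

int main(void)
{
	/* place the counters just before the 32-bit wrap to show that the
	 * subtraction still yields the number of outstanding sends */
	tx_tail = UINT_MAX - 5;
	tx_head = tx_tail + QUEUE_SIZE;		/* send queue currently full */

	printf("full:  credits=%d\n", credits_available());	/* 0 */
	++tx_tail;					/* one send completion reaped */
	printf("freed: credits=%d\n", credits_available());	/* 1 */
	return 0;
}
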
diff --git a/drivers/net/cxgb3/adapter.h b/drivers/net/cxgb3/adapter.h
index 3e8618b..4cd7f42 100644
--- a/drivers/net/cxgb3/adapter.h
+++ b/drivers/net/cxgb3/adapter.h
@@ -264,6 +264,10 @@
 	struct work_struct fatal_error_handler_task;
 	struct work_struct link_fault_handler_task;
 
+	struct work_struct db_full_task;
+	struct work_struct db_empty_task;
+	struct work_struct db_drop_task;
+
 	struct dentry *debugfs_root;
 
 	struct mutex mdio_lock;
@@ -335,6 +339,7 @@
 int t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
 		unsigned char *data);
 irqreturn_t t3_sge_intr_msix(int irq, void *cookie);
+extern struct workqueue_struct *cxgb3_wq;
 
 int t3_get_edc_fw(struct cphy *phy, int edc_idx, int size);
 
diff --git a/drivers/net/cxgb3/cxgb3_main.c b/drivers/net/cxgb3/cxgb3_main.c
index 89bec9c..37945fc 100644
--- a/drivers/net/cxgb3/cxgb3_main.c
+++ b/drivers/net/cxgb3/cxgb3_main.c
@@ -45,6 +45,7 @@
 #include <linux/firmware.h>
 #include <linux/log2.h>
 #include <linux/stringify.h>
+#include <linux/sched.h>
 #include <asm/uaccess.h>
 
 #include "common.h"
@@ -140,7 +141,7 @@
  * will block keventd as it needs the rtnl lock, and we'll deadlock waiting
  * for our work to complete.  Get our own work queue to solve this.
  */
-static struct workqueue_struct *cxgb3_wq;
+struct workqueue_struct *cxgb3_wq;
 
 /**
  *	link_report - show link status and link speed/duplex
@@ -590,6 +591,19 @@
 		      V_RRCPLCPUSIZE(6) | F_HASHTOEPLITZ, cpus, rspq_map);
 }
 
+static void ring_dbs(struct adapter *adap)
+{
+	int i, j;
+
+	for (i = 0; i < SGE_QSETS; i++) {
+		struct sge_qset *qs = &adap->sge.qs[i];
+
+		if (qs->adap)
+			for (j = 0; j < SGE_TXQ_PER_SET; j++)
+				t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(qs->txq[j].cntxt_id));
+	}
+}
+
 static void init_napi(struct adapter *adap)
 {
 	int i;
@@ -2754,6 +2768,42 @@
 	spin_unlock_irq(&adapter->work_lock);
 }
 
+static void db_full_task(struct work_struct *work)
+{
+	struct adapter *adapter = container_of(work, struct adapter,
+					       db_full_task);
+
+	cxgb3_event_notify(&adapter->tdev, OFFLOAD_DB_FULL, 0);
+}
+
+static void db_empty_task(struct work_struct *work)
+{
+	struct adapter *adapter = container_of(work, struct adapter,
+					       db_empty_task);
+
+	cxgb3_event_notify(&adapter->tdev, OFFLOAD_DB_EMPTY, 0);
+}
+
+static void db_drop_task(struct work_struct *work)
+{
+	struct adapter *adapter = container_of(work, struct adapter,
+					       db_drop_task);
+	unsigned long delay = 1000;
+	unsigned short r;
+
+	cxgb3_event_notify(&adapter->tdev, OFFLOAD_DB_DROP, 0);
+
+	/*
+	 * Sleep a while before ringing the driver qset dbs.
+	 * The delay is between 1000 and 2023 usecs.
+	 */
+	get_random_bytes(&r, 2);
+	delay += r & 1023;
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule_timeout(usecs_to_jiffies(delay));
+	ring_dbs(adapter);
+}
+
 /*
  * Processes external (PHY) interrupts in process context.
  */
@@ -3222,6 +3272,11 @@
 	INIT_LIST_HEAD(&adapter->adapter_list);
 	INIT_WORK(&adapter->ext_intr_handler_task, ext_intr_task);
 	INIT_WORK(&adapter->fatal_error_handler_task, fatal_error_task);
+
+	INIT_WORK(&adapter->db_full_task, db_full_task);
+	INIT_WORK(&adapter->db_empty_task, db_empty_task);
+	INIT_WORK(&adapter->db_drop_task, db_drop_task);
+
 	INIT_DELAYED_WORK(&adapter->adap_check_task, t3_adap_check_task);
 
 	for (i = 0; i < ai->nports0 + ai->nports1; ++i) {
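db_drop_task() above notifies the offload clients of the doorbell drop and then waits a pseudo-random interval before re-ringing every egress context: two random bytes are fetched and the low ten bits are added to a 1000 usec base, giving a delay of 1000 to 2023 usecs. The userspace check below only exercises that bounds computation; rand() stands in for get_random_bytes() purely for the example.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	unsigned long min = ~0UL, max = 0;
	int i;

	for (i = 0; i < 100000; i++) {
		unsigned short r = (unsigned short)rand();	/* stand-in for get_random_bytes() */
		unsigned long delay = 1000 + (r & 1023);	/* same arithmetic as db_drop_task() */

		if (delay < min)
			min = delay;
		if (delay > max)
			max = delay;
	}
	printf("delay range: %lu..%lu usecs\n", min, max);	/* expect 1000..2023 */
	return 0;
}
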
diff --git a/drivers/net/cxgb3/cxgb3_offload.h b/drivers/net/cxgb3/cxgb3_offload.h
index 670aa62..929c298 100644
--- a/drivers/net/cxgb3/cxgb3_offload.h
+++ b/drivers/net/cxgb3/cxgb3_offload.h
@@ -73,7 +73,10 @@
 	OFFLOAD_STATUS_UP,
 	OFFLOAD_STATUS_DOWN,
 	OFFLOAD_PORT_DOWN,
-	OFFLOAD_PORT_UP
+	OFFLOAD_PORT_UP,
+	OFFLOAD_DB_FULL,
+	OFFLOAD_DB_EMPTY,
+	OFFLOAD_DB_DROP
 };
 
 struct cxgb3_client {
diff --git a/drivers/net/cxgb3/regs.h b/drivers/net/cxgb3/regs.h
index 1b5327b..cb42353 100644
--- a/drivers/net/cxgb3/regs.h
+++ b/drivers/net/cxgb3/regs.h
@@ -254,6 +254,22 @@
 #define V_LOPIODRBDROPERR(x) ((x) << S_LOPIODRBDROPERR)
 #define F_LOPIODRBDROPERR    V_LOPIODRBDROPERR(1U)
 
+#define S_HIPRIORITYDBFULL    7
+#define V_HIPRIORITYDBFULL(x) ((x) << S_HIPRIORITYDBFULL)
+#define F_HIPRIORITYDBFULL    V_HIPRIORITYDBFULL(1U)
+
+#define S_HIPRIORITYDBEMPTY   6
+#define V_HIPRIORITYDBEMPTY(x) ((x) << S_HIPRIORITYDBEMPTY)
+#define F_HIPRIORITYDBEMPTY    V_HIPRIORITYDBEMPTY(1U)
+
+#define S_LOPRIORITYDBFULL    5
+#define V_LOPRIORITYDBFULL(x) ((x) << S_LOPRIORITYDBFULL)
+#define F_LOPRIORITYDBFULL    V_LOPRIORITYDBFULL(1U)
+
+#define S_LOPRIORITYDBEMPTY   4
+#define V_LOPRIORITYDBEMPTY(x) ((x) << S_LOPRIORITYDBEMPTY)
+#define F_LOPRIORITYDBEMPTY    V_LOPRIORITYDBEMPTY(1U)
+
 #define S_RSPQDISABLED    3
 #define V_RSPQDISABLED(x) ((x) << S_RSPQDISABLED)
 #define F_RSPQDISABLED    V_RSPQDISABLED(1U)
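The new doorbell-status definitions follow the S_/V_/F_ triplet used throughout regs.h: S_NAME is the bit position, V_NAME(x) shifts a field value into place, and F_NAME is the one-bit mask V_NAME(1U). The short sketch below shows how the sge.c interrupt-cause test in the next hunk composes and checks these masks; the bit values are copied from the definitions above, while the status word itself is a made-up example.

#include <stdio.h>

#define S_HIPRIORITYDBFULL     7
#define V_HIPRIORITYDBFULL(x)  ((x) << S_HIPRIORITYDBFULL)
#define F_HIPRIORITYDBFULL     V_HIPRIORITYDBFULL(1U)

#define S_LOPRIORITYDBEMPTY    4
#define V_LOPRIORITYDBEMPTY(x) ((x) << S_LOPRIORITYDBEMPTY)
#define F_LOPRIORITYDBEMPTY    V_LOPRIORITYDBEMPTY(1U)

int main(void)
{
	/* pretend the SGE reported a full high-priority doorbell FIFO and an
	 * empty low-priority one; a made-up status word for the example */
	unsigned int status = F_HIPRIORITYDBFULL | F_LOPRIORITYDBEMPTY;

	if (status & (F_HIPRIORITYDBFULL | F_LOPRIORITYDBEMPTY))
		printf("doorbell event, status = 0x%x\n", status);	/* 0x90 */

	/* extracting a bit back out uses the shift the S_ macro records */
	printf("HIPRIORITYDBFULL = %u\n",
	       (status >> S_HIPRIORITYDBFULL) & 1);			/* 1 */
	return 0;
}
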
diff --git a/drivers/net/cxgb3/sge.c b/drivers/net/cxgb3/sge.c
index 318a018..9b43446 100644
--- a/drivers/net/cxgb3/sge.c
+++ b/drivers/net/cxgb3/sge.c
@@ -42,6 +42,7 @@
 #include "sge_defs.h"
 #include "t3_cpl.h"
 #include "firmware_exports.h"
+#include "cxgb3_offload.h"
 
 #define USE_GTS 0
 
@@ -2833,8 +2834,13 @@
 	}
 
 	if (status & (F_HIPIODRBDROPERR | F_LOPIODRBDROPERR))
-		CH_ALERT(adapter, "SGE dropped %s priority doorbell\n",
-			 status & F_HIPIODRBDROPERR ? "high" : "lo");
+		queue_work(cxgb3_wq, &adapter->db_drop_task);
+
+	if (status & (F_HIPRIORITYDBFULL | F_LOPRIORITYDBFULL))
+		queue_work(cxgb3_wq, &adapter->db_full_task);
+
+	if (status & (F_HIPRIORITYDBEMPTY | F_LOPRIORITYDBEMPTY))
+		queue_work(cxgb3_wq, &adapter->db_empty_task);
 
 	t3_write_reg(adapter, A_SG_INT_CAUSE, status);
 	if (status &  SGE_FATALERR)
diff --git a/drivers/net/cxgb3/t3_hw.c b/drivers/net/cxgb3/t3_hw.c
index 032cfe0..c38fc71 100644
--- a/drivers/net/cxgb3/t3_hw.c
+++ b/drivers/net/cxgb3/t3_hw.c
@@ -1432,7 +1432,10 @@
 		       F_IRPARITYERROR | V_ITPARITYERROR(M_ITPARITYERROR) | \
 		       V_FLPARITYERROR(M_FLPARITYERROR) | F_LODRBPARITYERROR | \
 		       F_HIDRBPARITYERROR | F_LORCQPARITYERROR | \
-		       F_HIRCQPARITYERROR)
+		       F_HIRCQPARITYERROR | F_LOPRIORITYDBFULL | \
+		       F_HIPRIORITYDBFULL | F_LOPRIORITYDBEMPTY | \
+		       F_HIPRIORITYDBEMPTY | F_HIPIODRBDROPERR | \
+		       F_LOPIODRBDROPERR)
 #define MC5_INTR_MASK (F_PARITYERR | F_ACTRGNFULL | F_UNKNOWNCMD | \
 		       F_REQQPARERR | F_DISPQPARERR | F_DELACTEMPTY | \
 		       F_NFASRCHFAIL)
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 09509ed..a585e0f 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -984,9 +984,9 @@
 	struct list_head              event_handler_list;
 	spinlock_t                    event_handler_lock;
 
+	spinlock_t                    client_data_lock;
 	struct list_head              core_list;
 	struct list_head              client_data_list;
-	spinlock_t                    client_data_lock;
 
 	struct ib_cache               cache;
 	int                          *pkey_tbl_len;
@@ -1144,8 +1144,8 @@
 		IB_DEV_UNREGISTERED
 	}                            reg_state;
 
-	u64			     uverbs_cmd_mask;
 	int			     uverbs_abi_ver;
+	u64			     uverbs_cmd_mask;
 
 	char			     node_desc[64];
 	__be64			     node_guid;
diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h
index c6b2962..4fae903 100644
--- a/include/rdma/rdma_cm.h
+++ b/include/rdma/rdma_cm.h
@@ -67,7 +67,6 @@
 	RDMA_PS_IPOIB = 0x0002,
 	RDMA_PS_TCP   = 0x0106,
 	RDMA_PS_UDP   = 0x0111,
-	RDMA_PS_SCTP  = 0x0183
 };
 
 struct rdma_addr {