[SCSI] bnx2i: Added the use of kthreads to handle SCSI cmd completion

This patch breaks the SCSI cmd completion into two parts:
1. The bh will allocate and queued work to the cmd specific CPU IO
completion kthread.  The CPU for the cmd is from the sc->request->cpu.

2. The CPU specific IO completion kthread will call the scsi_cmd_resp
routine to do the actual cmd completion.

In the normal case, these IO completion kthreads should complete before
the blk IO times out at 60s.  However, in the case when these kthreads
are blocked for whatever reason and exceeded the timeout, the call
to conn_destroy will have to iterate and exhaust all related work in the
percpu work list for all online CPUs.  This will guarantee the protection
of the work->session and conn pointers before they get freed.

Also modified the event coalescing formula to have at least the
event_coal_min outstanding cmds in the pipeline so the SCSI producer
would not get underrun.

Also changed the following SCSI parameters:
- can_queue from 1024 to 2048
- cmds_per_lun from 24 to 128

Signed-off-by: Eddie Wai <eddie.wai@broadcom.com>
Acked-by: Benjamin Li <benli@broadcom.com>
Acked-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: James Bottomley <JBottomley@Parallels.com>
diff --git a/drivers/scsi/bnx2i/bnx2i_iscsi.c b/drivers/scsi/bnx2i/bnx2i_iscsi.c
index 041928b..9d40f32 100644
--- a/drivers/scsi/bnx2i/bnx2i_iscsi.c
+++ b/drivers/scsi/bnx2i/bnx2i_iscsi.c
@@ -27,6 +27,7 @@
  */
 static DEFINE_SPINLOCK(bnx2i_resc_lock); /* protects global resources */
 
+DECLARE_PER_CPU(struct bnx2i_percpu_s, bnx2i_percpu);
 
 static int bnx2i_adapter_ready(struct bnx2i_hba *hba)
 {
@@ -1214,7 +1215,8 @@
 	struct bnx2i_cmd *cmd = task->dd_data;
 	struct iscsi_cmd *hdr = (struct iscsi_cmd *) task->hdr;
 
-	if (bnx2i_conn->ep->num_active_cmds + 1 > hba->max_sqes)
+	if (atomic_read(&bnx2i_conn->ep->num_active_cmds) + 1  >
+	    hba->max_sqes)
 		return -ENOMEM;
 
 	/*
@@ -1354,6 +1356,9 @@
 	bnx2i_conn = conn->dd_data;
 	bnx2i_conn->cls_conn = cls_conn;
 	bnx2i_conn->hba = hba;
+
+	atomic_set(&bnx2i_conn->work_cnt, 0);
+
 	/* 'ep' ptr will be assigned in bind() call */
 	bnx2i_conn->ep = NULL;
 	init_completion(&bnx2i_conn->cmd_cleanup_cmpl);
@@ -1457,11 +1462,34 @@
 	struct bnx2i_conn *bnx2i_conn = conn->dd_data;
 	struct Scsi_Host *shost;
 	struct bnx2i_hba *hba;
+	struct bnx2i_work *work, *tmp;
+	unsigned cpu = 0;
+	struct bnx2i_percpu_s *p;
 
 	shost = iscsi_session_to_shost(iscsi_conn_to_session(cls_conn));
 	hba = iscsi_host_priv(shost);
 
 	bnx2i_conn_free_login_resources(hba, bnx2i_conn);
+
+	if (atomic_read(&bnx2i_conn->work_cnt)) {
+		for_each_online_cpu(cpu) {
+			p = &per_cpu(bnx2i_percpu, cpu);
+			spin_lock_bh(&p->p_work_lock);
+			list_for_each_entry_safe(work, tmp,
+						 &p->work_list, list) {
+				if (work->session == conn->session &&
+				    work->bnx2i_conn == bnx2i_conn) {
+					list_del_init(&work->list);
+					kfree(work);
+					if (!atomic_dec_and_test(
+							&bnx2i_conn->work_cnt))
+						break;
+				}
+			}
+			spin_unlock_bh(&p->p_work_lock);
+		}
+	}
+
 	iscsi_conn_teardown(cls_conn);
 }
 
@@ -1769,7 +1797,7 @@
 	}
 	bnx2i_ep = ep->dd_data;
 
-	bnx2i_ep->num_active_cmds = 0;
+	atomic_set(&bnx2i_ep->num_active_cmds, 0);
 	iscsi_cid = bnx2i_alloc_iscsi_cid(hba);
 	if (iscsi_cid == -1) {
 		printk(KERN_ALERT "bnx2i (%s): alloc_ep - unable to allocate "
@@ -2163,9 +2191,9 @@
 	.eh_device_reset_handler = iscsi_eh_device_reset,
 	.eh_target_reset_handler = iscsi_eh_recover_target,
 	.change_queue_depth	= iscsi_change_queue_depth,
-	.can_queue		= 1024,
+	.can_queue		= 2048,
 	.max_sectors		= 127,
-	.cmd_per_lun		= 24,
+	.cmd_per_lun		= 128,
 	.this_id		= -1,
 	.use_clustering		= ENABLE_CLUSTERING,
 	.sg_tablesize		= ISCSI_MAX_BDS_PER_CMD,