[SCSI] lpfc 8.2.5 : Correct ndlp referencing issues

Correct ndlp referencing issues:
- Fix ndlp kref issues due to race conditions between threads
- Fix the cancel-ELS-delay-retry event path, which missed an ndlp reference count update

Signed-off-by: James Smart <james.smart@emulex.com>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c
index dc042bd..1ee3e62 100644
--- a/drivers/scsi/lpfc/lpfc_hbadisc.c
+++ b/drivers/scsi/lpfc/lpfc_hbadisc.c
@@ -1,7 +1,7 @@
 /*******************************************************************
  * This file is part of the Emulex Linux Device Driver for         *
  * Fibre Channel Host Bus Adapters.                                *
- * Copyright (C) 2004-2007 Emulex.  All rights reserved.           *
+ * Copyright (C) 2004-2008 Emulex.  All rights reserved.           *
  * EMULEX and SLI are trademarks of Emulex.                        *
  * www.emulex.com                                                  *
  * Portions Copyright (C) 2004-2005 Christoph Hellwig              *
@@ -272,9 +272,8 @@
 	if (!(vport->load_flag & FC_UNLOADING) &&
 	    !(ndlp->nlp_flag & NLP_DELAY_TMO) &&
 	    !(ndlp->nlp_flag & NLP_NPR_2B_DISC) &&
-	    (ndlp->nlp_state != NLP_STE_UNMAPPED_NODE)) {
+	    (ndlp->nlp_state != NLP_STE_UNMAPPED_NODE))
 		lpfc_disc_state_machine(vport, ndlp, NULL, NLP_EVT_DEVICE_RM);
-	}
 }
 
 
@@ -566,9 +565,10 @@
 	int  rc;
 
 	list_for_each_entry_safe(ndlp, next_ndlp, &vport->fc_nodes, nlp_listp) {
+		if (!NLP_CHK_NODE_ACT(ndlp))
+			continue;
 		if (ndlp->nlp_state == NLP_STE_UNUSED_NODE)
 			continue;
-
 		if ((phba->sli3_options & LPFC_SLI3_VPORT_TEARDOWN) ||
 			((vport->port_type == LPFC_NPIV_PORT) &&
 			(ndlp->nlp_DID == NameServer_DID)))
@@ -684,20 +684,21 @@
 	struct lpfc_nodelist *ndlp;
 
 	list_for_each_entry(ndlp, &vport->fc_nodes, nlp_listp) {
+		if (!NLP_CHK_NODE_ACT(ndlp))
+			continue;
 		if (ndlp->nlp_state == NLP_STE_UNUSED_NODE)
 			continue;
-
 		if (ndlp->nlp_type & NLP_FABRIC) {
-				/* On Linkup its safe to clean up the ndlp
-				 * from Fabric connections.
-				 */
+			/* On Linkup it's safe to clean up the ndlp
+			 * from Fabric connections.
+			 */
 			if (ndlp->nlp_DID != Fabric_DID)
 				lpfc_unreg_rpi(vport, ndlp);
 			lpfc_nlp_set_state(vport, ndlp, NLP_STE_NPR_NODE);
 		} else if (!(ndlp->nlp_flag & NLP_NPR_ADISC)) {
-				/* Fail outstanding IO now since device is
-				 * marked for PLOGI.
-				 */
+			/* Fail outstanding IO now since device is
+			 * marked for PLOGI.
+			 */
 			lpfc_unreg_rpi(vport, ndlp);
 		}
 	}
@@ -1305,7 +1306,6 @@
 		lpfc_mbuf_free(phba, mp->virt, mp->phys);
 		kfree(mp);
 		mempool_free(pmb, phba->mbox_mem_pool);
-		lpfc_nlp_put(ndlp);
 
 		if (phba->fc_topology == TOPOLOGY_LOOP) {
 			/* FLOGI failed, use loop map to make discovery list */
@@ -1313,6 +1313,10 @@
 
 			/* Start discovery */
 			lpfc_disc_start(vport);
+			/* Drop the reference count on the ndlp only after
+			 * all accesses to the ndlp above are done.
+			 */
+			lpfc_nlp_put(ndlp);
 			return;
 		}
 
@@ -1320,6 +1324,10 @@
 		lpfc_printf_vlog(vport, KERN_ERR, LOG_MBOX,
 				 "0258 Register Fabric login error: 0x%x\n",
 				 mb->mbxStatus);
+		/* Drop the reference count on the ndlp only after all
+		 * accesses to the ndlp above are done.
+		 */
+		lpfc_nlp_put(ndlp);
 		return;
 	}
 
@@ -1327,8 +1335,6 @@
 	ndlp->nlp_type |= NLP_FABRIC;
 	lpfc_nlp_set_state(vport, ndlp, NLP_STE_UNMAPPED_NODE);
 
-	lpfc_nlp_put(ndlp);	/* Drop the reference from the mbox */
-
 	if (vport->port_state == LPFC_FABRIC_CFG_LINK) {
 		vports = lpfc_create_vport_work_array(phba);
 		if (vports != NULL)
@@ -1356,6 +1362,11 @@
 	lpfc_mbuf_free(phba, mp->virt, mp->phys);
 	kfree(mp);
 	mempool_free(pmb, phba->mbox_mem_pool);
+
+	/* Drop the reference held by the mbox last, after all the
+	 * accesses to the ndlp in this routine are done.
+	 */
+	lpfc_nlp_put(ndlp);
 	return;
 }
 
@@ -1463,9 +1474,8 @@
 	 * registered the port.
 	 */
 	if (ndlp->rport && ndlp->rport->dd_data &&
-	    ((struct lpfc_rport_data *) ndlp->rport->dd_data)->pnode == ndlp) {
+	    ((struct lpfc_rport_data *) ndlp->rport->dd_data)->pnode == ndlp)
 		lpfc_nlp_put(ndlp);
-	}
 
 	lpfc_debugfs_disc_trc(vport, LPFC_DISC_TRC_RPORT,
 		"rport add:       did:x%x flg:x%x type x%x",
@@ -1660,6 +1670,18 @@
 }
 
 void
+lpfc_enqueue_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
+{
+	struct Scsi_Host *shost = lpfc_shost_from_vport(vport);
+
+	if (list_empty(&ndlp->nlp_listp)) {
+		spin_lock_irq(shost->host_lock);
+		list_add_tail(&ndlp->nlp_listp, &vport->fc_nodes);
+		spin_unlock_irq(shost->host_lock);
+	}
+}
+
+void
 lpfc_dequeue_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
 {
 	struct Scsi_Host *shost = lpfc_shost_from_vport(vport);
@@ -1672,7 +1694,80 @@
 	list_del_init(&ndlp->nlp_listp);
 	spin_unlock_irq(shost->host_lock);
 	lpfc_nlp_state_cleanup(vport, ndlp, ndlp->nlp_state,
-			       NLP_STE_UNUSED_NODE);
+				NLP_STE_UNUSED_NODE);
+}
+
+void
+lpfc_disable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
+{
+	if ((ndlp->nlp_flag & NLP_DELAY_TMO) != 0)
+		lpfc_cancel_retry_delay_tmo(vport, ndlp);
+	if (ndlp->nlp_state && !list_empty(&ndlp->nlp_listp))
+		lpfc_nlp_counters(vport, ndlp->nlp_state, -1);
+	lpfc_nlp_state_cleanup(vport, ndlp, ndlp->nlp_state,
+				NLP_STE_UNUSED_NODE);
+}
+
+struct lpfc_nodelist *
+lpfc_enable_node(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp,
+		 int state)
+{
+	struct lpfc_hba *phba = vport->phba;
+	uint32_t did;
+	unsigned long flags;
+
+	if (!ndlp)
+		return NULL;
+
+	spin_lock_irqsave(&phba->ndlp_lock, flags);
+	/* The ndlp should not be in memory free mode */
+	if (NLP_CHK_FREE_REQ(ndlp)) {
+		spin_unlock_irqrestore(&phba->ndlp_lock, flags);
+		lpfc_printf_vlog(vport, KERN_WARNING, LOG_NODE,
+				"0277 lpfc_enable_node: ndlp:x%p "
+				"usgmap:x%x refcnt:%d\n",
+				(void *)ndlp, ndlp->nlp_usg_map,
+				atomic_read(&ndlp->kref.refcount));
+		return NULL;
+	}
+	/* The ndlp should not already be in active mode */
+	if (NLP_CHK_NODE_ACT(ndlp)) {
+		spin_unlock_irqrestore(&phba->ndlp_lock, flags);
+		lpfc_printf_vlog(vport, KERN_WARNING, LOG_NODE,
+				"0278 lpfc_enable_node: ndlp:x%p "
+				"usgmap:x%x refcnt:%d\n",
+				(void *)ndlp, ndlp->nlp_usg_map,
+				atomic_read(&ndlp->kref.refcount));
+		return NULL;
+	}
+
+	/* Keep the original DID */
+	did = ndlp->nlp_DID;
+
+	/* re-initialize ndlp except for the ndlp linked list pointer */
+	memset((((char *)ndlp) + sizeof (struct list_head)), 0,
+		sizeof (struct lpfc_nodelist) - sizeof (struct list_head));
+	INIT_LIST_HEAD(&ndlp->els_retry_evt.evt_listp);
+	INIT_LIST_HEAD(&ndlp->dev_loss_evt.evt_listp);
+	init_timer(&ndlp->nlp_delayfunc);
+	ndlp->nlp_delayfunc.function = lpfc_els_retry_delay;
+	ndlp->nlp_delayfunc.data = (unsigned long)ndlp;
+	ndlp->nlp_DID = did;
+	ndlp->vport = vport;
+	ndlp->nlp_sid = NLP_NO_SID;
+	/* ndlp management re-initialize */
+	kref_init(&ndlp->kref);
+	NLP_INT_NODE_ACT(ndlp);
+
+	spin_unlock_irqrestore(&phba->ndlp_lock, flags);
+
+	if (state != NLP_STE_UNUSED_NODE)
+		lpfc_nlp_set_state(vport, ndlp, state);
+
+	lpfc_debugfs_disc_trc(vport, LPFC_DISC_TRC_NODE,
+		"node enable:       did:x%x",
+		ndlp->nlp_DID, 0, 0);
+	return ndlp;
 }
 
 void
@@ -1972,7 +2067,21 @@
 			 "Data: x%x x%x x%x\n",
 			 ndlp->nlp_DID, ndlp->nlp_flag,
 			 ndlp->nlp_state, ndlp->nlp_rpi);
-	lpfc_dequeue_node(vport, ndlp);
+	if (NLP_CHK_FREE_REQ(ndlp)) {
+		lpfc_printf_vlog(vport, KERN_WARNING, LOG_NODE,
+				"0280 lpfc_cleanup_node: ndlp:x%p "
+				"usgmap:x%x refcnt:%d\n",
+				(void *)ndlp, ndlp->nlp_usg_map,
+				atomic_read(&ndlp->kref.refcount));
+		lpfc_dequeue_node(vport, ndlp);
+	} else {
+		lpfc_printf_vlog(vport, KERN_WARNING, LOG_NODE,
+				"0281 lpfc_cleanup_node: ndlp:x%p "
+				"usgmap:x%x refcnt:%d\n",
+				(void *)ndlp, ndlp->nlp_usg_map,
+				atomic_read(&ndlp->kref.refcount));
+		lpfc_disable_node(vport, ndlp);
+	}
 
 	/* cleanup any ndlp on mbox q waiting for reglogin cmpl */
 	if ((mb = phba->sli.mbox_active)) {
@@ -1994,12 +2103,16 @@
 			}
 			list_del(&mb->list);
 			mempool_free(mb, phba->mbox_mem_pool);
-			lpfc_nlp_put(ndlp);
+			/* Do not call lpfc_nlp_put() here to decrement
+			 * the ndlp reference count, as we are already in
+			 * the process of lpfc_nlp_release.
+			 */
 		}
 	}
 	spin_unlock_irq(&phba->hbalock);
 
-	lpfc_els_abort(phba,ndlp);
+	lpfc_els_abort(phba, ndlp);
+
 	spin_lock_irq(shost->host_lock);
 	ndlp->nlp_flag &= ~NLP_DELAY_TMO;
 	spin_unlock_irq(shost->host_lock);
@@ -2057,7 +2170,6 @@
 			}
 		}
 	}
-
 	lpfc_cleanup_node(vport, ndlp);
 
 	/*
@@ -2182,7 +2294,16 @@
 		ndlp->nlp_flag |= NLP_NPR_2B_DISC;
 		spin_unlock_irq(shost->host_lock);
 		return ndlp;
+	} else if (!NLP_CHK_NODE_ACT(ndlp)) {
+		ndlp = lpfc_enable_node(vport, ndlp, NLP_STE_NPR_NODE);
+		if (!ndlp)
+			return NULL;
+		spin_lock_irq(shost->host_lock);
+		ndlp->nlp_flag |= NLP_NPR_2B_DISC;
+		spin_unlock_irq(shost->host_lock);
+		return ndlp;
 	}
+
 	if (vport->fc_flag & FC_RSCN_MODE) {
 		if (lpfc_rscn_payload_check(vport, did)) {
 			/* If we've already recieved a PLOGI from this NPort
@@ -2485,6 +2606,8 @@
 	if (vport->fc_plogi_cnt || vport->fc_adisc_cnt) {
 		list_for_each_entry_safe(ndlp, next_ndlp, &vport->fc_nodes,
 					 nlp_listp) {
+			if (!NLP_CHK_NODE_ACT(ndlp))
+				continue;
 			if (ndlp->nlp_state == NLP_STE_PLOGI_ISSUE ||
 			    ndlp->nlp_state == NLP_STE_ADISC_ISSUE) {
 				lpfc_free_tx(phba, ndlp);
@@ -2572,6 +2695,8 @@
 		/* Start discovery by sending FLOGI, clean up old rpis */
 		list_for_each_entry_safe(ndlp, next_ndlp, &vport->fc_nodes,
 					 nlp_listp) {
+			if (!NLP_CHK_NODE_ACT(ndlp))
+				continue;
 			if (ndlp->nlp_state != NLP_STE_NPR_NODE)
 				continue;
 			if (ndlp->nlp_type & NLP_FABRIC) {
@@ -2618,7 +2743,7 @@
 				 "NameServer login\n");
 		/* Next look for NameServer ndlp */
 		ndlp = lpfc_findnode_did(vport, NameServer_DID);
-		if (ndlp)
+		if (ndlp && NLP_CHK_NODE_ACT(ndlp))
 			lpfc_els_abort(phba, ndlp);
 
 		/* ReStart discovery */
@@ -2897,6 +3022,7 @@
 	ndlp->nlp_sid = NLP_NO_SID;
 	INIT_LIST_HEAD(&ndlp->nlp_listp);
 	kref_init(&ndlp->kref);
+	NLP_INT_NODE_ACT(ndlp);
 
 	lpfc_debugfs_disc_trc(vport, LPFC_DISC_TRC_NODE,
 		"node init:       did:x%x",
@@ -2911,6 +3037,8 @@
 static void
 lpfc_nlp_release(struct kref *kref)
 {
+	struct lpfc_hba *phba;
+	unsigned long flags;
 	struct lpfc_nodelist *ndlp = container_of(kref, struct lpfc_nodelist,
 						  kref);
 
@@ -2918,8 +3046,24 @@
 		"node release:    did:x%x flg:x%x type:x%x",
 		ndlp->nlp_DID, ndlp->nlp_flag, ndlp->nlp_type);
 
+	lpfc_printf_vlog(ndlp->vport, KERN_INFO, LOG_NODE,
+			"0279 lpfc_nlp_release: ndlp:x%p "
+			"usgmap:x%x refcnt:%d\n",
+			(void *)ndlp, ndlp->nlp_usg_map,
+			atomic_read(&ndlp->kref.refcount));
+
+	/* remove ndlp from action. */
 	lpfc_nlp_remove(ndlp->vport, ndlp);
-	mempool_free(ndlp, ndlp->vport->phba->nlp_mem_pool);
+
+	/* clear the ndlp active flag for all release cases */
+	phba = ndlp->vport->phba;
+	spin_lock_irqsave(&phba->ndlp_lock, flags);
+	NLP_CLR_NODE_ACT(ndlp);
+	spin_unlock_irqrestore(&phba->ndlp_lock, flags);
+
+	/* free ndlp memory for final ndlp release */
+	if (NLP_CHK_FREE_REQ(ndlp))
+		mempool_free(ndlp, ndlp->vport->phba->nlp_mem_pool);
 }
 
 /* This routine bumps the reference count for a ndlp structure to ensure
@@ -2929,37 +3073,108 @@
 struct lpfc_nodelist *
 lpfc_nlp_get(struct lpfc_nodelist *ndlp)
 {
+	struct lpfc_hba *phba;
+	unsigned long flags;
+
 	if (ndlp) {
 		lpfc_debugfs_disc_trc(ndlp->vport, LPFC_DISC_TRC_NODE,
 			"node get:        did:x%x flg:x%x refcnt:x%x",
 			ndlp->nlp_DID, ndlp->nlp_flag,
 			atomic_read(&ndlp->kref.refcount));
-		kref_get(&ndlp->kref);
+		/* Check the ndlp usage flags to avoid incrementing the
+		 * reference count of an ndlp that is in the process of
+		 * being released.
+		 */
+		phba = ndlp->vport->phba;
+		spin_lock_irqsave(&phba->ndlp_lock, flags);
+		if (!NLP_CHK_NODE_ACT(ndlp) || NLP_CHK_FREE_ACK(ndlp)) {
+			spin_unlock_irqrestore(&phba->ndlp_lock, flags);
+			lpfc_printf_vlog(ndlp->vport, KERN_WARNING, LOG_NODE,
+				"0276 lpfc_nlp_get: ndlp:x%p "
+				"usgmap:x%x refcnt:%d\n",
+				(void *)ndlp, ndlp->nlp_usg_map,
+				atomic_read(&ndlp->kref.refcount));
+			return NULL;
+		} else
+			kref_get(&ndlp->kref);
+		spin_unlock_irqrestore(&phba->ndlp_lock, flags);
 	}
 	return ndlp;
 }
 
-
 /* This routine decrements the reference count for a ndlp structure. If the
- * count goes to 0, this indicates the the associated nodelist should be freed.
+ * count goes to 0, this indicates that the associated nodelist should be
+ * freed. Returning 1 indicates the ndlp resource has been released; on the
+ * other hand, returning 0 indicates the ndlp resource has not been released
+ * yet.
  */
 int
 lpfc_nlp_put(struct lpfc_nodelist *ndlp)
 {
-	if (ndlp) {
-		lpfc_debugfs_disc_trc(ndlp->vport, LPFC_DISC_TRC_NODE,
-		"node put:        did:x%x flg:x%x refcnt:x%x",
-			ndlp->nlp_DID, ndlp->nlp_flag,
-			atomic_read(&ndlp->kref.refcount));
+	struct lpfc_hba *phba;
+	unsigned long flags;
+
+	if (!ndlp)
+		return 1;
+
+	lpfc_debugfs_disc_trc(ndlp->vport, LPFC_DISC_TRC_NODE,
+	"node put:        did:x%x flg:x%x refcnt:x%x",
+		ndlp->nlp_DID, ndlp->nlp_flag,
+		atomic_read(&ndlp->kref.refcount));
+	phba = ndlp->vport->phba;
+	spin_lock_irqsave(&phba->ndlp_lock, flags);
+	/* Check the ndlp memory free acknowledge flag to avoid the
+	 * possible race condition that kref_put got invoked again
+	 * after previous one has done ndlp memory free.
+	 */
+	if (NLP_CHK_FREE_ACK(ndlp)) {
+		spin_unlock_irqrestore(&phba->ndlp_lock, flags);
+		lpfc_printf_vlog(ndlp->vport, KERN_WARNING, LOG_NODE,
+				"0274 lpfc_nlp_put: ndlp:x%p "
+				"usgmap:x%x refcnt:%d\n",
+				(void *)ndlp, ndlp->nlp_usg_map,
+				atomic_read(&ndlp->kref.refcount));
+		return 1;
 	}
-	return ndlp ? kref_put(&ndlp->kref, lpfc_nlp_release) : 0;
+	/* Check the ndlp inactivate log flag to avoid the possible
+	 * race condition that kref_put got invoked again after ndlp
+	 * is already in inactivating state.
+	 */
+	if (NLP_CHK_IACT_REQ(ndlp)) {
+		spin_unlock_irqrestore(&phba->ndlp_lock, flags);
+		lpfc_printf_vlog(ndlp->vport, KERN_WARNING, LOG_NODE,
+				"0275 lpfc_nlp_put: ndlp:x%p "
+				"usgmap:x%x refcnt:%d\n",
+				(void *)ndlp, ndlp->nlp_usg_map,
+				atomic_read(&ndlp->kref.refcount));
+		return 1;
+	}
+	/* For the last put, mark the ndlp usage flags so that no
+	 * other kref_get or kref_put on the same ndlp can get in
+	 * between once the final kref_put has been invoked on
+	 * this ndlp.
+	 */
+	if (atomic_read(&ndlp->kref.refcount) == 1) {
+		/* Indicate ndlp is put to inactive state. */
+		NLP_SET_IACT_REQ(ndlp);
+		/* Acknowledge ndlp memory free has been seen. */
+		if (NLP_CHK_FREE_REQ(ndlp))
+			NLP_SET_FREE_ACK(ndlp);
+	}
+	spin_unlock_irqrestore(&phba->ndlp_lock, flags);
+	/* Note: when the reference count is 1, kref_put invokes the
+	 * release callback function and returns 1, but it leaves the
+	 * reference count at 1 (it does not actually perform the last
+	 * decrement). Otherwise, it actually decrements the reference
+	 * count and returns 0.
+	 */
+	return kref_put(&ndlp->kref, lpfc_nlp_release);
 }
 
 /* This routine free's the specified nodelist if it is not in use
- * by any other discovery thread. This routine returns 1 if the ndlp
- * is not being used by anyone and has been freed. A return value of
- * 0 indicates it is being used by another discovery thread and the
- * refcount is left unchanged.
+ * by any other discovery thread. This routine returns 1 if the
+ * ndlp has been freed. A return value of 0 indicates the ndlp has
+ * not yet been released.
  */
 int
 lpfc_nlp_not_used(struct lpfc_nodelist *ndlp)
@@ -2968,11 +3183,8 @@
 		"node not used:   did:x%x flg:x%x refcnt:x%x",
 		ndlp->nlp_DID, ndlp->nlp_flag,
 		atomic_read(&ndlp->kref.refcount));
-
-	if (atomic_read(&ndlp->kref.refcount) == 1) {
-		lpfc_nlp_put(ndlp);
-		return 1;
-	}
+	if (atomic_read(&ndlp->kref.refcount) == 1)
+		if (lpfc_nlp_put(ndlp))
+			return 1;
 	return 0;
 }
-