RDMA/cxgb3: Correctly serialize peer abort path

Open MPI and other stress testing exposed a few bad bugs in handling
aborts in the middle of a normal close.  Fix these by:

 - serializing abort reply and peer abort processing with disconnect
   processing

 - warning (and ignoring) if ep timer is stopped when it wasn't running

 - cleaning up disconnect path to correctly deal with aborting and
   dead endpoints

 - in iwch_modify_qp(), taking a ref on the ep before releasing the qp
   lock if iwch_ep_disconnect() will be called.  The ref is dropped
   after calling disconnect.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c
index 72ca360..0b515d8 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_cm.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c
@@ -125,6 +125,12 @@
 static void stop_ep_timer(struct iwch_ep *ep)
 {
 	PDBG("%s ep %p\n", __func__, ep);
+	if (!timer_pending(&ep->timer)) {
+		printk(KERN_ERR "%s timer stopped when its not running!  ep %p state %u\n",
+			__func__, ep, ep->com.state);
+		WARN_ON(1);
+		return;
+	}
 	del_timer_sync(&ep->timer);
 	put_ep(&ep->com);
 }
@@ -1083,8 +1089,11 @@
 static int abort_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
 {
 	struct iwch_ep *ep = ctx;
+	unsigned long flags;
+	int release = 0;
 
 	PDBG("%s ep %p\n", __func__, ep);
+	BUG_ON(!ep);
 
 	/*
 	 * We get 2 abort replies from the HW.  The first one must
@@ -1095,9 +1104,22 @@
 		return CPL_RET_BUF_DONE;
 	}
 
-	close_complete_upcall(ep);
-	state_set(&ep->com, DEAD);
-	release_ep_resources(ep);
+	spin_lock_irqsave(&ep->com.lock, flags);
+	switch (ep->com.state) {
+	case ABORTING:
+		close_complete_upcall(ep);
+		__state_set(&ep->com, DEAD);
+		release = 1;
+		break;
+	default:
+		printk(KERN_ERR "%s ep %p state %d\n",
+		     __func__, ep, ep->com.state);
+		break;
+	}
+	spin_unlock_irqrestore(&ep->com.lock, flags);
+
+	if (release)
+		release_ep_resources(ep);
 	return CPL_RET_BUF_DONE;
 }
 
@@ -1470,7 +1492,8 @@
 	struct sk_buff *rpl_skb;
 	struct iwch_qp_attributes attrs;
 	int ret;
-	int state;
+	int release = 0;
+	unsigned long flags;
 
 	if (is_neg_adv_abort(req->status)) {
 		PDBG("%s neg_adv_abort ep %p tid %d\n", __func__, ep,
@@ -1488,9 +1511,9 @@
 		return CPL_RET_BUF_DONE;
 	}
 
-	state = state_read(&ep->com);
-	PDBG("%s ep %p state %u\n", __func__, ep, state);
-	switch (state) {
+	spin_lock_irqsave(&ep->com.lock, flags);
+	PDBG("%s ep %p state %u\n", __func__, ep, ep->com.state);
+	switch (ep->com.state) {
 	case CONNECTING:
 		break;
 	case MPA_REQ_WAIT:
@@ -1536,21 +1559,25 @@
 		break;
 	case DEAD:
 		PDBG("%s PEER_ABORT IN DEAD STATE!!!!\n", __func__);
+		spin_unlock_irqrestore(&ep->com.lock, flags);
 		return CPL_RET_BUF_DONE;
 	default:
 		BUG_ON(1);
 		break;
 	}
 	dst_confirm(ep->dst);
+	if (ep->com.state != ABORTING) {
+		__state_set(&ep->com, DEAD);
+		release = 1;
+	}
+	spin_unlock_irqrestore(&ep->com.lock, flags);
 
 	rpl_skb = get_skb(skb, sizeof(*rpl), GFP_KERNEL);
 	if (!rpl_skb) {
 		printk(KERN_ERR MOD "%s - cannot allocate skb!\n",
 		       __func__);
-		dst_release(ep->dst);
-		l2t_release(L2DATA(ep->com.tdev), ep->l2t);
-		put_ep(&ep->com);
-		return CPL_RET_BUF_DONE;
+		release = 1;
+		goto out;
 	}
 	rpl_skb->priority = CPL_PRIORITY_DATA;
 	rpl = (struct cpl_abort_rpl *) skb_put(rpl_skb, sizeof(*rpl));
@@ -1559,10 +1586,9 @@
 	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, ep->hwtid));
 	rpl->cmd = CPL_ABORT_NO_RST;
 	cxgb3_ofld_send(ep->com.tdev, rpl_skb);
-	if (state != ABORTING) {
-		state_set(&ep->com, DEAD);
+out:
+	if (release)
 		release_ep_resources(ep);
-	}
 	return CPL_RET_BUF_DONE;
 }
 
@@ -1661,15 +1687,18 @@
 	struct iwch_ep *ep = (struct iwch_ep *)arg;
 	struct iwch_qp_attributes attrs;
 	unsigned long flags;
+	int abort = 1;
 
 	spin_lock_irqsave(&ep->com.lock, flags);
 	PDBG("%s ep %p tid %u state %d\n", __func__, ep, ep->hwtid,
 	     ep->com.state);
 	switch (ep->com.state) {
 	case MPA_REQ_SENT:
+		__state_set(&ep->com, ABORTING);
 		connect_reply_upcall(ep, -ETIMEDOUT);
 		break;
 	case MPA_REQ_WAIT:
+		__state_set(&ep->com, ABORTING);
 		break;
 	case CLOSING:
 	case MORIBUND:
@@ -1679,13 +1708,17 @@
 				     ep->com.qp, IWCH_QP_ATTR_NEXT_STATE,
 				     &attrs, 1);
 		}
+		__state_set(&ep->com, ABORTING);
 		break;
 	default:
-		BUG();
+		printk(KERN_ERR "%s unexpected state ep %p state %u\n",
+			__func__, ep, ep->com.state);
+		WARN_ON(1);
+		abort = 0;
 	}
-	__state_set(&ep->com, CLOSING);
 	spin_unlock_irqrestore(&ep->com.lock, flags);
-	abort_connection(ep, NULL, GFP_ATOMIC);
+	if (abort)
+		abort_connection(ep, NULL, GFP_ATOMIC);
 	put_ep(&ep->com);
 }
 
@@ -1968,40 +2001,39 @@
 	PDBG("%s ep %p state %s, abrupt %d\n", __func__, ep,
 	     states[ep->com.state], abrupt);
 
-	if (ep->com.state == DEAD) {
-		PDBG("%s already dead ep %p\n", __func__, ep);
-		goto out;
-	}
-
-	if (abrupt) {
-		if (ep->com.state != ABORTING) {
-			ep->com.state = ABORTING;
-			close = 1;
-		}
-		goto out;
-	}
-
 	switch (ep->com.state) {
 	case MPA_REQ_WAIT:
 	case MPA_REQ_SENT:
 	case MPA_REQ_RCVD:
 	case MPA_REP_SENT:
 	case FPDU_MODE:
-		start_ep_timer(ep);
-		ep->com.state = CLOSING;
 		close = 1;
+		if (abrupt)
+			ep->com.state = ABORTING;
+		else {
+			ep->com.state = CLOSING;
+			start_ep_timer(ep);
+		}
 		break;
 	case CLOSING:
-		ep->com.state = MORIBUND;
 		close = 1;
+		if (abrupt) {
+			stop_ep_timer(ep);
+			ep->com.state = ABORTING;
+		} else
+			ep->com.state = MORIBUND;
 		break;
 	case MORIBUND:
+	case ABORTING:
+	case DEAD:
+		PDBG("%s ignoring disconnect ep %p state %u\n",
+		     __func__, ep, ep->com.state);
 		break;
 	default:
 		BUG();
 		break;
 	}
-out:
+
 	spin_unlock_irqrestore(&ep->com.lock, flags);
 	if (close) {
 		if (abrupt)
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.h b/drivers/infiniband/hw/cxgb3/iwch_cm.h
index 2bb7fbd..a2f1b78 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_cm.h
+++ b/drivers/infiniband/hw/cxgb3/iwch_cm.h
@@ -56,6 +56,7 @@
 #define put_ep(ep) { \
 	PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __func__, __LINE__,  \
 	     ep, atomic_read(&((ep)->kref.refcount))); \
+	WARN_ON(atomic_read(&((ep)->kref.refcount)) < 1); \
 	kref_put(&((ep)->kref), __free_ep); \
 }
 
diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c
index 8891c3b..6cd484e 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_qp.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c
@@ -832,6 +832,7 @@
 				abort=0;
 				disconnect = 1;
 				ep = qhp->ep;
+				get_ep(&ep->com);
 			}
 			flush_qp(qhp, &flag);
 			break;
@@ -848,6 +849,7 @@
 				abort=1;
 				disconnect = 1;
 				ep = qhp->ep;
+				get_ep(&ep->com);
 			}
 			goto err;
 			break;
@@ -929,8 +931,10 @@
 	 * on the EP.  This can be a normal close (RTS->CLOSING) or
 	 * an abnormal close (RTS/CLOSING->ERROR).
 	 */
-	if (disconnect)
+	if (disconnect) {
 		iwch_ep_disconnect(ep, abort, GFP_KERNEL);
+		put_ep(&ep->com);
+	}
 
 	/*
 	 * If free is 1, then we've disassociated the EP from the QP