IPoIB: Connected mode experimental support

The following patch adds experimental support for IPoIB connected
mode, as defined by the draft from the IETF ipoib working group.  The
idea is to increase performance by increasing the MTU from the maximum
of 2K (theoretically 4K) supported by IPoIB on top of UD.  With this
code, I'm able to get 800MByte/sec or more with netperf without
options on a Mellanox 4x back-to-back DDR system.

Some notes on code:
1. SRQ is used for scalability to large cluster sizes
2. Only RC connections are used (UC does not support SRQ now)
3. Retry count is set to 0 since spec draft warns against retries
4. Each connection is used for data transfers in only 1 direction, so
   each connection is either active(TX) or passive (RX).  2 sides that
   want to communicate create 2 connections.
5. Each active (TX) connection has a separate CQ for send completions -
   this keeps the code simple without CQ resize and other tricks
6. To detect stale passive side connections (where the remote side is
   down), we keep an LRU list of passive connections (updated once per
   second per connection) and destroy a connection after it has been
   unused for several seconds. The LRU rule makes it possible to avoid
   scanning connections that have recently been active.

Signed-off-by: Michael S. Tsirkin <mst@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index af5ee2e..18d27fd 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -49,8 +49,6 @@
 
 #include <net/dst.h>
 
-#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff)
-
 MODULE_AUTHOR("Roland Dreier");
 MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
 MODULE_LICENSE("Dual BSD/GPL");
@@ -145,6 +143,8 @@
 
 	netif_stop_queue(dev);
 
+	clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
+
 	/*
 	 * Now flush workqueue to make sure a scheduled task doesn't
 	 * bring our internal state back up.
@@ -178,8 +178,18 @@
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 
-	if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN)
+	/* dev->mtu > 2K ==> connected mode */
+	if (ipoib_cm_admin_enabled(dev) && new_mtu <= IPOIB_CM_MTU) {
+		if (new_mtu > priv->mcast_mtu)
+			ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
+				   priv->mcast_mtu);
+		dev->mtu = new_mtu;
+		return 0;
+	}
+
+	if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN) {
 		return -EINVAL;
+	}
 
 	priv->admin_mtu = new_mtu;
 
@@ -414,6 +424,20 @@
 			memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
 			       sizeof(union ib_gid));
 
+			if (ipoib_cm_enabled(dev, neigh->neighbour)) {
+				if (!ipoib_cm_get(neigh))
+					ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
+									       path,
+									       neigh));
+				if (!ipoib_cm_get(neigh)) {
+					list_del(&neigh->list);
+					if (neigh->ah)
+						ipoib_put_ah(neigh->ah);
+					ipoib_neigh_free(dev, neigh);
+					continue;
+				}
+			}
+
 			while ((skb = __skb_dequeue(&neigh->queue)))
 				__skb_queue_tail(&skqueue, skb);
 		}
@@ -520,7 +544,25 @@
 		memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
 		       sizeof(union ib_gid));
 
-		ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha));
+		if (ipoib_cm_enabled(dev, neigh->neighbour)) {
+			if (!ipoib_cm_get(neigh))
+				ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
+			if (!ipoib_cm_get(neigh)) {
+				list_del(&neigh->list);
+				if (neigh->ah)
+					ipoib_put_ah(neigh->ah);
+				ipoib_neigh_free(dev, neigh);
+				goto err_drop;
+			}
+			if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
+				__skb_queue_tail(&neigh->queue, skb);
+			else {
+				ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
+					   skb_queue_len(&neigh->queue));
+				goto err_drop;
+			}
+		} else
+			ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha));
 	} else {
 		neigh->ah  = NULL;
 
@@ -538,6 +580,7 @@
 
 err_path:
 	ipoib_neigh_free(dev, neigh);
+err_drop:
 	++priv->stats.tx_dropped;
 	dev_kfree_skb_any(skb);
 
@@ -640,7 +683,12 @@
 
 		neigh = *to_ipoib_neigh(skb->dst->neighbour);
 
-		if (likely(neigh->ah)) {
+		if (ipoib_cm_get(neigh)) {
+			if (ipoib_cm_up(neigh)) {
+				ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
+				goto out;
+			}
+		} else if (neigh->ah) {
 			if (unlikely(memcmp(&neigh->dgid.raw,
 					    skb->dst->neighbour->ha + 4,
 					    sizeof(union ib_gid)))) {
@@ -805,6 +853,7 @@
 	neigh->neighbour = neighbour;
 	*to_ipoib_neigh(neighbour) = neigh;
 	skb_queue_head_init(&neigh->queue);
+	ipoib_cm_set(neigh, NULL);
 
 	return neigh;
 }
@@ -818,6 +867,8 @@
 		++priv->stats.tx_dropped;
 		dev_kfree_skb_any(skb);
 	}
+	if (ipoib_cm_get(neigh))
+		ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
 	kfree(neigh);
 }
 
@@ -1080,6 +1131,8 @@
 
 	ipoib_create_debug_files(priv->dev);
 
+	if (ipoib_cm_add_mode_attr(priv->dev))
+		goto sysfs_failed;
 	if (ipoib_add_pkey_attr(priv->dev))
 		goto sysfs_failed;
 	if (device_create_file(&priv->dev->dev, &dev_attr_create_child))