IPoIB: Connected mode experimental support

The following patch adds experimental support for IPoIB connected mode,
as defined by the draft from the IETF ipoib working group.  The idea is
to increase performance by raising the MTU beyond the 2K (theoretically
4K) maximum supported by IPoIB on top of UD.  With this code, I'm able
to get 800 MByte/sec or more with netperf without options on a Mellanox
4x back-to-back DDR system.
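
For reference, here is a rough illustration (not code from the patch) of
where the UD-mode ceiling comes from: the datagram payload is bounded by
the IB path MTU, 4 bytes of which go to the IPoIB encapsulation header,
while an RC connection can carry far larger messages, so connected mode
can advertise a much bigger MTU to the network stack.

	#include <rdma/ib_verbs.h>

	/*
	 * Illustration only: with a 2K link MTU the UD-mode device MTU
	 * tops out at 2044 bytes (4092 with a 4K link MTU).
	 */
	#define IPOIB_ENCAP_LEN 4

	static int ipoib_ud_mtu_limit(enum ib_mtu path_mtu)
	{
		return ib_mtu_enum_to_int(path_mtu) - IPOIB_ENCAP_LEN;
	}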

Some notes on the code:
1. SRQ is used for scalability to large cluster sizes (see the SRQ
   sketch after this list)
2. Only RC connections are used (UC does not currently support SRQ)
3. The retry count is set to 0, since the spec draft warns against
   retries (see the connection setup sketch after this list)
4. Each connection is used for data transfers in only one direction,
   so each connection is either active (TX) or passive (RX).  Two sides
   that want to communicate create two connections.
5. Each active (TX) connection has a separate CQ for send completions -
   this keeps the code simple without CQ resize and other tricks
6. To detect stale passive side connections (where the remote side is
   down), we keep an LRU list of passive connections (updated once per
   second per connection) and destroy a connection after it has been
   unused for several seconds (see the reaping sketch after this list).
   The LRU rule makes it possible to avoid scanning connections that
   have recently been active.
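
To make note 1 concrete, here is a minimal sketch of sharing one SRQ
among all passive (RX) connections; the names and sizes are
illustrative, not lifted from the patch:

	#include <rdma/ib_verbs.h>

	/*
	 * Sketch only: a single SRQ supplies receive buffers to every
	 * passive RC connection, so no per-connection receive ring is
	 * needed as the cluster grows.  cm_srq_size and the single-SGE
	 * receives are assumptions made for the illustration.
	 */
	static struct ib_srq *create_shared_rx_srq(struct ib_pd *pd, int cm_srq_size)
	{
		struct ib_srq_init_attr srq_init_attr = {
			.attr = {
				.max_wr	 = cm_srq_size,
				.max_sge = 1,
			},
		};

		return ib_create_srq(pd, &srq_init_attr);
	}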
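
Notes 2 and 3 together mean the active side simply brings up an RC
connection through the IB CM with retries disabled.  A hedged sketch
(the service id, path record and QP are assumed to be set up elsewhere):

	#include <rdma/ib_cm.h>
	#include <rdma/ib_sa.h>

	/* Sketch only: CM REQ for an RC QP with both retry counts forced to 0. */
	static int send_rc_connect_req(struct ib_cm_id *id, __be64 service_id,
				       struct ib_sa_path_rec *path, struct ib_qp *qp)
	{
		struct ib_cm_req_param req = {
			.primary_path	 = path,
			.service_id	 = service_id,
			.qp_num		 = qp->qp_num,
			.qp_type	 = qp->qp_type,		/* IB_QPT_RC */
			.retry_count	 = 0,	/* draft warns against retries */
			.rnr_retry_count = 0,
		};

		return ib_send_cm_req(id, &req);
	}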
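
Finally, a sketch of the reaping logic from note 6 (the structure and
helper are illustrative, not the patch's own types): a periodic task
only ever looks at the cold end of the LRU list, so connections that
have seen recent traffic are never touched.

	#include <linux/jiffies.h>
	#include <linux/list.h>

	/* Illustrative connection tracking structure. */
	struct passive_conn {
		struct list_head list;		/* link on the passive-side LRU */
		unsigned long	 last_used;	/* jiffies of last RX activity */
	};

	/*
	 * Walk the LRU from the oldest entry (the tail) and destroy
	 * connections idle longer than 'timeout'; stop at the first
	 * recently used entry, since everything nearer the head is newer.
	 */
	static void reap_stale_passive_conns(struct list_head *lru,
					     unsigned long timeout,
					     void (*destroy)(struct passive_conn *))
	{
		struct passive_conn *p;

		while (!list_empty(lru)) {
			p = list_entry(lru->prev, struct passive_conn, list);
			if (time_before(jiffies, p->last_used + timeout))
				break;
			list_del(&p->list);
			destroy(p);
		}
	}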

Signed-off-by: Michael S. Tsirkin <mst@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 59d9594..f2aa923 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -50,8 +50,6 @@
 		 "Enable data path debug tracing if > 0");
 #endif
 
-#define	IPOIB_OP_RECV	(1ul << 31)
-
 static DEFINE_MUTEX(pkey_mutex);
 
 struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
@@ -268,10 +266,11 @@
 
 	spin_lock_irqsave(&priv->tx_lock, flags);
 	++priv->tx_tail;
-	if (netif_queue_stopped(dev) &&
-	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags) &&
-	    priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1)
+	if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags)) &&
+	    priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) {
+		clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
 		netif_wake_queue(dev);
+	}
 	spin_unlock_irqrestore(&priv->tx_lock, flags);
 
 	if (wc->status != IB_WC_SUCCESS &&
@@ -283,7 +282,9 @@
 
 static void ipoib_ib_handle_wc(struct net_device *dev, struct ib_wc *wc)
 {
-	if (wc->wr_id & IPOIB_OP_RECV)
+	if (wc->wr_id & IPOIB_CM_OP_SRQ)
+		ipoib_cm_handle_rx_wc(dev, wc);
+	else if (wc->wr_id & IPOIB_OP_RECV)
 		ipoib_ib_handle_rx_wc(dev, wc);
 	else
 		ipoib_ib_handle_tx_wc(dev, wc);
@@ -327,12 +328,12 @@
 	struct ipoib_tx_buf *tx_req;
 	u64 addr;
 
-	if (unlikely(skb->len > dev->mtu + INFINIBAND_ALEN)) {
+	if (unlikely(skb->len > priv->mcast_mtu + INFINIBAND_ALEN)) {
 		ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
-			   skb->len, dev->mtu + INFINIBAND_ALEN);
+			   skb->len, priv->mcast_mtu + INFINIBAND_ALEN);
 		++priv->stats.tx_dropped;
 		++priv->stats.tx_errors;
-		dev_kfree_skb_any(skb);
+		ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu);
 		return;
 	}
 
@@ -372,6 +373,7 @@
 		if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) {
 			ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
 			netif_stop_queue(dev);
+			set_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
 		}
 	}
 }
@@ -424,6 +426,13 @@
 		return -1;
 	}
 
+	ret = ipoib_cm_dev_open(dev);
+	if (ret) {
+		ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret);
+		ipoib_ib_dev_stop(dev);
+		return -1;
+	}
+
 	clear_bit(IPOIB_STOP_REAPER, &priv->flags);
 	queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ);
 
@@ -509,6 +518,8 @@
 
 	clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
 
+	ipoib_cm_dev_stop(dev);
+
 	/*
 	 * Move our QP to the error state and then reinitialize in
 	 * when all work requests have completed or have been flushed.