[TCP]: MTU probing

Implementation of packetization layer path mtu discovery for TCP, based on
the internet-draft currently found at
<http://www.ietf.org/internet-drafts/draft-ietf-pmtud-method-05.txt>.

Signed-off-by: John Heffner <jheffner@psc.edu>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9f498a6..8197b5e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -51,6 +51,12 @@
  */
 int sysctl_tcp_tso_win_divisor = 3;
 
+int sysctl_tcp_mtu_probing = 0;
+int sysctl_tcp_base_mss = 512;
+
+EXPORT_SYMBOL(sysctl_tcp_mtu_probing);
+EXPORT_SYMBOL(sysctl_tcp_base_mss);
+
 static void update_send_head(struct sock *sk, struct tcp_sock *tp,
 			     struct sk_buff *skb)
 {
@@ -681,6 +687,62 @@
 	return 0;
 }
 
+/* Not accounting for SACKs here. */
+int tcp_mtu_to_mss(struct sock *sk, int pmtu)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int mss_now;
+
+	/* Calculate base mss without TCP options:
+	   It is MMS_S - sizeof(tcphdr) of rfc1122
+	 */
+	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
+
+	/* Clamp it (mss_clamp does not include tcp options) */
+	if (mss_now > tp->rx_opt.mss_clamp)
+		mss_now = tp->rx_opt.mss_clamp;
+
+	/* Now subtract optional transport overhead */
+	mss_now -= icsk->icsk_ext_hdr_len;
+
+	/* Then reserve room for full set of TCP options and 8 bytes of data */
+	if (mss_now < 48)
+		mss_now = 48;
+
+	/* Now subtract TCP options size, not including SACKs */
+	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+
+	return mss_now;
+}
+
+/* Inverse of above */
+int tcp_mss_to_mtu(struct sock *sk, int mss)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	int mtu;
+
+	mtu = mss +
+	      tp->tcp_header_len +
+	      icsk->icsk_ext_hdr_len +
+	      icsk->icsk_af_ops->net_header_len;
+
+	return mtu;
+}
+
+void tcp_mtup_init(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
+	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
+	                       icsk->icsk_af_ops->net_header_len;
+	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
+	icsk->icsk_mtup.probe_size = 0;
+}
+
 /* This function synchronize snd mss to current pmtu/exthdr set.
 
    tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
@@ -708,25 +770,12 @@
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
-	/* Calculate base mss without TCP options:
-	   It is MMS_S - sizeof(tcphdr) of rfc1122
-	 */
-	int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
-		       sizeof(struct tcphdr));
+	int mss_now;
 
-	/* Clamp it (mss_clamp does not include tcp options) */
-	if (mss_now > tp->rx_opt.mss_clamp)
-		mss_now = tp->rx_opt.mss_clamp;
+	if (icsk->icsk_mtup.search_high > pmtu)
+		icsk->icsk_mtup.search_high = pmtu;
 
-	/* Now subtract optional transport overhead */
-	mss_now -= icsk->icsk_ext_hdr_len;
-
-	/* Then reserve room for full set of TCP options and 8 bytes of data */
-	if (mss_now < 48)
-		mss_now = 48;
-
-	/* Now subtract TCP options size, not including SACKs */
-	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+	mss_now = tcp_mtu_to_mss(sk, pmtu);
 
 	/* Bound mss with half of window */
 	if (tp->max_window && mss_now > (tp->max_window>>1))
@@ -734,6 +783,8 @@
 
 	/* And store cached results */
 	icsk->icsk_pmtu_cookie = pmtu;
+	if (icsk->icsk_mtup.enabled)
+		mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
 	tp->mss_cache = mss_now;
 
 	return mss_now;
@@ -1063,6 +1114,140 @@
 	return 1;
 }
 
+/* Create a new MTU probe if we are ready.
+ * Returns 0 if we should wait to probe (no cwnd available),
+ *         1 if a probe was sent,
+ *         -1 otherwise */
+static int tcp_mtu_probe(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct sk_buff *skb, *nskb, *next;
+	int len;
+	int probe_size;
+	unsigned int pif;
+	int copy;
+	int mss_now;
+
+	/* Not currently probing/verifying,
+	 * not in recovery,
+	 * have enough cwnd, and
+	 * not SACKing (the variable headers throw things off) */
+	if (!icsk->icsk_mtup.enabled ||
+	    icsk->icsk_mtup.probe_size ||
+	    inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
+	    tp->snd_cwnd < 11 ||
+	    tp->rx_opt.eff_sacks)
+		return -1;
+
+	/* Very simple search strategy: just double the MSS. */
+	mss_now = tcp_current_mss(sk, 0);
+	probe_size = 2*tp->mss_cache;
+	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
+		/* TODO: set timer for probe_converge_event */
+		return -1;
+	}
+
+	/* Have enough data in the send queue to probe? */
+	len = 0;
+	if ((skb = sk->sk_send_head) == NULL)
+		return -1;
+	while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb))
+		skb = skb->next;
+	if (len < probe_size)
+		return -1;
+
+	/* Receive window check. */
+	if (after(TCP_SKB_CB(skb)->seq + probe_size, tp->snd_una + tp->snd_wnd)) {
+		if (tp->snd_wnd < probe_size)
+			return -1;
+		else
+			return 0;
+	}
+
+	/* Do we need to wait to drain cwnd? */
+	pif = tcp_packets_in_flight(tp);
+	if (pif + 2 > tp->snd_cwnd) {
+		/* With no packets in flight, don't stall. */
+		if (pif == 0)
+			return -1;
+		else
+			return 0;
+	}
+
+	/* We're allowed to probe.  Build it now. */
+	if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
+		return -1;
+	sk_charge_skb(sk, nskb);
+
+	skb = sk->sk_send_head;
+	__skb_insert(nskb, skb->prev, skb, &sk->sk_write_queue);
+	sk->sk_send_head = nskb;
+
+	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
+	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
+	TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
+	TCP_SKB_CB(nskb)->sacked = 0;
+	nskb->csum = 0;
+	if (skb->ip_summed == CHECKSUM_HW)
+		nskb->ip_summed = CHECKSUM_HW;
+
+	len = 0;
+	while (len < probe_size) {
+		next = skb->next;
+
+		copy = min_t(int, skb->len, probe_size - len);
+		if (nskb->ip_summed)
+			skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
+		else
+			nskb->csum = skb_copy_and_csum_bits(skb, 0,
+			                 skb_put(nskb, copy), copy, nskb->csum);
+
+		if (skb->len <= copy) {
+			/* We've eaten all the data from this skb.
+			 * Throw it away. */
+			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
+			__skb_unlink(skb, &sk->sk_write_queue);
+			sk_stream_free_skb(sk, skb);
+		} else {
+			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
+			                           ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
+			if (!skb_shinfo(skb)->nr_frags) {
+				skb_pull(skb, copy);
+				if (skb->ip_summed != CHECKSUM_HW)
+					skb->csum = csum_partial(skb->data, skb->len, 0);
+			} else {
+				__pskb_trim_head(skb, copy);
+				tcp_set_skb_tso_segs(sk, skb, mss_now);
+			}
+			TCP_SKB_CB(skb)->seq += copy;
+		}
+
+		len += copy;
+		skb = next;
+	}
+	tcp_init_tso_segs(sk, nskb, nskb->len);
+
+	/* We're ready to send.  If this fails, the probe will
+	 * be resegmented into mss-sized pieces by tcp_write_xmit(). */
+	TCP_SKB_CB(nskb)->when = tcp_time_stamp;
+	if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
+		/* Decrement cwnd here because we are sending
+		* effectively two packets. */
+		tp->snd_cwnd--;
+		update_send_head(sk, tp, nskb);
+
+		icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
+		icsk->icsk_mtup.probe_seq_start = TCP_SKB_CB(nskb)->seq;
+		icsk->icsk_mtup.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
+
+		return 1;
+	}
+
+	return -1;
+}
+
+
 /* This routine writes packets to the network.  It advances the
  * send_head.  This happens as incoming acks open up the remote
  * window for us.
@@ -1076,6 +1261,7 @@
 	struct sk_buff *skb;
 	unsigned int tso_segs, sent_pkts;
 	int cwnd_quota;
+	int result;
 
 	/* If we are closed, the bytes will have to remain here.
 	 * In time closedown will finish, we empty the write queue and all
@@ -1085,6 +1271,14 @@
 		return 0;
 
 	sent_pkts = 0;
+
+	/* Do MTU probing. */
+	if ((result = tcp_mtu_probe(sk)) == 0) {
+		return 0;
+	} else if (result > 0) {
+		sent_pkts = 1;
+	}
+
 	while ((skb = sk->sk_send_head)) {
 		unsigned int limit;
 
@@ -1455,9 +1649,15 @@
 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_connection_sock *icsk = inet_csk(sk);
  	unsigned int cur_mss = tcp_current_mss(sk, 0);
 	int err;
 
+	/* Inconslusive MTU probe */
+	if (icsk->icsk_mtup.probe_size) {
+		icsk->icsk_mtup.probe_size = 0;
+	}
+
 	/* Do not sent more than we queued. 1/4 is reserved for possible
 	 * copying overhead: fragmentation, tunneling, mangling etc.
 	 */
@@ -1883,6 +2083,7 @@
 	if (tp->rx_opt.user_mss)
 		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
 	tp->max_window = 0;
+	tcp_mtup_init(sk);
 	tcp_sync_mss(sk, dst_mtu(dst));
 
 	if (!tp->window_clamp)
@@ -2180,3 +2381,4 @@
 EXPORT_SYMBOL(tcp_simple_retransmit);
 EXPORT_SYMBOL(tcp_sync_mss);
 EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor);
+EXPORT_SYMBOL(tcp_mtup_init);