tcp: add reordering timer in RACK loss detection
This patch makes RACK install a reordering timer when it suspects
some packets might be lost, but wants to delay the decision
a little bit to accomodate reordering.
It does not create a new timer but instead repurposes the existing
RTO timer, because both are meant to retransmit packets.
Specifically it arms a timer ICSK_TIME_REO_TIMEOUT when
the RACK timing check fails. The wait time is set to
RACK.RTT + RACK.reo_wnd - (NOW - Packet.xmit_time) + fudge
This translates to expecting a packet (Packet) should take
(RACK.RTT + RACK.reo_wnd + fudge) to deliver after it was sent.
When there are multiple packets that need a timer, we use one timer
with the maximum timeout. Therefore the timer conservatively uses
the maximum window to expire N packets by one timeout, instead of
N timeouts to expire N packets sent at different times.
The fudge factor is 2 jiffies to ensure when the timer fires, all
the suspected packets would exceed the deadline and be marked lost
by tcp_rack_detect_loss(). It has to be at least 1 jiffy because the
clock may tick between calling icsk_reset_xmit_timer(timeout) and
actually hang the timer. The next jiffy is to lower-bound the timeout
to 2 jiffies when reo_wnd is < 1ms.
When the reordering timer fires (tcp_rack_reo_timeout): If we aren't
in Recovery we'll enter fast recovery and force fast retransmit.
This is very similar to the early retransmit (RFC5827) except RACK
is not constrained to only enter recovery for small outstanding
flights.
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 85ee387..84b2edd 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -144,6 +144,7 @@
#define ICSK_TIME_PROBE0 3 /* Zero window probe timer */
#define ICSK_TIME_EARLY_RETRANS 4 /* Early retransmit timer */
#define ICSK_TIME_LOSS_PROBE 5 /* Tail loss probe timer */
+#define ICSK_TIME_REO_TIMEOUT 6 /* Reordering timer */
static inline struct inet_connection_sock *inet_csk(const struct sock *sk)
{
@@ -234,7 +235,8 @@
}
if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 ||
- what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE) {
+ what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE ||
+ what == ICSK_TIME_REO_TIMEOUT) {
icsk->icsk_pending = what;
icsk->icsk_timeout = jiffies + when;
sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 1439107..64fcdeb3 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -143,6 +143,7 @@
#define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes
* for local resources.
*/
+#define TCP_REO_TIMEOUT_MIN (2000) /* Min RACK reordering timeout in usec */
#define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */
#define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */
@@ -397,6 +398,7 @@
int tcp_child_process(struct sock *parent, struct sock *child,
struct sk_buff *skb);
void tcp_enter_loss(struct sock *sk);
+void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag);
void tcp_clear_retrans(struct tcp_sock *tp);
void tcp_update_metrics(struct sock *sk);
void tcp_init_metrics(struct sock *sk);
@@ -541,6 +543,7 @@
void tcp_retransmit_timer(struct sock *sk);
void tcp_xmit_retransmit_queue(struct sock *);
void tcp_simple_retransmit(struct sock *);
+void tcp_enter_recovery(struct sock *sk, bool ece_ack);
int tcp_trim_head(struct sock *, struct sk_buff *, u32);
int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t);
@@ -1867,6 +1870,7 @@
extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
const struct skb_mstamp *xmit_time,
const struct skb_mstamp *ack_time);
+extern void tcp_rack_reo_timeout(struct sock *sk);
/*
* Save and compile IPv4 options, return a pointer to it
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 4dea33e..d216e40 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -216,6 +216,7 @@
if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
r->idiag_timer = 1;
r->idiag_retrans = icsk->icsk_retransmits;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8ccd171..be11918 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2522,8 +2522,7 @@
tcp_ecn_queue_cwr(tp);
}
-static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
- int flag)
+void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
int sndcnt = 0;
@@ -2691,7 +2690,7 @@
}
EXPORT_SYMBOL(tcp_simple_retransmit);
-static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
+void tcp_enter_recovery(struct sock *sk, bool ece_ack)
{
struct tcp_sock *tp = tcp_sk(sk);
int mib_idx;
@@ -3031,6 +3030,7 @@
u32 rto = inet_csk(sk)->icsk_rto;
/* Offset the time elapsed after installing regular RTO */
if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
struct sk_buff *skb = tcp_write_queue_head(sk);
const u32 rto_time_stamp =
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 56d756e..ebf3e0c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2230,6 +2230,7 @@
if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1;
timer_expires = icsk->icsk_timeout;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 1d5331a..0ba9026 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2960,7 +2960,8 @@
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += tcp_skb_pcount(skb);
- if (skb == tcp_write_queue_head(sk))
+ if (skb == tcp_write_queue_head(sk) &&
+ icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
inet_csk(sk)->icsk_rto,
TCP_RTO_MAX);
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 557363c..eb39b1b 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -32,19 +32,18 @@
* The current version is only used after recovery starts but can be
* easily extended to detect the first loss.
*/
-static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now)
+static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now,
+ u32 *reo_timeout)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
u32 reo_wnd;
+ *reo_timeout = 0;
/* To be more reordering resilient, allow min_rtt/4 settling delay
* (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
* RTT because reordering is often a path property and less related
* to queuing or delayed ACKs.
- *
- * TODO: measure and adapt to the observed reordering delay, and
- * use a timer to retransmit like the delayed early retransmit.
*/
reo_wnd = 1000;
if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
@@ -66,10 +65,23 @@
* A packet is lost if its elapsed time is beyond
* the recent RTT plus the reordering window.
*/
- if (skb_mstamp_us_delta(now, &skb->skb_mstamp) >
- tp->rack.rtt_us + reo_wnd) {
+ u32 elapsed = skb_mstamp_us_delta(now,
+ &skb->skb_mstamp);
+ s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;
+
+ if (remaining < 0) {
tcp_rack_mark_skb_lost(sk, skb);
+ continue;
}
+
+ /* Skip ones marked lost but not yet retransmitted */
+ if ((scb->sacked & TCPCB_LOST) &&
+ !(scb->sacked & TCPCB_SACKED_RETRANS))
+ continue;
+
+ /* Record maximum wait time (+1 to avoid 0) */
+ *reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
+
} else if (!(scb->sacked & TCPCB_RETRANS)) {
/* Original data are sent sequentially so stop early
* b/c the rest are all sent after rack_sent
@@ -82,12 +94,19 @@
void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now)
{
struct tcp_sock *tp = tcp_sk(sk);
+ u32 timeout;
if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
return;
+
/* Reset the advanced flag to avoid unnecessary queue scanning */
tp->rack.advanced = 0;
- tcp_rack_detect_loss(sk, now);
+ tcp_rack_detect_loss(sk, now, &timeout);
+ if (timeout) {
+ timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
+ timeout, inet_csk(sk)->icsk_rto);
+ }
}
/* Record the most recently (re)sent time among the (s)acked packets
@@ -123,3 +142,27 @@
tp->rack.mstamp = *xmit_time;
tp->rack.advanced = 1;
}
+
+/* We have waited long enough to accommodate reordering. Mark the expired
+ * packets lost and retransmit them.
+ */
+void tcp_rack_reo_timeout(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct skb_mstamp now;
+ u32 timeout, prior_inflight;
+
+ skb_mstamp_get(&now);
+ prior_inflight = tcp_packets_in_flight(tp);
+ tcp_rack_detect_loss(sk, &now, &timeout);
+ if (prior_inflight != tcp_packets_in_flight(tp)) {
+ if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
+ tcp_enter_recovery(sk, false);
+ if (!inet_csk(sk)->icsk_ca_ops->cong_control)
+ tcp_cwnd_reduction(sk, 1, 0);
+ }
+ tcp_xmit_retransmit_queue(sk);
+ }
+ if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
+ tcp_rearm_rto(sk);
+}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 29a9bd5..953c02a 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -563,6 +563,9 @@
event = icsk->icsk_pending;
switch (event) {
+ case ICSK_TIME_REO_TIMEOUT:
+ tcp_rack_reo_timeout(sk);
+ break;
case ICSK_TIME_EARLY_RETRANS:
tcp_resume_early_retransmit(sk);
break;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 228965d..f52c374 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1746,6 +1746,7 @@
if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1;
timer_expires = icsk->icsk_timeout;