| /* |
| * TCP NV: TCP with Congestion Avoidance |
| * |
| * TCP-NV is a successor of TCP-Vegas that has been developed to |
| * deal with the issues that occur in modern networks. |
| * Like TCP-Vegas, TCP-NV supports true congestion avoidance, |
| * the ability to detect congestion before packet losses occur. |
| * When congestion (queue buildup) starts to occur, TCP-NV |
| * predicts what the cwnd size should be for the current |
| * throughput and it reduces the cwnd proportionally to |
| * the difference between the current cwnd and the predicted cwnd. |
| * |
| * NV is only recommended for traffic within a data center, and when |
| * all the flows are NV (at least those within the data center). This |
| * is due to the inherent unfairness between flows using losses to |
| * detect congestion (congestion control) and those that use queue |
| * buildup to detect congestion (congestion avoidance). |
| * |
| * Note: High NIC coalescence values may lower the performance of NV |
| * due to the increased noise in RTT values. In particular, we have |
| * seen issues with rx-frames values greater than 8. |
| * |
| * TODO: |
| * 1) Add mechanism to deal with reverse congestion. |
| */ |
| |
| #include <linux/mm.h> |
| #include <linux/module.h> |
| #include <linux/math64.h> |
| #include <net/tcp.h> |
| #include <linux/inet_diag.h> |
| |
| /* TCP NV parameters |
| * |
| * nv_pad Max number of queued packets allowed in network |
| * nv_pad_buffer Do not grow cwnd if this close to nv_pad |
| * nv_reset_period How often (in seconds) to reset min_rtt |
| * nv_min_cwnd Don't decrease cwnd below this if there are no losses |
| * nv_cong_dec_mult Decrease cwnd by X% (30%) of congestion when detected |
| * nv_ssthresh_factor On congestion set ssthresh to this * <desired cwnd> / 8 |
| * nv_rtt_factor RTT averaging factor |
| * nv_loss_dec_factor Decrease cwnd to this (80%) when losses occur |
| * nv_dec_eval_min_calls Wait this many RTT measurements before dec cwnd |
| * nv_inc_eval_min_calls Wait this many RTT measurements before inc cwnd |
| * nv_ssthresh_eval_min_calls Wait this many RTT measurements before stopping |
| * slow-start due to congestion |
| * nv_stop_rtt_cnt Only grow cwnd for this many RTTs after non-congestion |
| * nv_rtt_min_cnt Wait these many RTTs before making congestion decision |
| * nv_cwnd_growth_rate_neg |
| * nv_cwnd_growth_rate_pos |
| * How quickly to double growth rate (not rate) of cwnd when not |
| * congested. One value (nv_cwnd_growth_rate_neg) for when |
| * rate < 1 pkt/RTT (after losses). The other (nv_cwnd_growth_rate_pos) |
| * otherwise. |
| */ |
| |
| static int nv_pad __read_mostly = 10; |
| static int nv_pad_buffer __read_mostly = 2; |
| static int nv_reset_period __read_mostly = 5; /* in seconds */ |
| static int nv_min_cwnd __read_mostly = 2; |
| static int nv_cong_dec_mult __read_mostly = 30 * 128 / 100; /* = 30% */ |
| static int nv_ssthresh_factor __read_mostly = 8; /* = 1 */ |
| static int nv_rtt_factor __read_mostly = 128; /* = 1/2*old + 1/2*new */ |
| static int nv_loss_dec_factor __read_mostly = 819; /* => 80% */ |
| static int nv_cwnd_growth_rate_neg __read_mostly = 8; |
| static int nv_cwnd_growth_rate_pos __read_mostly; /* 0 => fixed like Reno */ |
| static int nv_dec_eval_min_calls __read_mostly = 60; |
| static int nv_inc_eval_min_calls __read_mostly = 20; |
| static int nv_ssthresh_eval_min_calls __read_mostly = 30; |
| static int nv_stop_rtt_cnt __read_mostly = 10; |
| static int nv_rtt_min_cnt __read_mostly = 2; |
| |
| module_param(nv_pad, int, 0644); |
| MODULE_PARM_DESC(nv_pad, "max queued packets allowed in network"); |
| module_param(nv_reset_period, int, 0644); |
| MODULE_PARM_DESC(nv_reset_period, "nv_min_rtt reset period (secs)"); |
| module_param(nv_min_cwnd, int, 0644); |
| MODULE_PARM_DESC(nv_min_cwnd, "NV will not decrease cwnd below this value" |
| " without losses"); |
| |
| /* TCP NV Parameters */ |
| struct tcpnv { |
| unsigned long nv_min_rtt_reset_jiffies; /* when to switch to |
| * nv_min_rtt_new */ |
| s8 cwnd_growth_factor; /* Current cwnd growth factor, |
| * < 0 => less than 1 packet/RTT */ |
| u8 available8; |
| u16 available16; |
| u8 nv_allow_cwnd_growth:1, /* whether cwnd can grow */ |
| nv_reset:1, /* whether to reset values */ |
| nv_catchup:1; /* whether we are growing because |
| * of temporary cwnd decrease */ |
| u8 nv_eval_call_cnt; /* call count since last eval */ |
| u8 nv_min_cwnd; /* nv won't make a ca decision if cwnd is |
| * smaller than this. It may grow to handle |
| * TSO, LRO and interrupt coalescence because |
| * with these a small cwnd cannot saturate |
| * the link. Note that this is different from |
| * the file local nv_min_cwnd */ |
| u8 nv_rtt_cnt; /* RTTs without making ca decision */; |
| u32 nv_last_rtt; /* last rtt */ |
| u32 nv_min_rtt; /* active min rtt. Used to determine slope */ |
| u32 nv_min_rtt_new; /* min rtt for future use */ |
| u32 nv_base_rtt; /* If non-zero it represents the threshold for |
| * congestion */ |
| u32 nv_lower_bound_rtt; /* Used in conjunction with nv_base_rtt. It is |
| * set to 80% of nv_base_rtt. It helps reduce |
| * unfairness between flows */ |
| u32 nv_rtt_max_rate; /* max rate seen during current RTT */ |
| u32 nv_rtt_start_seq; /* current RTT ends when packet arrives |
| * acking beyond nv_rtt_start_seq */ |
| u32 nv_last_snd_una; /* Previous value of tp->snd_una. It is |
| * used to determine bytes acked since last |
| * call to bictcp_acked */ |
| u32 nv_no_cong_cnt; /* Consecutive no congestion decisions */ |
| }; |
| |
/* Sentinel stored in nv_min_rtt / nv_min_rtt_new before any sample is seen */
#define NV_INIT_RTT		U32_MAX
/* Starting value and floor of the per-socket ca->nv_min_cwnd */
#define NV_MIN_CWND		4
/* Increment applied to ca->nv_min_cwnd when it is found to be too small */
#define NV_MIN_CWND_GROW	2
/* ca->nv_min_cwnd is capped at NV_TSO_CWND_BOUND + 1 */
#define NV_TSO_CWND_BOUND	80
| |
| static inline void tcpnv_reset(struct tcpnv *ca, struct sock *sk) |
| { |
| struct tcp_sock *tp = tcp_sk(sk); |
| |
| ca->nv_reset = 0; |
| ca->nv_no_cong_cnt = 0; |
| ca->nv_rtt_cnt = 0; |
| ca->nv_last_rtt = 0; |
| ca->nv_rtt_max_rate = 0; |
| ca->nv_rtt_start_seq = tp->snd_una; |
| ca->nv_eval_call_cnt = 0; |
| ca->nv_last_snd_una = tp->snd_una; |
| } |
| |
| static void tcpnv_init(struct sock *sk) |
| { |
| struct tcpnv *ca = inet_csk_ca(sk); |
| int base_rtt; |
| |
| tcpnv_reset(ca, sk); |
| |
| /* See if base_rtt is available from socket_ops bpf program. |
| * It is meant to be used in environments, such as communication |
| * within a datacenter, where we have reasonable estimates of |
| * RTTs |
| */ |
| base_rtt = tcp_call_bpf(sk, BPF_SOCK_OPS_BASE_RTT); |
| if (base_rtt > 0) { |
| ca->nv_base_rtt = base_rtt; |
| ca->nv_lower_bound_rtt = (base_rtt * 205) >> 8; /* 80% */ |
| } else { |
| ca->nv_base_rtt = 0; |
| ca->nv_lower_bound_rtt = 0; |
| } |
| |
| ca->nv_allow_cwnd_growth = 1; |
| ca->nv_min_rtt_reset_jiffies = jiffies + 2 * HZ; |
| ca->nv_min_rtt = NV_INIT_RTT; |
| ca->nv_min_rtt_new = NV_INIT_RTT; |
| ca->nv_min_cwnd = NV_MIN_CWND; |
| ca->nv_catchup = 0; |
| ca->cwnd_growth_factor = 0; |
| } |
| |
| /* If provided, apply upper (base_rtt) and lower (lower_bound_rtt) |
| * bounds to RTT. |
| */ |
| inline u32 nv_get_bounded_rtt(struct tcpnv *ca, u32 val) |
| { |
| if (ca->nv_lower_bound_rtt > 0 && val < ca->nv_lower_bound_rtt) |
| return ca->nv_lower_bound_rtt; |
| else if (ca->nv_base_rtt > 0 && val > ca->nv_base_rtt) |
| return ca->nv_base_rtt; |
| else |
| return val; |
| } |
| |
| static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked) |
| { |
| struct tcp_sock *tp = tcp_sk(sk); |
| struct tcpnv *ca = inet_csk_ca(sk); |
| u32 cnt; |
| |
| if (!tcp_is_cwnd_limited(sk)) |
| return; |
| |
| /* Only grow cwnd if NV has not detected congestion */ |
| if (!ca->nv_allow_cwnd_growth) |
| return; |
| |
| if (tcp_in_slow_start(tp)) { |
| acked = tcp_slow_start(tp, acked); |
| if (!acked) |
| return; |
| } |
| |
| if (ca->cwnd_growth_factor < 0) { |
| cnt = tp->snd_cwnd << -ca->cwnd_growth_factor; |
| tcp_cong_avoid_ai(tp, cnt, acked); |
| } else { |
| cnt = max(4U, tp->snd_cwnd >> ca->cwnd_growth_factor); |
| tcp_cong_avoid_ai(tp, cnt, acked); |
| } |
| } |
| |
| static u32 tcpnv_recalc_ssthresh(struct sock *sk) |
| { |
| const struct tcp_sock *tp = tcp_sk(sk); |
| |
| return max((tp->snd_cwnd * nv_loss_dec_factor) >> 10, 2U); |
| } |
| |
| static void tcpnv_state(struct sock *sk, u8 new_state) |
| { |
| struct tcpnv *ca = inet_csk_ca(sk); |
| |
| if (new_state == TCP_CA_Open && ca->nv_reset) { |
| tcpnv_reset(ca, sk); |
| } else if (new_state == TCP_CA_Loss || new_state == TCP_CA_CWR || |
| new_state == TCP_CA_Recovery) { |
| ca->nv_reset = 1; |
| ca->nv_allow_cwnd_growth = 0; |
| if (new_state == TCP_CA_Loss) { |
| /* Reset cwnd growth factor to Reno value */ |
| if (ca->cwnd_growth_factor > 0) |
| ca->cwnd_growth_factor = 0; |
| /* Decrease growth rate if allowed */ |
| if (nv_cwnd_growth_rate_neg > 0 && |
| ca->cwnd_growth_factor > -8) |
| ca->cwnd_growth_factor--; |
| } |
| } |
| } |
| |
| /* Do congestion avoidance calculations for TCP-NV |
| */ |
| static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) |
| { |
| const struct inet_connection_sock *icsk = inet_csk(sk); |
| struct tcp_sock *tp = tcp_sk(sk); |
| struct tcpnv *ca = inet_csk_ca(sk); |
| unsigned long now = jiffies; |
| u64 rate64; |
| u32 rate, max_win, cwnd_by_slope; |
| u32 avg_rtt; |
| u32 bytes_acked = 0; |
| |
| /* Some calls are for duplicates without timetamps */ |
| if (sample->rtt_us < 0) |
| return; |
| |
| /* If not in TCP_CA_Open or TCP_CA_Disorder states, skip. */ |
| if (icsk->icsk_ca_state != TCP_CA_Open && |
| icsk->icsk_ca_state != TCP_CA_Disorder) |
| return; |
| |
| /* Stop cwnd growth if we were in catch up mode */ |
| if (ca->nv_catchup && tp->snd_cwnd >= nv_min_cwnd) { |
| ca->nv_catchup = 0; |
| ca->nv_allow_cwnd_growth = 0; |
| } |
| |
| bytes_acked = tp->snd_una - ca->nv_last_snd_una; |
| ca->nv_last_snd_una = tp->snd_una; |
| |
| if (sample->in_flight == 0) |
| return; |
| |
| /* Calculate moving average of RTT */ |
| if (nv_rtt_factor > 0) { |
| if (ca->nv_last_rtt > 0) { |
| avg_rtt = (((u64)sample->rtt_us) * nv_rtt_factor + |
| ((u64)ca->nv_last_rtt) |
| * (256 - nv_rtt_factor)) >> 8; |
| } else { |
| avg_rtt = sample->rtt_us; |
| ca->nv_min_rtt = avg_rtt << 1; |
| } |
| ca->nv_last_rtt = avg_rtt; |
| } else { |
| avg_rtt = sample->rtt_us; |
| } |
| |
| /* rate in 100's bits per second */ |
| rate64 = ((u64)sample->in_flight) * 80000; |
| do_div(rate64, avg_rtt ?: 1); |
| rate = (u32)rate64; |
| |
| /* Remember the maximum rate seen during this RTT |
| * Note: It may be more than one RTT. This function should be |
| * called at least nv_dec_eval_min_calls times. |
| */ |
| if (ca->nv_rtt_max_rate < rate) |
| ca->nv_rtt_max_rate = rate; |
| |
| /* We have valid information, increment counter */ |
| if (ca->nv_eval_call_cnt < 255) |
| ca->nv_eval_call_cnt++; |
| |
| /* Apply bounds to rtt. Only used to update min_rtt */ |
| avg_rtt = nv_get_bounded_rtt(ca, avg_rtt); |
| |
| /* update min rtt if necessary */ |
| if (avg_rtt < ca->nv_min_rtt) |
| ca->nv_min_rtt = avg_rtt; |
| |
| /* update future min_rtt if necessary */ |
| if (avg_rtt < ca->nv_min_rtt_new) |
| ca->nv_min_rtt_new = avg_rtt; |
| |
| /* nv_min_rtt is updated with the minimum (possibley averaged) rtt |
| * seen in the last sysctl_tcp_nv_reset_period seconds (i.e. a |
| * warm reset). This new nv_min_rtt will be continued to be updated |
| * and be used for another sysctl_tcp_nv_reset_period seconds, |
| * when it will be updated again. |
| * In practice we introduce some randomness, so the actual period used |
| * is chosen randomly from the range: |
| * [sysctl_tcp_nv_reset_period*3/4, sysctl_tcp_nv_reset_period*5/4) |
| */ |
| if (time_after_eq(now, ca->nv_min_rtt_reset_jiffies)) { |
| unsigned char rand; |
| |
| ca->nv_min_rtt = ca->nv_min_rtt_new; |
| ca->nv_min_rtt_new = NV_INIT_RTT; |
| get_random_bytes(&rand, 1); |
| ca->nv_min_rtt_reset_jiffies = |
| now + ((nv_reset_period * (384 + rand) * HZ) >> 9); |
| /* Every so often we decrease ca->nv_min_cwnd in case previous |
| * value is no longer accurate. |
| */ |
| ca->nv_min_cwnd = max(ca->nv_min_cwnd / 2, NV_MIN_CWND); |
| } |
| |
| /* Once per RTT check if we need to do congestion avoidance */ |
| if (before(ca->nv_rtt_start_seq, tp->snd_una)) { |
| ca->nv_rtt_start_seq = tp->snd_nxt; |
| if (ca->nv_rtt_cnt < 0xff) |
| /* Increase counter for RTTs without CA decision */ |
| ca->nv_rtt_cnt++; |
| |
| /* If this function is only called once within an RTT |
| * the cwnd is probably too small (in some cases due to |
| * tso, lro or interrupt coalescence), so we increase |
| * ca->nv_min_cwnd. |
| */ |
| if (ca->nv_eval_call_cnt == 1 && |
| bytes_acked >= (ca->nv_min_cwnd - 1) * tp->mss_cache && |
| ca->nv_min_cwnd < (NV_TSO_CWND_BOUND + 1)) { |
| ca->nv_min_cwnd = min(ca->nv_min_cwnd |
| + NV_MIN_CWND_GROW, |
| NV_TSO_CWND_BOUND + 1); |
| ca->nv_rtt_start_seq = tp->snd_nxt + |
| ca->nv_min_cwnd * tp->mss_cache; |
| ca->nv_eval_call_cnt = 0; |
| ca->nv_allow_cwnd_growth = 1; |
| return; |
| } |
| |
| /* Find the ideal cwnd for current rate from slope |
| * slope = 80000.0 * mss / nv_min_rtt |
| * cwnd_by_slope = nv_rtt_max_rate / slope |
| */ |
| cwnd_by_slope = (u32) |
| div64_u64(((u64)ca->nv_rtt_max_rate) * ca->nv_min_rtt, |
| (u64)(80000 * tp->mss_cache)); |
| max_win = cwnd_by_slope + nv_pad; |
| |
| /* If cwnd > max_win, decrease cwnd |
| * if cwnd < max_win, grow cwnd |
| * else leave the same |
| */ |
| if (tp->snd_cwnd > max_win) { |
| /* there is congestion, check that it is ok |
| * to make a CA decision |
| * 1. We should have at least nv_dec_eval_min_calls |
| * data points before making a CA decision |
| * 2. We only make a congesion decision after |
| * nv_rtt_min_cnt RTTs |
| */ |
| if (ca->nv_rtt_cnt < nv_rtt_min_cnt) { |
| return; |
| } else if (tp->snd_ssthresh == TCP_INFINITE_SSTHRESH) { |
| if (ca->nv_eval_call_cnt < |
| nv_ssthresh_eval_min_calls) |
| return; |
| /* otherwise we will decrease cwnd */ |
| } else if (ca->nv_eval_call_cnt < |
| nv_dec_eval_min_calls) { |
| if (ca->nv_allow_cwnd_growth && |
| ca->nv_rtt_cnt > nv_stop_rtt_cnt) |
| ca->nv_allow_cwnd_growth = 0; |
| return; |
| } |
| |
| /* We have enough data to determine we are congested */ |
| ca->nv_allow_cwnd_growth = 0; |
| tp->snd_ssthresh = |
| (nv_ssthresh_factor * max_win) >> 3; |
| if (tp->snd_cwnd - max_win > 2) { |
| /* gap > 2, we do exponential cwnd decrease */ |
| int dec; |
| |
| dec = max(2U, ((tp->snd_cwnd - max_win) * |
| nv_cong_dec_mult) >> 7); |
| tp->snd_cwnd -= dec; |
| } else if (nv_cong_dec_mult > 0) { |
| tp->snd_cwnd = max_win; |
| } |
| if (ca->cwnd_growth_factor > 0) |
| ca->cwnd_growth_factor = 0; |
| ca->nv_no_cong_cnt = 0; |
| } else if (tp->snd_cwnd <= max_win - nv_pad_buffer) { |
| /* There is no congestion, grow cwnd if allowed*/ |
| if (ca->nv_eval_call_cnt < nv_inc_eval_min_calls) |
| return; |
| |
| ca->nv_allow_cwnd_growth = 1; |
| ca->nv_no_cong_cnt++; |
| if (ca->cwnd_growth_factor < 0 && |
| nv_cwnd_growth_rate_neg > 0 && |
| ca->nv_no_cong_cnt > nv_cwnd_growth_rate_neg) { |
| ca->cwnd_growth_factor++; |
| ca->nv_no_cong_cnt = 0; |
| } else if (ca->cwnd_growth_factor >= 0 && |
| nv_cwnd_growth_rate_pos > 0 && |
| ca->nv_no_cong_cnt > |
| nv_cwnd_growth_rate_pos) { |
| ca->cwnd_growth_factor++; |
| ca->nv_no_cong_cnt = 0; |
| } |
| } else { |
| /* cwnd is in-between, so do nothing */ |
| return; |
| } |
| |
| /* update state */ |
| ca->nv_eval_call_cnt = 0; |
| ca->nv_rtt_cnt = 0; |
| ca->nv_rtt_max_rate = 0; |
| |
| /* Don't want to make cwnd < nv_min_cwnd |
| * (it wasn't before, if it is now is because nv |
| * decreased it). |
| */ |
| if (tp->snd_cwnd < nv_min_cwnd) |
| tp->snd_cwnd = nv_min_cwnd; |
| } |
| } |
| |
| /* Extract info for Tcp socket info provided via netlink */ |
| static size_t tcpnv_get_info(struct sock *sk, u32 ext, int *attr, |
| union tcp_cc_info *info) |
| { |
| const struct tcpnv *ca = inet_csk_ca(sk); |
| |
| if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { |
| info->vegas.tcpv_enabled = 1; |
| info->vegas.tcpv_rttcnt = ca->nv_rtt_cnt; |
| info->vegas.tcpv_rtt = ca->nv_last_rtt; |
| info->vegas.tcpv_minrtt = ca->nv_min_rtt; |
| |
| *attr = INET_DIAG_VEGASINFO; |
| return sizeof(struct tcpvegas_info); |
| } |
| return 0; |
| } |
| |
| static struct tcp_congestion_ops tcpnv __read_mostly = { |
| .init = tcpnv_init, |
| .ssthresh = tcpnv_recalc_ssthresh, |
| .cong_avoid = tcpnv_cong_avoid, |
| .set_state = tcpnv_state, |
| .undo_cwnd = tcp_reno_undo_cwnd, |
| .pkts_acked = tcpnv_acked, |
| .get_info = tcpnv_get_info, |
| |
| .owner = THIS_MODULE, |
| .name = "nv", |
| }; |
| |
| static int __init tcpnv_register(void) |
| { |
| BUILD_BUG_ON(sizeof(struct tcpnv) > ICSK_CA_PRIV_SIZE); |
| |
| return tcp_register_congestion_control(&tcpnv); |
| } |
| |
| static void __exit tcpnv_unregister(void) |
| { |
| tcp_unregister_congestion_control(&tcpnv); |
| } |
| |
| module_init(tcpnv_register); |
| module_exit(tcpnv_unregister); |
| |
| MODULE_AUTHOR("Lawrence Brakmo"); |
| MODULE_LICENSE("GPL"); |
| MODULE_DESCRIPTION("TCP NV"); |
| MODULE_VERSION("1.0"); |