David S. Miller | 4aabd8e | 2012-07-09 16:07:30 -0700 | [diff] [blame^] | 1 | #include <linux/cache.h> |
| 2 | #include <linux/tcp.h> |
| 3 | |
| 4 | #include <net/inet_connection_sock.h> |
| 5 | #include <net/sock.h> |
| 6 | #include <net/dst.h> |
| 7 | #include <net/tcp.h> |
| 8 | |
/* Switch consulted by tcp_update_metrics(): when non-zero, that function
 * returns immediately and stores nothing into the route's cached metrics.
 */
| 9 | int sysctl_tcp_nometrics_save __read_mostly; |
| 10 | |
| 11 | /* Save metrics learned by this TCP session. This function is called |
| 12 | * only, when TCP finishes successfully i.e. when it enters TIME-WAIT |
| 13 | * or goes from LAST-ACK to CLOSE. |
| 14 | */ |
| 15 | void tcp_update_metrics(struct sock *sk) |
| 16 | { |
| 17 | struct tcp_sock *tp = tcp_sk(sk); |
| 18 | struct dst_entry *dst = __sk_dst_get(sk); |
| 19 | |
| 20 | if (sysctl_tcp_nometrics_save) |
| 21 | return; |
| 22 | |
| 23 | if (dst && (dst->flags & DST_HOST)) { |
| 24 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 25 | int m; |
| 26 | unsigned long rtt; |
| 27 | |
| 28 | dst_confirm(dst); |
| 29 | |
| 30 | if (icsk->icsk_backoff || !tp->srtt) { |
| 31 | /* This session failed to estimate rtt. Why? |
| 32 | * Probably, no packets returned in time. |
| 33 | * Reset our results. |
| 34 | */ |
| 35 | if (!(dst_metric_locked(dst, RTAX_RTT))) |
| 36 | dst_metric_set(dst, RTAX_RTT, 0); |
| 37 | return; |
| 38 | } |
| 39 | |
| 40 | rtt = dst_metric_rtt(dst, RTAX_RTT); |
| 41 | m = rtt - tp->srtt; |
| 42 | |
| 43 | /* If newly calculated rtt larger than stored one, |
| 44 | * store new one. Otherwise, use EWMA. Remember, |
| 45 | * rtt overestimation is always better than underestimation. |
| 46 | */ |
| 47 | if (!(dst_metric_locked(dst, RTAX_RTT))) { |
| 48 | if (m <= 0) |
| 49 | set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt); |
| 50 | else |
| 51 | set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3)); |
| 52 | } |
| 53 | |
| 54 | if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { |
| 55 | unsigned long var; |
| 56 | if (m < 0) |
| 57 | m = -m; |
| 58 | |
| 59 | /* Scale deviation to rttvar fixed point */ |
| 60 | m >>= 1; |
| 61 | if (m < tp->mdev) |
| 62 | m = tp->mdev; |
| 63 | |
| 64 | var = dst_metric_rtt(dst, RTAX_RTTVAR); |
| 65 | if (m >= var) |
| 66 | var = m; |
| 67 | else |
| 68 | var -= (var - m) >> 2; |
| 69 | |
| 70 | set_dst_metric_rtt(dst, RTAX_RTTVAR, var); |
| 71 | } |
| 72 | |
| 73 | if (tcp_in_initial_slowstart(tp)) { |
| 74 | /* Slow start still did not finish. */ |
| 75 | if (dst_metric(dst, RTAX_SSTHRESH) && |
| 76 | !dst_metric_locked(dst, RTAX_SSTHRESH) && |
| 77 | (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) |
| 78 | dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1); |
| 79 | if (!dst_metric_locked(dst, RTAX_CWND) && |
| 80 | tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) |
| 81 | dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd); |
| 82 | } else if (tp->snd_cwnd > tp->snd_ssthresh && |
| 83 | icsk->icsk_ca_state == TCP_CA_Open) { |
| 84 | /* Cong. avoidance phase, cwnd is reliable. */ |
| 85 | if (!dst_metric_locked(dst, RTAX_SSTHRESH)) |
| 86 | dst_metric_set(dst, RTAX_SSTHRESH, |
| 87 | max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); |
| 88 | if (!dst_metric_locked(dst, RTAX_CWND)) |
| 89 | dst_metric_set(dst, RTAX_CWND, |
| 90 | (dst_metric(dst, RTAX_CWND) + |
| 91 | tp->snd_cwnd) >> 1); |
| 92 | } else { |
| 93 | /* Else slow start did not finish, cwnd is non-sense, |
| 94 | ssthresh may be also invalid. |
| 95 | */ |
| 96 | if (!dst_metric_locked(dst, RTAX_CWND)) |
| 97 | dst_metric_set(dst, RTAX_CWND, |
| 98 | (dst_metric(dst, RTAX_CWND) + |
| 99 | tp->snd_ssthresh) >> 1); |
| 100 | if (dst_metric(dst, RTAX_SSTHRESH) && |
| 101 | !dst_metric_locked(dst, RTAX_SSTHRESH) && |
| 102 | tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) |
| 103 | dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh); |
| 104 | } |
| 105 | |
| 106 | if (!dst_metric_locked(dst, RTAX_REORDERING)) { |
| 107 | if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && |
| 108 | tp->reordering != sysctl_tcp_reordering) |
| 109 | dst_metric_set(dst, RTAX_REORDERING, tp->reordering); |
| 110 | } |
| 111 | } |
| 112 | } |
| 113 | |
| 114 | /* Initialize metrics on socket. */ |
| 115 | |
| 116 | void tcp_init_metrics(struct sock *sk) |
| 117 | { |
| 118 | struct tcp_sock *tp = tcp_sk(sk); |
| 119 | struct dst_entry *dst = __sk_dst_get(sk); |
| 120 | |
| 121 | if (dst == NULL) |
| 122 | goto reset; |
| 123 | |
| 124 | dst_confirm(dst); |
| 125 | |
| 126 | if (dst_metric_locked(dst, RTAX_CWND)) |
| 127 | tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND); |
| 128 | if (dst_metric(dst, RTAX_SSTHRESH)) { |
| 129 | tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); |
| 130 | if (tp->snd_ssthresh > tp->snd_cwnd_clamp) |
| 131 | tp->snd_ssthresh = tp->snd_cwnd_clamp; |
| 132 | } else { |
| 133 | /* ssthresh may have been reduced unnecessarily during. |
| 134 | * 3WHS. Restore it back to its initial default. |
| 135 | */ |
| 136 | tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; |
| 137 | } |
| 138 | if (dst_metric(dst, RTAX_REORDERING) && |
| 139 | tp->reordering != dst_metric(dst, RTAX_REORDERING)) { |
| 140 | tcp_disable_fack(tp); |
| 141 | tcp_disable_early_retrans(tp); |
| 142 | tp->reordering = dst_metric(dst, RTAX_REORDERING); |
| 143 | } |
| 144 | |
| 145 | if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0) |
| 146 | goto reset; |
| 147 | |
| 148 | /* Initial rtt is determined from SYN,SYN-ACK. |
| 149 | * The segment is small and rtt may appear much |
| 150 | * less than real one. Use per-dst memory |
| 151 | * to make it more realistic. |
| 152 | * |
| 153 | * A bit of theory. RTT is time passed after "normal" sized packet |
| 154 | * is sent until it is ACKed. In normal circumstances sending small |
| 155 | * packets force peer to delay ACKs and calculation is correct too. |
| 156 | * The algorithm is adaptive and, provided we follow specs, it |
| 157 | * NEVER underestimate RTT. BUT! If peer tries to make some clever |
| 158 | * tricks sort of "quick acks" for time long enough to decrease RTT |
| 159 | * to low value, and then abruptly stops to do it and starts to delay |
| 160 | * ACKs, wait for troubles. |
| 161 | */ |
| 162 | if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) { |
| 163 | tp->srtt = dst_metric_rtt(dst, RTAX_RTT); |
| 164 | tp->rtt_seq = tp->snd_nxt; |
| 165 | } |
| 166 | if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) { |
| 167 | tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR); |
| 168 | tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); |
| 169 | } |
| 170 | tcp_set_rto(sk); |
| 171 | reset: |
| 172 | if (tp->srtt == 0) { |
| 173 | /* RFC6298: 5.7 We've failed to get a valid RTT sample from |
| 174 | * 3WHS. This is most likely due to retransmission, |
| 175 | * including spurious one. Reset the RTO back to 3secs |
| 176 | * from the more aggressive 1sec to avoid more spurious |
| 177 | * retransmission. |
| 178 | */ |
| 179 | tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; |
| 180 | inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; |
| 181 | } |
| 182 | /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been |
| 183 | * retransmitted. In light of RFC6298 more aggressive 1sec |
| 184 | * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK |
| 185 | * retransmission has occurred. |
| 186 | */ |
| 187 | if (tp->total_retrans > 1) |
| 188 | tp->snd_cwnd = 1; |
| 189 | else |
| 190 | tp->snd_cwnd = tcp_init_cwnd(tp, dst); |
| 191 | tp->snd_cwnd_stamp = tcp_time_stamp; |
| 192 | } |