David S. Miller | ab92bb2 | 2012-07-09 16:19:30 -0700 | [diff] [blame^] | 1 | #include <linux/module.h> |
David S. Miller | 4aabd8e | 2012-07-09 16:07:30 -0700 | [diff] [blame] | 2 | #include <linux/cache.h> |
| 3 | #include <linux/tcp.h> |
| 4 | |
| 5 | #include <net/inet_connection_sock.h> |
David S. Miller | ab92bb2 | 2012-07-09 16:19:30 -0700 | [diff] [blame^] | 6 | #include <net/request_sock.h> |
David S. Miller | 4aabd8e | 2012-07-09 16:07:30 -0700 | [diff] [blame] | 7 | #include <net/sock.h> |
| 8 | #include <net/dst.h> |
| 9 | #include <net/tcp.h> |
| 10 | |
| 11 | int sysctl_tcp_nometrics_save __read_mostly; /* when non-zero, tcp_update_metrics() returns without saving anything */ |
| 12 | |
| 13 | /* Save metrics learned by this TCP session. This function is called |
| 14 | * only, when TCP finishes successfully i.e. when it enters TIME-WAIT |
| 15 | * or goes from LAST-ACK to CLOSE. |
| 16 | */ |
| 17 | void tcp_update_metrics(struct sock *sk) |
| 18 | { |
| 19 | struct tcp_sock *tp = tcp_sk(sk); |
| 20 | struct dst_entry *dst = __sk_dst_get(sk); |
| 21 | |
| 22 | if (sysctl_tcp_nometrics_save) |
| 23 | return; |
| 24 | |
| 25 | if (dst && (dst->flags & DST_HOST)) { |
| 26 | const struct inet_connection_sock *icsk = inet_csk(sk); |
| 27 | int m; |
| 28 | unsigned long rtt; |
| 29 | |
| 30 | dst_confirm(dst); |
| 31 | |
| 32 | if (icsk->icsk_backoff || !tp->srtt) { |
| 33 | /* This session failed to estimate rtt. Why? |
| 34 | * Probably, no packets returned in time. |
| 35 | * Reset our results. |
| 36 | */ |
| 37 | if (!(dst_metric_locked(dst, RTAX_RTT))) |
| 38 | dst_metric_set(dst, RTAX_RTT, 0); |
| 39 | return; |
| 40 | } |
| 41 | |
| 42 | rtt = dst_metric_rtt(dst, RTAX_RTT); |
| 43 | m = rtt - tp->srtt; |
| 44 | |
| 45 | /* If newly calculated rtt larger than stored one, |
| 46 | * store new one. Otherwise, use EWMA. Remember, |
| 47 | * rtt overestimation is always better than underestimation. |
| 48 | */ |
| 49 | if (!(dst_metric_locked(dst, RTAX_RTT))) { |
| 50 | if (m <= 0) |
| 51 | set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt); |
| 52 | else |
| 53 | set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3)); |
| 54 | } |
| 55 | |
| 56 | if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { |
| 57 | unsigned long var; |
| 58 | if (m < 0) |
| 59 | m = -m; |
| 60 | |
| 61 | /* Scale deviation to rttvar fixed point */ |
| 62 | m >>= 1; |
| 63 | if (m < tp->mdev) |
| 64 | m = tp->mdev; |
| 65 | |
| 66 | var = dst_metric_rtt(dst, RTAX_RTTVAR); |
| 67 | if (m >= var) |
| 68 | var = m; |
| 69 | else |
| 70 | var -= (var - m) >> 2; |
| 71 | |
| 72 | set_dst_metric_rtt(dst, RTAX_RTTVAR, var); |
| 73 | } |
| 74 | |
| 75 | if (tcp_in_initial_slowstart(tp)) { |
| 76 | /* Slow start still did not finish. */ |
| 77 | if (dst_metric(dst, RTAX_SSTHRESH) && |
| 78 | !dst_metric_locked(dst, RTAX_SSTHRESH) && |
| 79 | (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) |
| 80 | dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1); |
| 81 | if (!dst_metric_locked(dst, RTAX_CWND) && |
| 82 | tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) |
| 83 | dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd); |
| 84 | } else if (tp->snd_cwnd > tp->snd_ssthresh && |
| 85 | icsk->icsk_ca_state == TCP_CA_Open) { |
| 86 | /* Cong. avoidance phase, cwnd is reliable. */ |
| 87 | if (!dst_metric_locked(dst, RTAX_SSTHRESH)) |
| 88 | dst_metric_set(dst, RTAX_SSTHRESH, |
| 89 | max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); |
| 90 | if (!dst_metric_locked(dst, RTAX_CWND)) |
| 91 | dst_metric_set(dst, RTAX_CWND, |
| 92 | (dst_metric(dst, RTAX_CWND) + |
| 93 | tp->snd_cwnd) >> 1); |
| 94 | } else { |
| 95 | /* Else slow start did not finish, cwnd is non-sense, |
| 96 | ssthresh may be also invalid. |
| 97 | */ |
| 98 | if (!dst_metric_locked(dst, RTAX_CWND)) |
| 99 | dst_metric_set(dst, RTAX_CWND, |
| 100 | (dst_metric(dst, RTAX_CWND) + |
| 101 | tp->snd_ssthresh) >> 1); |
| 102 | if (dst_metric(dst, RTAX_SSTHRESH) && |
| 103 | !dst_metric_locked(dst, RTAX_SSTHRESH) && |
| 104 | tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) |
| 105 | dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh); |
| 106 | } |
| 107 | |
| 108 | if (!dst_metric_locked(dst, RTAX_REORDERING)) { |
| 109 | if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && |
| 110 | tp->reordering != sysctl_tcp_reordering) |
| 111 | dst_metric_set(dst, RTAX_REORDERING, tp->reordering); |
| 112 | } |
| 113 | } |
| 114 | } |
| 115 | |
| 116 | /* Initialize metrics on socket. */ |
| 117 | |
| 118 | void tcp_init_metrics(struct sock *sk) |
| 119 | { |
| 120 | struct tcp_sock *tp = tcp_sk(sk); |
| 121 | struct dst_entry *dst = __sk_dst_get(sk); |
| 122 | |
| 123 | if (dst == NULL) |
| 124 | goto reset; |
| 125 | |
| 126 | dst_confirm(dst); |
| 127 | |
| 128 | if (dst_metric_locked(dst, RTAX_CWND)) |
| 129 | tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND); |
| 130 | if (dst_metric(dst, RTAX_SSTHRESH)) { |
| 131 | tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); |
| 132 | if (tp->snd_ssthresh > tp->snd_cwnd_clamp) |
| 133 | tp->snd_ssthresh = tp->snd_cwnd_clamp; |
| 134 | } else { |
| 135 | /* ssthresh may have been reduced unnecessarily during. |
| 136 | * 3WHS. Restore it back to its initial default. |
| 137 | */ |
| 138 | tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; |
| 139 | } |
| 140 | if (dst_metric(dst, RTAX_REORDERING) && |
| 141 | tp->reordering != dst_metric(dst, RTAX_REORDERING)) { |
| 142 | tcp_disable_fack(tp); |
| 143 | tcp_disable_early_retrans(tp); |
| 144 | tp->reordering = dst_metric(dst, RTAX_REORDERING); |
| 145 | } |
| 146 | |
| 147 | if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0) |
| 148 | goto reset; |
| 149 | |
| 150 | /* Initial rtt is determined from SYN,SYN-ACK. |
| 151 | * The segment is small and rtt may appear much |
| 152 | * less than real one. Use per-dst memory |
| 153 | * to make it more realistic. |
| 154 | * |
| 155 | * A bit of theory. RTT is time passed after "normal" sized packet |
| 156 | * is sent until it is ACKed. In normal circumstances sending small |
| 157 | * packets force peer to delay ACKs and calculation is correct too. |
| 158 | * The algorithm is adaptive and, provided we follow specs, it |
| 159 | * NEVER underestimate RTT. BUT! If peer tries to make some clever |
| 160 | * tricks sort of "quick acks" for time long enough to decrease RTT |
| 161 | * to low value, and then abruptly stops to do it and starts to delay |
| 162 | * ACKs, wait for troubles. |
| 163 | */ |
| 164 | if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) { |
| 165 | tp->srtt = dst_metric_rtt(dst, RTAX_RTT); |
| 166 | tp->rtt_seq = tp->snd_nxt; |
| 167 | } |
| 168 | if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) { |
| 169 | tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR); |
| 170 | tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); |
| 171 | } |
| 172 | tcp_set_rto(sk); |
| 173 | reset: |
| 174 | if (tp->srtt == 0) { |
| 175 | /* RFC6298: 5.7 We've failed to get a valid RTT sample from |
| 176 | * 3WHS. This is most likely due to retransmission, |
| 177 | * including spurious one. Reset the RTO back to 3secs |
| 178 | * from the more aggressive 1sec to avoid more spurious |
| 179 | * retransmission. |
| 180 | */ |
| 181 | tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; |
| 182 | inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; |
| 183 | } |
| 184 | /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been |
| 185 | * retransmitted. In light of RFC6298 more aggressive 1sec |
| 186 | * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK |
| 187 | * retransmission has occurred. |
| 188 | */ |
| 189 | if (tp->total_retrans > 1) |
| 190 | tp->snd_cwnd = 1; |
| 191 | else |
| 192 | tp->snd_cwnd = tcp_init_cwnd(tp, dst); |
| 193 | tp->snd_cwnd_stamp = tcp_time_stamp; |
| 194 | } |
David S. Miller | ab92bb2 | 2012-07-09 16:19:30 -0700 | [diff] [blame^] | 195 | |
| 196 | bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) |
| 197 | { |
| 198 | if (!dst) |
| 199 | return false; |
| 200 | return dst_metric(dst, RTAX_RTT) ? true : false; |
| 201 | } |
| 202 | EXPORT_SYMBOL_GPL(tcp_peer_is_proven); |