| David S. Miller | ab92bb2 | 2012-07-09 16:19:30 -0700 | [diff] [blame^] | 1 | #include <linux/module.h> | 
| David S. Miller | 4aabd8e | 2012-07-09 16:07:30 -0700 | [diff] [blame] | 2 | #include <linux/cache.h> | 
|  | 3 | #include <linux/tcp.h> | 
|  | 4 |  | 
|  | 5 | #include <net/inet_connection_sock.h> | 
| David S. Miller | ab92bb2 | 2012-07-09 16:19:30 -0700 | [diff] [blame^] | 6 | #include <net/request_sock.h> | 
| David S. Miller | 4aabd8e | 2012-07-09 16:07:30 -0700 | [diff] [blame] | 7 | #include <net/sock.h> | 
|  | 8 | #include <net/dst.h> | 
|  | 9 | #include <net/tcp.h> | 
|  | 10 |  | 
|  | 11 | int sysctl_tcp_nometrics_save __read_mostly; | 
|  | 12 |  | 
|  | 13 | /* Save metrics learned by this TCP session.  This function is called | 
|  | 14 | * only, when TCP finishes successfully i.e. when it enters TIME-WAIT | 
|  | 15 | * or goes from LAST-ACK to CLOSE. | 
|  | 16 | */ | 
|  | 17 | void tcp_update_metrics(struct sock *sk) | 
|  | 18 | { | 
|  | 19 | struct tcp_sock *tp = tcp_sk(sk); | 
|  | 20 | struct dst_entry *dst = __sk_dst_get(sk); | 
|  | 21 |  | 
|  | 22 | if (sysctl_tcp_nometrics_save) | 
|  | 23 | return; | 
|  | 24 |  | 
|  | 25 | if (dst && (dst->flags & DST_HOST)) { | 
|  | 26 | const struct inet_connection_sock *icsk = inet_csk(sk); | 
|  | 27 | int m; | 
|  | 28 | unsigned long rtt; | 
|  | 29 |  | 
|  | 30 | dst_confirm(dst); | 
|  | 31 |  | 
|  | 32 | if (icsk->icsk_backoff || !tp->srtt) { | 
|  | 33 | /* This session failed to estimate rtt. Why? | 
|  | 34 | * Probably, no packets returned in time. | 
|  | 35 | * Reset our results. | 
|  | 36 | */ | 
|  | 37 | if (!(dst_metric_locked(dst, RTAX_RTT))) | 
|  | 38 | dst_metric_set(dst, RTAX_RTT, 0); | 
|  | 39 | return; | 
|  | 40 | } | 
|  | 41 |  | 
|  | 42 | rtt = dst_metric_rtt(dst, RTAX_RTT); | 
|  | 43 | m = rtt - tp->srtt; | 
|  | 44 |  | 
|  | 45 | /* If newly calculated rtt larger than stored one, | 
|  | 46 | * store new one. Otherwise, use EWMA. Remember, | 
|  | 47 | * rtt overestimation is always better than underestimation. | 
|  | 48 | */ | 
|  | 49 | if (!(dst_metric_locked(dst, RTAX_RTT))) { | 
|  | 50 | if (m <= 0) | 
|  | 51 | set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt); | 
|  | 52 | else | 
|  | 53 | set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3)); | 
|  | 54 | } | 
|  | 55 |  | 
|  | 56 | if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { | 
|  | 57 | unsigned long var; | 
|  | 58 | if (m < 0) | 
|  | 59 | m = -m; | 
|  | 60 |  | 
|  | 61 | /* Scale deviation to rttvar fixed point */ | 
|  | 62 | m >>= 1; | 
|  | 63 | if (m < tp->mdev) | 
|  | 64 | m = tp->mdev; | 
|  | 65 |  | 
|  | 66 | var = dst_metric_rtt(dst, RTAX_RTTVAR); | 
|  | 67 | if (m >= var) | 
|  | 68 | var = m; | 
|  | 69 | else | 
|  | 70 | var -= (var - m) >> 2; | 
|  | 71 |  | 
|  | 72 | set_dst_metric_rtt(dst, RTAX_RTTVAR, var); | 
|  | 73 | } | 
|  | 74 |  | 
|  | 75 | if (tcp_in_initial_slowstart(tp)) { | 
|  | 76 | /* Slow start still did not finish. */ | 
|  | 77 | if (dst_metric(dst, RTAX_SSTHRESH) && | 
|  | 78 | !dst_metric_locked(dst, RTAX_SSTHRESH) && | 
|  | 79 | (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) | 
|  | 80 | dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1); | 
|  | 81 | if (!dst_metric_locked(dst, RTAX_CWND) && | 
|  | 82 | tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) | 
|  | 83 | dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd); | 
|  | 84 | } else if (tp->snd_cwnd > tp->snd_ssthresh && | 
|  | 85 | icsk->icsk_ca_state == TCP_CA_Open) { | 
|  | 86 | /* Cong. avoidance phase, cwnd is reliable. */ | 
|  | 87 | if (!dst_metric_locked(dst, RTAX_SSTHRESH)) | 
|  | 88 | dst_metric_set(dst, RTAX_SSTHRESH, | 
|  | 89 | max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); | 
|  | 90 | if (!dst_metric_locked(dst, RTAX_CWND)) | 
|  | 91 | dst_metric_set(dst, RTAX_CWND, | 
|  | 92 | (dst_metric(dst, RTAX_CWND) + | 
|  | 93 | tp->snd_cwnd) >> 1); | 
|  | 94 | } else { | 
|  | 95 | /* Else slow start did not finish, cwnd is non-sense, | 
|  | 96 | ssthresh may be also invalid. | 
|  | 97 | */ | 
|  | 98 | if (!dst_metric_locked(dst, RTAX_CWND)) | 
|  | 99 | dst_metric_set(dst, RTAX_CWND, | 
|  | 100 | (dst_metric(dst, RTAX_CWND) + | 
|  | 101 | tp->snd_ssthresh) >> 1); | 
|  | 102 | if (dst_metric(dst, RTAX_SSTHRESH) && | 
|  | 103 | !dst_metric_locked(dst, RTAX_SSTHRESH) && | 
|  | 104 | tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) | 
|  | 105 | dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh); | 
|  | 106 | } | 
|  | 107 |  | 
|  | 108 | if (!dst_metric_locked(dst, RTAX_REORDERING)) { | 
|  | 109 | if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && | 
|  | 110 | tp->reordering != sysctl_tcp_reordering) | 
|  | 111 | dst_metric_set(dst, RTAX_REORDERING, tp->reordering); | 
|  | 112 | } | 
|  | 113 | } | 
|  | 114 | } | 
|  | 115 |  | 
|  | 116 | /* Initialize metrics on socket. */ | 
|  | 117 |  | 
|  | 118 | void tcp_init_metrics(struct sock *sk) | 
|  | 119 | { | 
|  | 120 | struct tcp_sock *tp = tcp_sk(sk); | 
|  | 121 | struct dst_entry *dst = __sk_dst_get(sk); | 
|  | 122 |  | 
|  | 123 | if (dst == NULL) | 
|  | 124 | goto reset; | 
|  | 125 |  | 
|  | 126 | dst_confirm(dst); | 
|  | 127 |  | 
|  | 128 | if (dst_metric_locked(dst, RTAX_CWND)) | 
|  | 129 | tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND); | 
|  | 130 | if (dst_metric(dst, RTAX_SSTHRESH)) { | 
|  | 131 | tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); | 
|  | 132 | if (tp->snd_ssthresh > tp->snd_cwnd_clamp) | 
|  | 133 | tp->snd_ssthresh = tp->snd_cwnd_clamp; | 
|  | 134 | } else { | 
|  | 135 | /* ssthresh may have been reduced unnecessarily during. | 
|  | 136 | * 3WHS. Restore it back to its initial default. | 
|  | 137 | */ | 
|  | 138 | tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; | 
|  | 139 | } | 
|  | 140 | if (dst_metric(dst, RTAX_REORDERING) && | 
|  | 141 | tp->reordering != dst_metric(dst, RTAX_REORDERING)) { | 
|  | 142 | tcp_disable_fack(tp); | 
|  | 143 | tcp_disable_early_retrans(tp); | 
|  | 144 | tp->reordering = dst_metric(dst, RTAX_REORDERING); | 
|  | 145 | } | 
|  | 146 |  | 
|  | 147 | if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0) | 
|  | 148 | goto reset; | 
|  | 149 |  | 
|  | 150 | /* Initial rtt is determined from SYN,SYN-ACK. | 
|  | 151 | * The segment is small and rtt may appear much | 
|  | 152 | * less than real one. Use per-dst memory | 
|  | 153 | * to make it more realistic. | 
|  | 154 | * | 
|  | 155 | * A bit of theory. RTT is time passed after "normal" sized packet | 
|  | 156 | * is sent until it is ACKed. In normal circumstances sending small | 
|  | 157 | * packets force peer to delay ACKs and calculation is correct too. | 
|  | 158 | * The algorithm is adaptive and, provided we follow specs, it | 
|  | 159 | * NEVER underestimate RTT. BUT! If peer tries to make some clever | 
|  | 160 | * tricks sort of "quick acks" for time long enough to decrease RTT | 
|  | 161 | * to low value, and then abruptly stops to do it and starts to delay | 
|  | 162 | * ACKs, wait for troubles. | 
|  | 163 | */ | 
|  | 164 | if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) { | 
|  | 165 | tp->srtt = dst_metric_rtt(dst, RTAX_RTT); | 
|  | 166 | tp->rtt_seq = tp->snd_nxt; | 
|  | 167 | } | 
|  | 168 | if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) { | 
|  | 169 | tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR); | 
|  | 170 | tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); | 
|  | 171 | } | 
|  | 172 | tcp_set_rto(sk); | 
|  | 173 | reset: | 
|  | 174 | if (tp->srtt == 0) { | 
|  | 175 | /* RFC6298: 5.7 We've failed to get a valid RTT sample from | 
|  | 176 | * 3WHS. This is most likely due to retransmission, | 
|  | 177 | * including spurious one. Reset the RTO back to 3secs | 
|  | 178 | * from the more aggressive 1sec to avoid more spurious | 
|  | 179 | * retransmission. | 
|  | 180 | */ | 
|  | 181 | tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; | 
|  | 182 | inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; | 
|  | 183 | } | 
|  | 184 | /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been | 
|  | 185 | * retransmitted. In light of RFC6298 more aggressive 1sec | 
|  | 186 | * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK | 
|  | 187 | * retransmission has occurred. | 
|  | 188 | */ | 
|  | 189 | if (tp->total_retrans > 1) | 
|  | 190 | tp->snd_cwnd = 1; | 
|  | 191 | else | 
|  | 192 | tp->snd_cwnd = tcp_init_cwnd(tp, dst); | 
|  | 193 | tp->snd_cwnd_stamp = tcp_time_stamp; | 
|  | 194 | } | 
| David S. Miller | ab92bb2 | 2012-07-09 16:19:30 -0700 | [diff] [blame^] | 195 |  | 
|  | 196 | bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) | 
|  | 197 | { | 
|  | 198 | if (!dst) | 
|  | 199 | return false; | 
|  | 200 | return dst_metric(dst, RTAX_RTT) ? true : false; | 
|  | 201 | } | 
|  | 202 | EXPORT_SYMBOL_GPL(tcp_peer_is_proven); |