blob: 7a47d65ec6b90004f90dbe28061be20c1d879794 [file] [log] [blame]
Ben Chanb061f892013-02-27 17:46:55 -08001// Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "shill/traffic_monitor.h"
6
7#include <base/bind.h>
Ben Chana0ddf462014-02-06 11:32:42 -08008#include <base/strings/stringprintf.h>
Thieu Lefa7960e2013-04-15 13:14:55 -07009#include <netinet/in.h>
Ben Chanb061f892013-02-27 17:46:55 -080010
11#include "shill/device.h"
12#include "shill/device_info.h"
13#include "shill/event_dispatcher.h"
Thieu Le03026662013-04-04 10:45:11 -070014#include "shill/logging.h"
15#include "shill/socket_info_reader.h"
16
17using base::StringPrintf;
18using std::string;
19using std::vector;
Ben Chanb061f892013-02-27 17:46:55 -080020
21namespace shill {
22
Rebecca Silbersteinc9c31d82014-10-21 15:01:00 -070023namespace Logging {
24static auto kModuleLogScope = ScopeLogger::kLink;
Paul Stewart1a212a62015-06-16 13:13:10 -070025static string ObjectID(Device* d) { return d->link_name(); }
Rebecca Silbersteinc9c31d82014-10-21 15:01:00 -070026}
27
Thieu Le03026662013-04-04 10:45:11 -070028// static
Ben Chan7fab8972014-08-10 17:14:46 -070029const uint16_t TrafficMonitor::kDnsPort = 53;
30const int64_t TrafficMonitor::kDnsTimedOutThresholdSeconds = 15;
Thieu Le03026662013-04-04 10:45:11 -070031const int TrafficMonitor::kMinimumFailedSamplesToTrigger = 2;
Ben Chan7fab8972014-08-10 17:14:46 -070032const int64_t TrafficMonitor::kSamplingIntervalMilliseconds = 5000;
Ben Chanb061f892013-02-27 17:46:55 -080033
Paul Stewart1a212a62015-06-16 13:13:10 -070034TrafficMonitor::TrafficMonitor(const DeviceRefPtr& device,
35 EventDispatcher* dispatcher)
Ben Chanb061f892013-02-27 17:46:55 -080036 : device_(device),
37 dispatcher_(dispatcher),
Thieu Le03026662013-04-04 10:45:11 -070038 socket_info_reader_(new SocketInfoReader),
Thieu Lefa7960e2013-04-15 13:14:55 -070039 accummulated_congested_tx_queues_samples_(0),
40 connection_info_reader_(new ConnectionInfoReader),
41 accummulated_dns_failures_samples_(0) {
Ben Chanb061f892013-02-27 17:46:55 -080042}
43
44TrafficMonitor::~TrafficMonitor() {
45 Stop();
46}
47
48void TrafficMonitor::Start() {
Alex Vakulenko0951ccb2014-12-10 12:52:31 -080049 SLOG(device_.get(), 2) << __func__;
Ben Chanb061f892013-02-27 17:46:55 -080050 Stop();
51
Ben Chanb061f892013-02-27 17:46:55 -080052 sample_traffic_callback_.Reset(base::Bind(&TrafficMonitor::SampleTraffic,
53 base::Unretained(this)));
54 dispatcher_->PostDelayedTask(sample_traffic_callback_.callback(),
55 kSamplingIntervalMilliseconds);
56}
57
58void TrafficMonitor::Stop() {
Alex Vakulenko0951ccb2014-12-10 12:52:31 -080059 SLOG(device_.get(), 2) << __func__;
Ben Chanb061f892013-02-27 17:46:55 -080060 sample_traffic_callback_.Cancel();
Thieu Lefa7960e2013-04-15 13:14:55 -070061 ResetCongestedTxQueuesStats();
62 ResetDnsFailingStats();
63}
64
65void TrafficMonitor::ResetCongestedTxQueuesStats() {
66 accummulated_congested_tx_queues_samples_ = 0;
67}
68
69void TrafficMonitor::ResetCongestedTxQueuesStatsWithLogging() {
Alex Vakulenko0951ccb2014-12-10 12:52:31 -080070 SLOG(device_.get(), 2) << __func__ << ": Tx-queues decongested";
Thieu Lefa7960e2013-04-15 13:14:55 -070071 ResetCongestedTxQueuesStats();
Thieu Le03026662013-04-04 10:45:11 -070072}
73
74void TrafficMonitor::BuildIPPortToTxQueueLength(
Paul Stewart1a212a62015-06-16 13:13:10 -070075 const vector<SocketInfo>& socket_infos,
76 IPPortToTxQueueLengthMap* tx_queue_lengths) {
Alex Vakulenko0951ccb2014-12-10 12:52:31 -080077 SLOG(device_.get(), 3) << __func__;
Thieu Le03026662013-04-04 10:45:11 -070078 string device_ip_address = device_->ipconfig()->properties().address;
Paul Stewart1a212a62015-06-16 13:13:10 -070079 for (const auto& info : socket_infos) {
Alex Vakulenko0951ccb2014-12-10 12:52:31 -080080 SLOG(device_.get(), 4) << "SocketInfo(IP="
81 << info.local_ip_address().ToString()
82 << ", TX=" << info.transmit_queue_value()
83 << ", State=" << info.connection_state()
84 << ", TimerState=" << info.timer_state();
Paul Stewart6db7b242014-05-02 15:34:21 -070085 if (info.local_ip_address().ToString() != device_ip_address ||
86 info.transmit_queue_value() == 0 ||
87 info.connection_state() != SocketInfo::kConnectionStateEstablished ||
88 (info.timer_state() != SocketInfo::kTimerStateRetransmitTimerPending &&
89 info.timer_state() !=
Arman Ugurayd42d8ec2013-04-08 19:28:21 -070090 SocketInfo::kTimerStateZeroWindowProbeTimerPending)) {
Alex Vakulenko0951ccb2014-12-10 12:52:31 -080091 SLOG(device_.get(), 4) << "Connection Filtered.";
Thieu Le03026662013-04-04 10:45:11 -070092 continue;
Arman Ugurayd42d8ec2013-04-08 19:28:21 -070093 }
Alex Vakulenko0951ccb2014-12-10 12:52:31 -080094 SLOG(device_.get(), 3) << "Monitoring connection: TX="
95 << info.transmit_queue_value()
96 << " TimerState=" << info.timer_state();
Thieu Le03026662013-04-04 10:45:11 -070097
98 string local_ip_port =
99 StringPrintf("%s:%d",
Paul Stewart6db7b242014-05-02 15:34:21 -0700100 info.local_ip_address().ToString().c_str(),
101 info.local_port());
102 (*tx_queue_lengths)[local_ip_port] = info.transmit_queue_value();
Thieu Le03026662013-04-04 10:45:11 -0700103 }
Ben Chanb061f892013-02-27 17:46:55 -0800104}
105
Thieu Lefa7960e2013-04-15 13:14:55 -0700106bool TrafficMonitor::IsCongestedTxQueues() {
Alex Vakulenko0951ccb2014-12-10 12:52:31 -0800107 SLOG(device_.get(), 4) << __func__;
Thieu Le03026662013-04-04 10:45:11 -0700108 vector<SocketInfo> socket_infos;
109 if (!socket_info_reader_->LoadTcpSocketInfo(&socket_infos) ||
110 socket_infos.empty()) {
Alex Vakulenko0951ccb2014-12-10 12:52:31 -0800111 SLOG(device_.get(), 3) << __func__ << ": Empty socket info";
Thieu Lefa7960e2013-04-15 13:14:55 -0700112 ResetCongestedTxQueuesStatsWithLogging();
113 return false;
Ben Chanb061f892013-02-27 17:46:55 -0800114 }
Thieu Lefa7960e2013-04-15 13:14:55 -0700115 bool congested_tx_queues = true;
Thieu Le03026662013-04-04 10:45:11 -0700116 IPPortToTxQueueLengthMap curr_tx_queue_lengths;
117 BuildIPPortToTxQueueLength(socket_infos, &curr_tx_queue_lengths);
118 if (curr_tx_queue_lengths.empty()) {
Alex Vakulenko0951ccb2014-12-10 12:52:31 -0800119 SLOG(device_.get(), 3) << __func__ << ": No interesting socket info";
Thieu Lefa7960e2013-04-15 13:14:55 -0700120 ResetCongestedTxQueuesStatsWithLogging();
Thieu Le03026662013-04-04 10:45:11 -0700121 } else {
Paul Stewart1a212a62015-06-16 13:13:10 -0700122 for (const auto& length_entry : old_tx_queue_lengths_) {
Thieu Le03026662013-04-04 10:45:11 -0700123 IPPortToTxQueueLengthMap::iterator curr_tx_queue_it =
Paul Stewart6db7b242014-05-02 15:34:21 -0700124 curr_tx_queue_lengths.find(length_entry.first);
Thieu Le03026662013-04-04 10:45:11 -0700125 if (curr_tx_queue_it == curr_tx_queue_lengths.end() ||
Paul Stewart6db7b242014-05-02 15:34:21 -0700126 curr_tx_queue_it->second < length_entry.second) {
Thieu Le03026662013-04-04 10:45:11 -0700127 congested_tx_queues = false;
Arman Ugurayf84a4242013-04-09 20:01:07 -0700128 // TODO(armansito): If we had a false positive earlier, we may
129 // want to correct it here by invoking a "connection back to normal
130 // callback", so that the OutOfCredits property can be set to
131 // false.
Thieu Le03026662013-04-04 10:45:11 -0700132 break;
133 }
134 }
Thieu Lefa7960e2013-04-15 13:14:55 -0700135 if (congested_tx_queues) {
136 ++accummulated_congested_tx_queues_samples_;
Alex Vakulenko0951ccb2014-12-10 12:52:31 -0800137 SLOG(device_.get(), 2) << __func__
138 << ": Congested tx-queues detected ("
139 << accummulated_congested_tx_queues_samples_
140 << ")";
Ben Chanb061f892013-02-27 17:46:55 -0800141 }
142 }
Thieu Le03026662013-04-04 10:45:11 -0700143 old_tx_queue_lengths_ = curr_tx_queue_lengths;
Ben Chanb061f892013-02-27 17:46:55 -0800144
Thieu Lefa7960e2013-04-15 13:14:55 -0700145 return congested_tx_queues;
146}
147
148void TrafficMonitor::ResetDnsFailingStats() {
149 accummulated_dns_failures_samples_ = 0;
150}
151
152void TrafficMonitor::ResetDnsFailingStatsWithLogging() {
Alex Vakulenko0951ccb2014-12-10 12:52:31 -0800153 SLOG(device_.get(), 2) << __func__ << ": DNS queries restored";
Thieu Lefa7960e2013-04-15 13:14:55 -0700154 ResetDnsFailingStats();
155}
156
157bool TrafficMonitor::IsDnsFailing() {
Alex Vakulenko0951ccb2014-12-10 12:52:31 -0800158 SLOG(device_.get(), 4) << __func__;
Thieu Lefa7960e2013-04-15 13:14:55 -0700159 vector<ConnectionInfo> connection_infos;
160 if (!connection_info_reader_->LoadConnectionInfo(&connection_infos) ||
161 connection_infos.empty()) {
Alex Vakulenko0951ccb2014-12-10 12:52:31 -0800162 SLOG(device_.get(), 3) << __func__ << ": Empty connection info";
Thieu Lefa7960e2013-04-15 13:14:55 -0700163 } else {
164 // The time-to-expire counter is used to determine when a DNS request
165 // has timed out. This counter is the number of seconds remaining until
166 // the entry is removed from the system IP connection tracker. The
167 // default time is 30 seconds. This is too long of a wait. Instead, we
168 // want to time out at |kDnsTimedOutThresholdSeconds|. Unfortunately,
169 // we cannot simply look for entries less than
170 // |kDnsTimedOutThresholdSeconds| because we will count the entry
171 // multiple times once its time-to-expire is less than
172 // |kDnsTimedOutThresholdSeconds|. To ensure that we only count an
173 // entry once, we look for entries in this time window between
174 // |kDnsTimedOutThresholdSeconds| and |kDnsTimedOutLowerThresholdSeconds|.
Ben Chan7fab8972014-08-10 17:14:46 -0700175 const int64_t kDnsTimedOutLowerThresholdSeconds =
Thieu Lefa7960e2013-04-15 13:14:55 -0700176 kDnsTimedOutThresholdSeconds - kSamplingIntervalMilliseconds / 1000;
177 string device_ip_address = device_->ipconfig()->properties().address;
Paul Stewart1a212a62015-06-16 13:13:10 -0700178 for (const auto& info : connection_infos) {
Paul Stewart6db7b242014-05-02 15:34:21 -0700179 if (info.protocol() != IPPROTO_UDP ||
180 info.time_to_expire_seconds() > kDnsTimedOutThresholdSeconds ||
181 info.time_to_expire_seconds() <= kDnsTimedOutLowerThresholdSeconds ||
182 !info.is_unreplied() ||
183 info.original_source_ip_address().ToString() != device_ip_address ||
184 info.original_destination_port() != kDnsPort)
Thieu Lefa7960e2013-04-15 13:14:55 -0700185 continue;
186
187 ++accummulated_dns_failures_samples_;
Alex Vakulenko0951ccb2014-12-10 12:52:31 -0800188 SLOG(device_.get(), 2) << __func__
189 << ": DNS failures detected ("
190 << accummulated_dns_failures_samples_ << ")";
Thieu Lefa7960e2013-04-15 13:14:55 -0700191 return true;
192 }
193 }
194 ResetDnsFailingStatsWithLogging();
195 return false;
196}
197
198void TrafficMonitor::SampleTraffic() {
Alex Vakulenko0951ccb2014-12-10 12:52:31 -0800199 SLOG(device_.get(), 3) << __func__;
Thieu Lefa7960e2013-04-15 13:14:55 -0700200
Peter Qiudc335f82014-05-15 10:33:17 -0700201 // Schedule the sample callback first, so it is possible for the network
202 // problem callback to stop the traffic monitor.
203 dispatcher_->PostDelayedTask(sample_traffic_callback_.callback(),
204 kSamplingIntervalMilliseconds);
205
Thieu Lefa7960e2013-04-15 13:14:55 -0700206 if (IsCongestedTxQueues() &&
207 accummulated_congested_tx_queues_samples_ ==
208 kMinimumFailedSamplesToTrigger) {
209 LOG(WARNING) << "Congested tx queues detected, out-of-credits?";
Peter Qiudc335f82014-05-15 10:33:17 -0700210 network_problem_detected_callback_.Run(kNetworkProblemCongestedTxQueue);
Thieu Lefa7960e2013-04-15 13:14:55 -0700211 } else if (IsDnsFailing() &&
212 accummulated_dns_failures_samples_ ==
213 kMinimumFailedSamplesToTrigger) {
214 LOG(WARNING) << "DNS queries failing, out-of-credits?";
Peter Qiudc335f82014-05-15 10:33:17 -0700215 network_problem_detected_callback_.Run(kNetworkProblemDNSFailure);
Thieu Lefa7960e2013-04-15 13:14:55 -0700216 }
Ben Chanb061f892013-02-27 17:46:55 -0800217}
218
219} // namespace shill