shill: LinkMonitor: Add more metrics
LinkMonitorSecondsToFailure, LinkMonitorUnicastErrorsAtFailure,
and LinkMonitorBroadcastErrorsAtFailure.
BUG=chromium-os:32600
TEST=Unit tests.
Change-Id: I0a90deee5675ebfb665a76f8e7707eaaecc23ce9
Reviewed-on: https://gerrit.chromium.org/gerrit/29792
Reviewed-by: mukesh agrawal <quiche@chromium.org>
Commit-Ready: Paul Stewart <pstew@chromium.org>
Tested-by: Paul Stewart <pstew@chromium.org>
diff --git a/link_monitor.cc b/link_monitor.cc
index 28ebd5b..1d8dd1b 100644
--- a/link_monitor.cc
+++ b/link_monitor.cc
@@ -59,13 +59,16 @@
connection_->interface_index(), &local_mac_address_)) {
LOG(ERROR) << "Could not get local MAC address.";
metrics_->NotifyLinkMonitorFailure(
- connection_->technology(), Metrics::kLinkMonitorMacAddressNotFound);
+ connection_->technology(),
+ Metrics::kLinkMonitorMacAddressNotFound,
+ 0, 0, 0);
Stop();
return false;
}
gateway_mac_address_ = ByteString(local_mac_address_.GetLength());
send_request_callback_.Reset(
Bind(&LinkMonitor::SendRequestTask, Unretained(this)));
+ time_->GetTimeMonotonic(&started_monitoring_at_);
return SendRequest();
}
@@ -81,6 +84,7 @@
response_sample_count_ = 0;
receive_response_handler_.reset();
send_request_callback_.Cancel();
+ timerclear(&started_monitoring_at_);
timerclear(&sent_request_at_);
}
@@ -146,10 +150,19 @@
<< unicast_failure_count_
<< " unicast failures.";
failure_callback_.Run();
- Stop();
+
+ struct timeval now, elapsed_time;
+ time_->GetTimeMonotonic(&now);
+ timersub(&now, &started_monitoring_at_, &elapsed_time);
+
metrics_->NotifyLinkMonitorFailure(
connection_->technology(),
- Metrics::kLinkMonitorFailureThresholdReached);
+ Metrics::kLinkMonitorFailureThresholdReached,
+ elapsed_time.tv_sec,
+ broadcast_failure_count_,
+ unicast_failure_count_);
+
+ Stop();
return true;
}
is_unicast_ = !is_unicast_;
@@ -216,7 +229,9 @@
LOG(ERROR) << "Failed to start ARP client.";
Stop();
metrics_->NotifyLinkMonitorFailure(
- connection_->technology(), Metrics::kLinkMonitorClientStartFailure);
+ connection_->technology(),
+ Metrics::kLinkMonitorClientStartFailure,
+ 0, 0, 0);
return false;
}
} else if (AddMissedResponse()) {
@@ -251,7 +266,8 @@
LOG(ERROR) << "Failed to send ARP request. Stopping.";
Stop();
metrics_->NotifyLinkMonitorFailure(
- connection_->technology(), Metrics::kLinkMonitorTransmitFailure);
+ connection_->technology(), Metrics::kLinkMonitorTransmitFailure,
+ 0, 0, 0);
return false;
}
diff --git a/link_monitor.h b/link_monitor.h
index a3e0a79..60a4057 100644
--- a/link_monitor.h
+++ b/link_monitor.h
@@ -32,6 +32,11 @@
public:
typedef base::Closure FailureCallback;
+ // When the sum of consecutive unicast and broadcast failures
+ // equals this value, the failure callback is called, the counters
+ // are reset, and the link monitoring quiesces. Needed by Metrics.
+ static const unsigned int kFailureThreshold;
+
// The number of milliseconds between ARP requests. Needed by Metrics.
static const unsigned int kTestPeriodMilliseconds;
@@ -57,11 +62,6 @@
friend class LinkMonitorForTest;
friend class LinkMonitorTest;
- // When the sum of consecutive unicast and broadcast failures
- // equals this value, the failure callback is called, the counters
- // are reset, and the link monitoring quiesces.
- static const unsigned int kFailureThreshold;
-
// The number of samples to compute a "strict" average over. When
// more samples than this number arrive, this determines how "slow"
// our simple low-pass filter works.
@@ -119,6 +119,9 @@
// void callback function SendRequestTask().
base::CancelableClosure send_request_callback_;
+ // The time at which the link monitor started.
+ struct timeval started_monitoring_at_;
+
// The time at which the last ARP request was sent.
struct timeval sent_request_at_;
Time *time_;
diff --git a/link_monitor_unittest.cc b/link_monitor_unittest.cc
index 1aedc8f..27aaf52 100644
--- a/link_monitor_unittest.cc
+++ b/link_monitor_unittest.cc
@@ -96,6 +96,8 @@
ScopeLogger::GetInstance()->set_verbose_level(4);
}
monitor_.time_ = &time_;
+ time_val_.tv_sec = 0;
+ time_val_.tv_usec = 0;
EXPECT_CALL(time_, GetTimeMonotonic(_))
.WillRepeatedly(DoAll(SetArgumentPointee<0>(time_val_), Return(0)));
EXPECT_TRUE(local_ip_.SetAddressFromString(kLocalIPAddress));
@@ -345,8 +347,14 @@
HasSubstr("LinkMonitorResponseTimeSample"), GetTestPeriodMilliseconds(),
_, _, _)).Times(GetFailureThreshold());
StartMonitor();
+ // This value doesn't match real life (the timer in this scenario should
+ // advance by LinkMonitor::kTestPeriodMilliseconds), but this demonstrates
+ // the LinkMonitorSecondsToFailure independent from the response-time
+ // figures.
+ const int kTimeIncrement = 1000;
for (unsigned int i = 1; i < GetFailureThreshold(); ++i) {
ExpectTransmit(false);
+ AdvanceTime(kTimeIncrement);
TriggerRequestTimer();
EXPECT_FALSE(IsUnicast());
EXPECT_EQ(i, GetBroadcastFailureCount());
@@ -362,9 +370,17 @@
EXPECT_CALL(metrics_, SendEnumToUMA(
HasSubstr("LinkMonitorFailure"),
Metrics::kLinkMonitorFailureThresholdReached, _));
+ EXPECT_CALL(metrics_, SendToUMA(
+ HasSubstr("LinkMonitorSecondsToFailure"),
+ kTimeIncrement * GetFailureThreshold() / 1000, _, _, _));
+ EXPECT_CALL(metrics_, SendToUMA(
+ HasSubstr("BroadcastErrorsAtFailure"), GetFailureThreshold(), _, _, _));
+ EXPECT_CALL(metrics_, SendToUMA(
+ HasSubstr("UnicastErrorsAtFailure"), 0, _, _, _));
EXPECT_FALSE(GetSendRequestCallback().IsCancelled());
ExpectNoTransmit();
EXPECT_CALL(monitor_, FailureCallback());
+ AdvanceTime(kTimeIncrement);
TriggerRequestTimer();
ExpectReset();
}
@@ -405,6 +421,12 @@
EXPECT_CALL(metrics_, SendEnumToUMA(
HasSubstr("LinkMonitorFailure"),
Metrics::kLinkMonitorFailureThresholdReached, _));
+ EXPECT_CALL(metrics_, SendToUMA(
+ HasSubstr("LinkMonitorSecondsToFailure"), 0, _, _, _));
+ EXPECT_CALL(metrics_, SendToUMA(
+ HasSubstr("BroadcastErrorsAtFailure"), 0, _, _, _));
+ EXPECT_CALL(metrics_, SendToUMA(
+ HasSubstr("UnicastErrorsAtFailure"), GetFailureThreshold(), _, _, _));
EXPECT_FALSE(GetSendRequestCallback().IsCancelled());
ExpectNoTransmit();
EXPECT_CALL(monitor_, FailureCallback());
diff --git a/metrics.cc b/metrics.cc
index 725ef21..dc83a50 100644
--- a/metrics.cc
+++ b/metrics.cc
@@ -105,11 +105,24 @@
"Network.Shill.%s.LinkMonitorFailure";
const char Metrics::kMetricLinkMonitorResponseTimeSample[] =
"Network.Shill.%s.LinkMonitorResponseTimeSample";
-const int Metrics::kMetricLinkMonitorResponseTimeSampleMin = 0;
-const int Metrics::kMetricLinkMonitorResponseTimeSampleMax =
+const unsigned int Metrics::kMetricLinkMonitorResponseTimeSampleMin = 0;
+const unsigned int Metrics::kMetricLinkMonitorResponseTimeSampleMax =
LinkMonitor::kTestPeriodMilliseconds;
const int Metrics::kMetricLinkMonitorResponseTimeSampleNumBuckets = 50;
-
+const char Metrics::kMetricLinkMonitorSecondsToFailure[] =
+ "Network.Shill.%s.LinkMonitorSecondsToFailure";
+const unsigned int Metrics::kMetricLinkMonitorSecondsToFailureMin = 0;
+const unsigned int Metrics::kMetricLinkMonitorSecondsToFailureMax = 7200;
+const int Metrics::kMetricLinkMonitorSecondsToFailureNumBuckets = 50;
+const char Metrics::kMetricLinkMonitorBroadcastErrorsAtFailure[] =
+ "Network.Shill.%s.LinkMonitorBroadcastErrorsAtFailure";
+const char Metrics::kMetricLinkMonitorUnicastErrorsAtFailure[] =
+ "Network.Shill.%s.LinkMonitorUnicastErrorsAtFailure";
+const unsigned int Metrics::kMetricLinkMonitorErrorCountMin = 0;
+const unsigned int Metrics::kMetricLinkMonitorErrorCountMax =
+ LinkMonitor::kFailureThreshold;
+const int Metrics::kMetricLinkMonitorErrorCountNumBuckets =
+ LinkMonitor::kFailureThreshold + 1;
Metrics::Metrics()
: library_(&metrics_library_),
@@ -389,10 +402,41 @@
}
void Metrics::NotifyLinkMonitorFailure(
- Technology::Identifier technology, LinkMonitorFailure failure) {
+ Technology::Identifier technology,
+ LinkMonitorFailure failure,
+ unsigned int seconds_to_failure,
+ unsigned int broadcast_error_count,
+ unsigned int unicast_error_count) {
string histogram = GetFullMetricName(kMetricLinkMonitorFailure,
technology);
SendEnumToUMA(histogram, failure, kLinkMonitorFailureMax);
+
+ if (failure == kLinkMonitorFailureThresholdReached) {
+ if (seconds_to_failure > kMetricLinkMonitorSecondsToFailureMax) {
+ seconds_to_failure = kMetricLinkMonitorSecondsToFailureMax;
+ }
+ histogram = GetFullMetricName(kMetricLinkMonitorSecondsToFailure,
+ technology);
+ SendToUMA(histogram,
+ seconds_to_failure,
+ kMetricLinkMonitorSecondsToFailureMin,
+ kMetricLinkMonitorSecondsToFailureMax,
+ kMetricLinkMonitorSecondsToFailureNumBuckets);
+ histogram = GetFullMetricName(kMetricLinkMonitorBroadcastErrorsAtFailure,
+ technology);
+ SendToUMA(histogram,
+ broadcast_error_count,
+ kMetricLinkMonitorErrorCountMin,
+ kMetricLinkMonitorErrorCountMax,
+ kMetricLinkMonitorErrorCountNumBuckets);
+ histogram = GetFullMetricName(kMetricLinkMonitorUnicastErrorsAtFailure,
+ technology);
+ SendToUMA(histogram,
+ unicast_error_count,
+ kMetricLinkMonitorErrorCountMin,
+ kMetricLinkMonitorErrorCountMax,
+ kMetricLinkMonitorErrorCountNumBuckets);
+ }
}
void Metrics::NotifyLinkMonitorResponseTimeSampleAdded(
diff --git a/metrics.h b/metrics.h
index d241249..9e7ea0e 100644
--- a/metrics.h
+++ b/metrics.h
@@ -177,9 +177,18 @@
// LinkMonitor statistics.
static const char kMetricLinkMonitorFailure[];
static const char kMetricLinkMonitorResponseTimeSample[];
- static const int kMetricLinkMonitorResponseTimeSampleMin;
- static const int kMetricLinkMonitorResponseTimeSampleMax;
+ static const unsigned int kMetricLinkMonitorResponseTimeSampleMin;
+ static const unsigned int kMetricLinkMonitorResponseTimeSampleMax;
static const int kMetricLinkMonitorResponseTimeSampleNumBuckets;
+ static const char kMetricLinkMonitorSecondsToFailure[];
+ static const unsigned int kMetricLinkMonitorSecondsToFailureMin;
+ static const unsigned int kMetricLinkMonitorSecondsToFailureMax;
+ static const int kMetricLinkMonitorSecondsToFailureNumBuckets;
+ static const char kMetricLinkMonitorBroadcastErrorsAtFailure[];
+ static const char kMetricLinkMonitorUnicastErrorsAtFailure[];
+ static const unsigned int kMetricLinkMonitorErrorCountMin;
+ static const unsigned int kMetricLinkMonitorErrorCountMax;
+ static const int kMetricLinkMonitorErrorCountNumBuckets;
Metrics();
virtual ~Metrics();
@@ -229,7 +238,11 @@
// Notifies this object of a failure in LinkMonitor.
void NotifyLinkMonitorFailure(
- Technology::Identifier technology, LinkMonitorFailure failure);
+ Technology::Identifier technology,
+ LinkMonitorFailure failure,
+ unsigned int seconds_to_failure,
+ unsigned int broadcast_error_count,
+ unsigned int unicast_error_count);
// Notifies this object that LinkMonitor has added a response time sample
// for |connection| with a value of |response_time_milliseconds|.