blob: feba34f2937f4784a51424e9ab96b5af50026716 [file] [log] [blame]
/*
* Copyright (C) 2016 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Compile-time switch for the VLOG statements in this file: they only emit output when this
// constant is true.
constexpr bool kVerboseLogging = false;
// Defined ahead of the logging include below.
// NOTE(review): presumably picked up by <android-base/logging.h> as the tag for LOG() output —
// confirm.
#define LOG_TAG "res_stats"
#include <arpa/nameser.h>
#include <stdbool.h>
#include <string.h>
#include <android-base/logging.h>
#include "netd_resolv/stats.h"
// Verbose logging helper, used as `VLOG << ...;`. The stream expression sits in the else branch,
// so when kVerboseLogging is false it is never executed; when it is true the message goes to
// LOG(INFO).
#define VLOG if (!kVerboseLogging) {} else LOG(INFO)
// Unless RESOLV_ALLOW_VERBOSE_LOGGING is defined, refuse to compile with kVerboseLogging enabled.
#ifndef RESOLV_ALLOW_VERBOSE_LOGGING
static_assert(kVerboseLogging == false,
"Verbose logging floods logs at high-rate and exposes privacy-sensitive information. "
"Do not enable in release builds.");
#endif
// Calculate the round-trip time, in milliseconds, between start time t0 and end time t1.
//
// Note the argument order: the end time (t1) is passed first, the start time (t0) second.
// Each timestamp is truncated to whole milliseconds before subtracting, so the result is
// negative when t1 is earlier than t0. The difference is narrowed to int on return.
int _res_stats_calculate_rtt(const timespec* t1, const timespec* t0) {
    // Do the conversion in long long rather than long: where long is 32 bits wide,
    // tv_sec * 1000 overflows (undefined behaviour for signed types) as soon as tv_sec
    // exceeds roughly 2.1 million seconds.
    // Seconds are multiplied by one thousand and nanoseconds divided by one million to get ms.
    const long long ms0 = (long long) t0->tv_sec * 1000 + t0->tv_nsec / 1000000;
    const long long ms1 = (long long) t1->tv_sec * 1000 + t1->tv_nsec / 1000000;
    return (int) (ms1 - ms0);
}
// Fill in a sample used for calculating server reachability statistics.
//
// Stores the timestamp `now`, the response code `rcode` and the round-trip time `rtt` in
// *sample, overwriting whatever was there before.
void _res_stats_set_sample(res_sample* sample, time_t now, int rcode, int rtt) {
    // NOTE(review): the log label says "sec" although the value printed is `rtt`; the unit
    // depends on the caller — confirm before relying on the label.
    VLOG << __func__ << ": rcode = " << rcode << ", sec = " << rtt;

    auto& entry = *sample;
    entry.rtt = rtt;
    entry.rcode = rcode;
    entry.at = now;
}
/*
 * Clears all stored samples for the given server.
 *
 * Only the bookkeeping fields (sample count and next-write position) are reset to zero; the
 * sample slots themselves are left as they are.
 */
void _res_stats_clear_samples(res_stats* stats) {
    stats->sample_next = 0;
    stats->sample_count = 0;
}
/*
 * Aggregates the reachability statistics for the given server based on the stored samples.
 *
 * Outputs (all always written):
 *   successes         number of samples with rcode NOERROR, NOTAUTH or NXDOMAIN
 *   errors            number of samples with any other rcode not listed here
 *   timeouts          number of samples with rcode RCODE_TIMEOUT
 *   internal_errors   number of samples with rcode RCODE_INTERNAL_ERROR
 *   rtt_avg           integer average rtt over the successful samples, or -1 if there were none
 *   last_sample_time  timestamp of the most recently written sample, or 0 if there are none
 */
void android_net_res_stats_aggregate(res_stats* stats, int* successes, int* errors, int* timeouts,
                                     int* internal_errors, int* rtt_avg, time_t* last_sample_time) {
    int ok_count = 0;
    int error_count = 0;
    int timeout_count = 0;
    int internal_error_count = 0;
    long ok_rtt_total = 0;  // sum of rtt over successful samples only

    for (int idx = 0; idx < stats->sample_count; ++idx) {
        const auto& sample = stats->samples[idx];
        // Treat everything as an error that the code in send_dg() already considers a
        // rejection by the server, i.e. SERVFAIL, NOTIMP and REFUSED. Assume that NXDOMAIN
        // and NOTAUTH can actually occur for user queries. NOERROR with empty answer section
        // is not treated as an error here either. FORMERR seems to sometimes be returned by
        // some versions of BIND in response to DNSSEC or EDNS0. Whether to treat such
        // responses as an indication of a broken server is unclear, though. For now treat
        // such responses, as well as unknown codes, as errors.
        switch (sample.rcode) {
            case RCODE_INTERNAL_ERROR:
                ++internal_error_count;
                break;
            case RCODE_TIMEOUT:
                ++timeout_count;
                break;
            case NXDOMAIN:
            case NOTAUTH:
            case NOERROR:
                ok_rtt_total += sample.rtt;
                ++ok_count;
                break;
            case REFUSED:
            case NOTIMP:
            case SERVFAIL:
            default:
                ++error_count;
                break;
        }
    }

    *internal_errors = internal_error_count;
    *timeouts = timeout_count;
    *errors = error_count;
    *successes = ok_count;

    /* The average RTT is only defined if at least one sample was successful. */
    if (ok_count == 0) {
        *rtt_avg = -1;
    } else {
        *rtt_avg = ok_rtt_total / ok_count;
    }

    /*
     * The newest sample is the slot just before sample_next; when sample_next is 0, it is
     * the slot at index sample_count - 1 instead.
     */
    time_t newest = 0;
    if (stats->sample_count > 0) {
        const int newest_idx =
                (stats->sample_next > 0 ? stats->sample_next : stats->sample_count) - 1;
        newest = stats->samples[newest_idx].at;
    }
    *last_sample_time = newest;
}
// Returns true if the server is considered usable, false if it should be ignored.
//
// A server is ignored only when all of the following hold:
//   - at least params->min_samples samples (successes + errors + timeouts) are stored,
//   - at least one of them is an error or a timeout,
//   - the success rate in percent is below params->success_threshold, and
//   - the newest sample is not older than params->sample_validity.
// If the success rate is too low but the samples are older than that, the stored samples are
// cleared and the server is considered usable again. Internal errors are logged but are not
// part of the total used for the success rate.
static bool res_stats_usable_server(const res_params* params, res_stats* stats) {
    int successes = -1;
    int errors = -1;
    int timeouts = -1;
    int internal_errors = -1;
    int rtt_avg = -1;
    time_t last_sample_time = 0;
    android_net_res_stats_aggregate(stats, &successes, &errors, &timeouts, &internal_errors,
                                    &rtt_avg, &last_sample_time);

    if (successes < 0 || errors < 0 || timeouts < 0) {
        return true;
    }

    const int total = successes + errors + timeouts;
    VLOG << "NS stats: S " << successes
         << " + E " << errors
         << " + T " << timeouts
         << " + I " << internal_errors
         << " = " << total
         << ", rtt = " << rtt_avg
         << ", min_samples = " << params->min_samples;

    // Not enough samples to judge the server.
    if (total < params->min_samples) {
        return true;
    }
    // Nothing but successes: no need to compute a rate.
    if (errors <= 0 && timeouts <= 0) {
        return true;
    }

    const int success_rate = successes * 100 / total;
    VLOG << "success rate " << success_rate;
    if (success_rate >= params->success_threshold) {
        return true;
    }

    const time_t age = time(NULL) - last_sample_time;
    if (age > params->sample_validity) {
        // Note: It might be worth considering to expire old servers after their expiry
        // date has been reached, however the code for returning the ring buffer to its
        // previous non-circular state would induce additional complexity.
        VLOG << "samples stale, retrying server";
        _res_stats_clear_samples(stats);
        return true;
    }

    VLOG << "too many resolution errors, ignoring server";
    return false;
}
// Determines which of the nscount servers are usable.
//
// usable_servers[i] is set to the verdict of res_stats_usable_server() for stats[i]. If that
// leaves no server marked usable, every entry is set to true instead.
void android_net_res_stats_get_usable_servers(const res_params* params, res_stats stats[],
                                              int nscount, bool usable_servers[]) {
    bool found_usable = false;
    for (int i = 0; i < nscount; ++i) {
        const bool ok = res_stats_usable_server(params, &stats[i]);
        usable_servers[i] = ok;
        found_usable = found_usable || ok;
    }
    if (found_usable) {
        return;
    }

    // If there are no usable servers, consider all of them usable.
    // TODO: Explore other possibilities, such as enabling only the best N servers, etc.
    for (int i = 0; i < nscount; ++i) {
        usable_servers[i] = true;
    }
}