blob: 4001206075197bab81bddfa0df2b3689ef115b61 [file] [log] [blame]
Bernie Innocenti55864192018-08-30 04:05:20 +09001/*
2 * Copyright (C) 2016 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
Bernie Innocentie9ba09c2018-09-12 23:20:10 +090017constexpr bool kVerboseLogging = false;
18#define LOG_TAG "res_stats"
19
Bernie Innocenti55864192018-08-30 04:05:20 +090020#include <arpa/nameser.h>
Bernie Innocentif12d5bb2018-08-31 14:09:46 +090021#include <stdbool.h>
Bernie Innocenti55864192018-08-30 04:05:20 +090022#include <string.h>
23
Bernie Innocentie9ba09c2018-09-12 23:20:10 +090024#include <android-base/logging.h>
Bernie Innocenti55864192018-08-30 04:05:20 +090025
Bernie Innocenti189eb502018-10-01 23:10:18 +090026#include "netd_resolv/stats.h"
Bernie Innocenti55864192018-08-30 04:05:20 +090027
Bernie Innocentie9ba09c2018-09-12 23:20:10 +090028#define VLOG if (!kVerboseLogging) {} else LOG(INFO)
29
30#ifndef RESOLV_ALLOW_VERBOSE_LOGGING
31static_assert(kVerboseLogging == false,
32 "Verbose logging floods logs at high-rate and exposes privacy-sensitive information. "
33 "Do not enable in release builds.");
34#endif
Bernie Innocenti55864192018-08-30 04:05:20 +090035
// Calculate the round-trip time, in milliseconds, from start time t0 to end time t1.
//
// Note the argument order: the end time (t1) is passed first. Each timestamp is truncated to
// whole milliseconds before the subtraction, and the result can be negative when t1 precedes t0.
int _res_stats_calculate_rtt(const struct timespec* t1, const struct timespec* t0) {
    // Subtract the seconds first and only then scale to milliseconds. Scaling an absolute
    // tv_sec by 1000 overflows a 32-bit long as soon as tv_sec exceeds roughly 2.1 million
    // (e.g. any epoch-based timestamp), and signed overflow is undefined behavior.
    const long ms_from_sec = (long) (t1->tv_sec - t0->tv_sec) * 1000;
    // Dividing nanoseconds by one million yields whole milliseconds; truncating each side
    // separately keeps the result identical to subtracting two millisecond timestamps.
    const long ms_from_nsec = t1->tv_nsec / 1000000 - t0->tv_nsec / 1000000;
    return (int) (ms_from_sec + ms_from_nsec);
}
43
Bernie Innocentiee1b85b2018-09-25 14:23:19 +090044// Create a sample for calculating server reachability statistics.
Bernie Innocenti189eb502018-10-01 23:10:18 +090045void _res_stats_set_sample(res_sample* sample, time_t now, int rcode, int rtt) {
Bernie Innocentie9ba09c2018-09-12 23:20:10 +090046 VLOG << __func__ << ": rcode = " << rcode << ", sec = " << rtt;
Bernie Innocenti55864192018-08-30 04:05:20 +090047 sample->at = now;
48 sample->rcode = rcode;
49 sample->rtt = rtt;
50}
51
52/* Clears all stored samples for the given server. */
Bernie Innocenti189eb502018-10-01 23:10:18 +090053void _res_stats_clear_samples(res_stats* stats) {
Bernie Innocenti55864192018-08-30 04:05:20 +090054 stats->sample_count = stats->sample_next = 0;
55}
56
57/* Aggregates the reachability statistics for the given server based on on the stored samples. */
Bernie Innocenti189eb502018-10-01 23:10:18 +090058void android_net_res_stats_aggregate(res_stats* stats, int* successes, int* errors, int* timeouts,
59 int* internal_errors, int* rtt_avg, time_t* last_sample_time) {
Bernie Innocenti55864192018-08-30 04:05:20 +090060 int s = 0; // successes
61 int e = 0; // errors
62 int t = 0; // timouts
63 int ie = 0; // internal errors
64 long rtt_sum = 0;
65 time_t last = 0;
66 int rtt_count = 0;
Bernie Innocentif12d5bb2018-08-31 14:09:46 +090067 for (int i = 0; i < stats->sample_count; ++i) {
Bernie Innocenti55864192018-08-30 04:05:20 +090068 // Treat everything as an error that the code in send_dg() already considers a
69 // rejection by the server, i.e. SERVFAIL, NOTIMP and REFUSED. Assume that NXDOMAIN
70 // and NOTAUTH can actually occur for user queries. NOERROR with empty answer section
71 // is not treated as an error here either. FORMERR seems to sometimes be returned by
72 // some versions of BIND in response to DNSSEC or EDNS0. Whether to treat such responses
73 // as an indication of a broken server is unclear, though. For now treat such responses,
74 // as well as unknown codes as errors.
75 switch (stats->samples[i].rcode) {
Bernie Innocentif12d5bb2018-08-31 14:09:46 +090076 case NOERROR:
77 case NOTAUTH:
78 case NXDOMAIN:
79 ++s;
80 rtt_sum += stats->samples[i].rtt;
81 ++rtt_count;
82 break;
83 case RCODE_TIMEOUT:
84 ++t;
85 break;
86 case RCODE_INTERNAL_ERROR:
87 ++ie;
88 break;
89 case SERVFAIL:
90 case NOTIMP:
91 case REFUSED:
92 default:
93 ++e;
94 break;
Bernie Innocenti55864192018-08-30 04:05:20 +090095 }
96 }
97 *successes = s;
98 *errors = e;
99 *timeouts = t;
100 *internal_errors = ie;
101 /* If there was at least one successful sample, calculate average RTT. */
102 if (rtt_count) {
103 *rtt_avg = rtt_sum / rtt_count;
104 } else {
105 *rtt_avg = -1;
106 }
107 /* If we had at least one sample, populate last sample time. */
108 if (stats->sample_count > 0) {
109 if (stats->sample_next > 0) {
110 last = stats->samples[stats->sample_next - 1].at;
111 } else {
112 last = stats->samples[stats->sample_count - 1].at;
113 }
114 }
115 *last_sample_time = last;
116}
117
Bernie Innocentiee1b85b2018-09-25 14:23:19 +0900118// Returns true if the server is considered unusable, i.e. if the success rate is not lower than the
119// threshold for the stored stored samples. If not enough samples are stored, the server is
120// considered usable.
Bernie Innocenti189eb502018-10-01 23:10:18 +0900121static bool res_stats_usable_server(const struct __res_params* params, res_stats* stats) {
Bernie Innocenti55864192018-08-30 04:05:20 +0900122 int successes = -1;
123 int errors = -1;
124 int timeouts = -1;
125 int internal_errors = -1;
126 int rtt_avg = -1;
127 time_t last_sample_time = 0;
128 android_net_res_stats_aggregate(stats, &successes, &errors, &timeouts, &internal_errors,
Bernie Innocentif12d5bb2018-08-31 14:09:46 +0900129 &rtt_avg, &last_sample_time);
Bernie Innocenti55864192018-08-30 04:05:20 +0900130 if (successes >= 0 && errors >= 0 && timeouts >= 0) {
131 int total = successes + errors + timeouts;
Bernie Innocentie9ba09c2018-09-12 23:20:10 +0900132 VLOG << "NS stats: S " << successes
133 << " + E " << errors
134 << " + T " << timeouts
135 << " + I " << internal_errors
136 << " = " << total
137 << ", rtt = " << rtt_avg
138 << ", min_samples = " << params->min_samples;
Bernie Innocenti55864192018-08-30 04:05:20 +0900139 if (total >= params->min_samples && (errors > 0 || timeouts > 0)) {
140 int success_rate = successes * 100 / total;
Bernie Innocentie9ba09c2018-09-12 23:20:10 +0900141 VLOG << "success rate " << success_rate;
Bernie Innocenti55864192018-08-30 04:05:20 +0900142 if (success_rate < params->success_threshold) {
Bernie Innocentif89b3512018-08-30 07:34:37 +0900143 time_t now = time(NULL);
Bernie Innocenti55864192018-08-30 04:05:20 +0900144 if (now - last_sample_time > params->sample_validity) {
145 // Note: It might be worth considering to expire old servers after their expiry
146 // date has been reached, however the code for returning the ring buffer to its
147 // previous non-circular state would induce additional complexity.
Bernie Innocentie9ba09c2018-09-12 23:20:10 +0900148 VLOG << "samples stale, retrying server";
Bernie Innocenti55864192018-08-30 04:05:20 +0900149 _res_stats_clear_samples(stats);
150 } else {
Bernie Innocentie9ba09c2018-09-12 23:20:10 +0900151 VLOG << "too many resolution errors, ignoring server";
Bernie Innocenti55864192018-08-30 04:05:20 +0900152 return 0;
153 }
154 }
155 }
156 }
157 return 1;
158}
159
Bernie Innocenti189eb502018-10-01 23:10:18 +0900160void android_net_res_stats_get_usable_servers(const struct __res_params* params, res_stats stats[],
161 int nscount, bool usable_servers[]) {
Bernie Innocenti55864192018-08-30 04:05:20 +0900162 unsigned usable_servers_found = 0;
163 for (int ns = 0; ns < nscount; ns++) {
Bernie Innocentiee1b85b2018-09-25 14:23:19 +0900164 bool usable = res_stats_usable_server(params, &stats[ns]);
Bernie Innocenti55864192018-08-30 04:05:20 +0900165 if (usable) {
166 ++usable_servers_found;
167 }
168 usable_servers[ns] = usable;
169 }
170 // If there are no usable servers, consider all of them usable.
171 // TODO: Explore other possibilities, such as enabling only the best N servers, etc.
172 if (usable_servers_found == 0) {
173 for (int ns = 0; ns < nscount; ns++) {
174 usable_servers[ns] = true;
175 }
176 }
177}