blob: 7ac5a4c0a0dba75a1e7317b1feece4590baa53bb [file] [log] [blame]
/*
 * Copyright (C) 2016 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
Bernie Innocenti55864192018-08-30 04:05:20 +090017#include <arpa/nameser.h>
Bernie Innocentif12d5bb2018-08-31 14:09:46 +090018#include <stdbool.h>
Bernie Innocenti55864192018-08-30 04:05:20 +090019#include <string.h>
20
21#include <async_safe/log.h>
22
23#include "isc/eventlib.h"
24#include "resolv_stats.h"
25
26#define DBG 0
27
/*
 * Calculate the round-trip time in milliseconds from start time t0 to end time t1.
 *
 * Each timestamp's nanosecond field is truncated to whole milliseconds before the
 * subtraction, so the result equals the difference of the two timestamps expressed in
 * whole milliseconds.
 *
 * The seconds are subtracted before being scaled to milliseconds, and the arithmetic is
 * carried out in long long. Scaling an absolute timestamp (seconds * 1000) would overflow
 * on targets where long / time_t are 32 bits wide, which is undefined behavior.
 *
 * t1: end time.
 * t0: start time.
 * Returns (t1 - t0) in milliseconds, converted to int; negative if t1 precedes t0.
 */
int _res_stats_calculate_rtt(const struct timespec* t1, const struct timespec* t0) {
    // Scale only the (small) difference of the seconds, never an absolute timestamp.
    const long long sec_delta_ms = ((long long) t1->tv_sec - (long long) t0->tv_sec) * 1000;
    // Divide ns by one million to get ms; truncate each side separately before subtracting.
    const long long msec_delta = (long long) (t1->tv_nsec / 1000000) - (t0->tv_nsec / 1000000);
    return (int) (sec_delta_ms + msec_delta);
}
35
36/* Create a sample for calculating server reachability statistics. */
Bernie Innocentif12d5bb2018-08-31 14:09:46 +090037void _res_stats_set_sample(struct __res_sample* sample, time_t now, int rcode, int rtt) {
Bernie Innocenti55864192018-08-30 04:05:20 +090038 if (DBG) {
39 async_safe_format_log(ANDROID_LOG_INFO, "libc", "rcode = %d, sec = %d", rcode, rtt);
40 }
41 sample->at = now;
42 sample->rcode = rcode;
43 sample->rtt = rtt;
44}
45
46/* Clears all stored samples for the given server. */
Bernie Innocentif12d5bb2018-08-31 14:09:46 +090047void _res_stats_clear_samples(struct __res_stats* stats) {
Bernie Innocenti55864192018-08-30 04:05:20 +090048 stats->sample_count = stats->sample_next = 0;
49}
50
51/* Aggregates the reachability statistics for the given server based on on the stored samples. */
Bernie Innocentif12d5bb2018-08-31 14:09:46 +090052void android_net_res_stats_aggregate(struct __res_stats* stats, int* successes, int* errors,
53 int* timeouts, int* internal_errors, int* rtt_avg,
54 time_t* last_sample_time) {
Bernie Innocenti55864192018-08-30 04:05:20 +090055 int s = 0; // successes
56 int e = 0; // errors
57 int t = 0; // timouts
58 int ie = 0; // internal errors
59 long rtt_sum = 0;
60 time_t last = 0;
61 int rtt_count = 0;
Bernie Innocentif12d5bb2018-08-31 14:09:46 +090062 for (int i = 0; i < stats->sample_count; ++i) {
Bernie Innocenti55864192018-08-30 04:05:20 +090063 // Treat everything as an error that the code in send_dg() already considers a
64 // rejection by the server, i.e. SERVFAIL, NOTIMP and REFUSED. Assume that NXDOMAIN
65 // and NOTAUTH can actually occur for user queries. NOERROR with empty answer section
66 // is not treated as an error here either. FORMERR seems to sometimes be returned by
67 // some versions of BIND in response to DNSSEC or EDNS0. Whether to treat such responses
68 // as an indication of a broken server is unclear, though. For now treat such responses,
69 // as well as unknown codes as errors.
70 switch (stats->samples[i].rcode) {
Bernie Innocentif12d5bb2018-08-31 14:09:46 +090071 case NOERROR:
72 case NOTAUTH:
73 case NXDOMAIN:
74 ++s;
75 rtt_sum += stats->samples[i].rtt;
76 ++rtt_count;
77 break;
78 case RCODE_TIMEOUT:
79 ++t;
80 break;
81 case RCODE_INTERNAL_ERROR:
82 ++ie;
83 break;
84 case SERVFAIL:
85 case NOTIMP:
86 case REFUSED:
87 default:
88 ++e;
89 break;
Bernie Innocenti55864192018-08-30 04:05:20 +090090 }
91 }
92 *successes = s;
93 *errors = e;
94 *timeouts = t;
95 *internal_errors = ie;
96 /* If there was at least one successful sample, calculate average RTT. */
97 if (rtt_count) {
98 *rtt_avg = rtt_sum / rtt_count;
99 } else {
100 *rtt_avg = -1;
101 }
102 /* If we had at least one sample, populate last sample time. */
103 if (stats->sample_count > 0) {
104 if (stats->sample_next > 0) {
105 last = stats->samples[stats->sample_next - 1].at;
106 } else {
107 last = stats->samples[stats->sample_count - 1].at;
108 }
109 }
110 *last_sample_time = last;
111}
112
Bernie Innocentif12d5bb2018-08-31 14:09:46 +0900113bool _res_stats_usable_server(const struct __res_params* params, struct __res_stats* stats) {
Bernie Innocenti55864192018-08-30 04:05:20 +0900114 int successes = -1;
115 int errors = -1;
116 int timeouts = -1;
117 int internal_errors = -1;
118 int rtt_avg = -1;
119 time_t last_sample_time = 0;
120 android_net_res_stats_aggregate(stats, &successes, &errors, &timeouts, &internal_errors,
Bernie Innocentif12d5bb2018-08-31 14:09:46 +0900121 &rtt_avg, &last_sample_time);
Bernie Innocenti55864192018-08-30 04:05:20 +0900122 if (successes >= 0 && errors >= 0 && timeouts >= 0) {
123 int total = successes + errors + timeouts;
124 if (DBG) {
Bernie Innocentif12d5bb2018-08-31 14:09:46 +0900125 async_safe_format_log(ANDROID_LOG_DEBUG, "libc",
126 "NS stats: S %d + E %d + T %d + I %d "
127 "= %d, rtt = %d, min_samples = %d\n",
128 successes, errors, timeouts, internal_errors, total, rtt_avg,
129 params->min_samples);
Bernie Innocenti55864192018-08-30 04:05:20 +0900130 }
131 if (total >= params->min_samples && (errors > 0 || timeouts > 0)) {
132 int success_rate = successes * 100 / total;
133 if (DBG) {
134 async_safe_format_log(ANDROID_LOG_DEBUG, "libc", "success rate %d%%\n",
135 success_rate);
136 }
137 if (success_rate < params->success_threshold) {
138 // evNowTime() is used here instead of time() to stay consistent with the rest of
139 // the code base
140 time_t now = evNowTime().tv_sec;
141 if (now - last_sample_time > params->sample_validity) {
142 // Note: It might be worth considering to expire old servers after their expiry
143 // date has been reached, however the code for returning the ring buffer to its
144 // previous non-circular state would induce additional complexity.
145 if (DBG) {
146 async_safe_format_log(ANDROID_LOG_INFO, "libc",
Bernie Innocentif12d5bb2018-08-31 14:09:46 +0900147 "samples stale, retrying server\n");
Bernie Innocenti55864192018-08-30 04:05:20 +0900148 }
149 _res_stats_clear_samples(stats);
150 } else {
151 if (DBG) {
152 async_safe_format_log(ANDROID_LOG_INFO, "libc",
Bernie Innocentif12d5bb2018-08-31 14:09:46 +0900153 "too many resolution errors, ignoring server\n");
Bernie Innocenti55864192018-08-30 04:05:20 +0900154 }
155 return 0;
156 }
157 }
158 }
159 }
160 return 1;
161}
162
Bernie Innocentif12d5bb2018-08-31 14:09:46 +0900163void android_net_res_stats_get_usable_servers(const struct __res_params* params,
164 struct __res_stats stats[], int nscount,
165 bool usable_servers[]) {
Bernie Innocenti55864192018-08-30 04:05:20 +0900166 unsigned usable_servers_found = 0;
167 for (int ns = 0; ns < nscount; ns++) {
168 bool usable = _res_stats_usable_server(params, &stats[ns]);
169 if (usable) {
170 ++usable_servers_found;
171 }
172 usable_servers[ns] = usable;
173 }
174 // If there are no usable servers, consider all of them usable.
175 // TODO: Explore other possibilities, such as enabling only the best N servers, etc.
176 if (usable_servers_found == 0) {
177 for (int ns = 0; ns < nscount; ns++) {
178 usable_servers[ns] = true;
179 }
180 }
181}