#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# tcpcong  Measure tcp congestion control status duration.
#          For Linux, uses BCC, eBPF.
#
# USAGE: tcpcong [-h] [-T] [-L] [-R] [-u] [-d] [interval] [outputs]
#
# Copyright (c) Ping Gan.
#
# 27-Jan-2022   Ping Gan   Created this.
12
13from __future__ import print_function
14from bcc import BPF
15from time import sleep, strftime
16from struct import pack
17from socket import inet_ntop, AF_INET, AF_INET6
18from struct import pack
19import argparse
20
# Usage examples shown verbatim at the end of --help (RawDescriptionHelpFormatter
# keeps the layout as written).
examples = """examples:
    ./tcpcong                 # show tcp congestion status duration
    ./tcpcong 1 10            # show 1 second summaries, 10 times
    ./tcpcong -L 3000-3006 1  # 1s summaries, local port 3000-3006
    ./tcpcong -R 5000-5005 1  # 1s summaries, remote port 5000-5005
    ./tcpcong -uT 1           # 1s summaries, microseconds, and timestamps
    ./tcpcong -d              # show the duration as histograms
"""

parser = argparse.ArgumentParser(
    description="Summarize tcp socket congestion control status duration",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
# -L / -R take either a single port ("3000") or a range ("3000-3006");
# the value is parsed further down in the script.
parser.add_argument("-L", "--localport",
    help="trace local ports only")
parser.add_argument("-R", "--remoteport",
    help="trace the dest ports only")
parser.add_argument("-T", "--timestamp", action="store_true",
    help="include timestamp on output")
parser.add_argument("-d", "--dist", action="store_true",
    help="show distributions as histograms")
parser.add_argument("-u", "--microseconds", action="store_true",
    help="output in microseconds")
# interval/outputs are deliberately left as strings when given on the command
# line (no type=int): later code tests the truthiness of args.interval and
# converts with int() at the point of use.
parser.add_argument("interval", nargs="?", default=99999999,
    help="output interval, in seconds")
parser.add_argument("outputs", nargs="?", default=99999999,
    help="number of outputs")
# Hidden switch: print the generated BPF C program and exit.
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
countdown = int(args.outputs)   # number of reports still to print
debug = 0                       # set to 1 to dump the BPF program before loading
53
def _parse_port_range(spec, what):
    """Parse a "PORT" or "START-END" option value into an ordered pair.

    spec: the raw option string, or None/empty when the option was not given.
    what: "remote" or "local"; only used in the error message.

    Returns (start, end) with start <= end, or (-1, -1) when spec is empty.
    Prints "unrecognized <what> port range" and exits with status 1 when the
    value has the wrong shape or contains a non-numeric port.
    """
    if not spec:
        return -1, -1
    parts = spec.split("-")
    try:
        if len(parts) not in (1, 2):
            raise ValueError(spec)
        # For a single port, parts[0] and parts[-1] are the same element.
        first = int(parts[0])
        last = int(parts[-1])
    except ValueError:
        print("unrecognized %s port range" % what)
        exit(1)
    # Accept ranges written backwards, e.g. "3006-3000".
    return min(first, last), max(first, last)


start_rport, end_rport = _parse_port_range(args.remoteport, "remote")
start_lport, end_lport = _parse_port_range(args.localport, "local")
87
# define BPF program
#
# The C text below is a template.  The upper-case tokens are placeholders that
# later code in this file substitutes with str.replace():
#   SOCK_STORE_DEF / SOCK_STORE_ADD / SOCK_STORE_DEL
#   FILTER_LPORT / FILTER_DPORT
#   DEF_TEXT, STATE_KEY, HIST_TABLE, STORE
# str.replace() matches substrings, so "STORE" also matches inside the
# "SOCK_STORE_*" tokens; those must already have been replaced when "STORE" is
# substituted.  Do not use any of these words in comments inside the C text.
#
# NOTE(review): both update functions read the single byte located immediately
# before icsk->icsk_retransmits and decode it with cong_status_t; presumably
# that byte holds the icsk_ca_state bit-field of struct inet_connection_sock --
# confirm against the target kernel's layout.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <net/sock.h>
#include <bcc/proto.h>
#include <net/tcp.h>
#include <net/inet_connection_sock.h>

typedef struct ipv4_flow_key {
    u32 saddr;
    u32 daddr;
    u16 lport;
    u16 dport;
} ipv4_flow_key_t;

typedef struct ipv6_flow_key {
    unsigned __int128 saddr;
    unsigned __int128 daddr;
    u16 lport;
    u16 dport;
} ipv6_flow_key_t;

typedef struct process_key {
    char comm[TASK_COMM_LEN];
    u32 tid;
} process_key_t;

typedef struct ipv4_flow_val {
    ipv4_flow_key_t ipv4_key;
    u16 cong_state;
} ipv4_flow_val_t;

typedef struct ipv6_flow_val {
    ipv6_flow_key_t ipv6_key;
    u16 cong_state;
} ipv6_flow_val_t;

BPF_HASH(start_ipv4, process_key_t, ipv4_flow_val_t);
BPF_HASH(start_ipv6, process_key_t, ipv6_flow_val_t);
SOCK_STORE_DEF

typedef struct data_val {
    DEF_TEXT
    u64 last_ts;
    u16 last_cong_stat;
} data_val_t;

typedef struct cong {
    u8 cong_stat:5,
       ca_inited:1,
       ca_setsockopt:1,
       ca_dstlocked:1;
} cong_status_t;

BPF_HASH(ipv4_stat, ipv4_flow_key_t, data_val_t);
BPF_HASH(ipv6_stat, ipv6_flow_key_t, data_val_t);

HIST_TABLE

static int entry_state_update_func(struct sock *sk)
{
    u16 dport = 0, lport = 0;
    u32 tid = bpf_get_current_pid_tgid();
    process_key_t key = {0};
    bpf_get_current_comm(&key.comm, sizeof(key.comm));
    key.tid = tid;

    u64 family = sk->__sk_common.skc_family;
    struct inet_connection_sock *icsk = inet_csk(sk);
    cong_status_t cong_status;
    bpf_probe_read_kernel(&cong_status, sizeof(cong_status),
        (void *)((long)&icsk->icsk_retransmits) - 1);
    if (family == AF_INET) {
        ipv4_flow_val_t ipv4_val = {0};
        ipv4_val.ipv4_key.saddr = sk->__sk_common.skc_rcv_saddr;
        ipv4_val.ipv4_key.daddr = sk->__sk_common.skc_daddr;
        ipv4_val.ipv4_key.lport = sk->__sk_common.skc_num;
        dport = sk->__sk_common.skc_dport;
        dport = ntohs(dport);
        lport = ipv4_val.ipv4_key.lport;
        FILTER_LPORT
        FILTER_DPORT
        ipv4_val.ipv4_key.dport = dport;
        ipv4_val.cong_state = cong_status.cong_stat + 1;
        start_ipv4.update(&key, &ipv4_val);
    } else if (family == AF_INET6) {
        ipv6_flow_val_t ipv6_val = {0};
        bpf_probe_read_kernel(&ipv6_val.ipv6_key.saddr,
            sizeof(ipv6_val.ipv6_key.saddr),
            &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
        bpf_probe_read_kernel(&ipv6_val.ipv6_key.daddr,
            sizeof(ipv6_val.ipv6_key.daddr),
            &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
        ipv6_val.ipv6_key.lport = sk->__sk_common.skc_num;
        dport = sk->__sk_common.skc_dport;
        dport = ntohs(dport);
        lport = ipv6_val.ipv6_key.lport;
        FILTER_LPORT
        FILTER_DPORT
        ipv6_val.ipv6_key.dport = dport;
        ipv6_val.cong_state = cong_status.cong_stat + 1;
        start_ipv6.update(&key, &ipv6_val);
    }
    SOCK_STORE_ADD
    return 0;
}

static int ret_state_update_func(struct sock *sk)
{
    u64 ts, ts1;
    u16 family, last_cong_state;
    u16 dport = 0, lport = 0;
    u32 tid = bpf_get_current_pid_tgid();
    process_key_t key = {0};
    bpf_get_current_comm(&key.comm, sizeof(key.comm));
    key.tid = tid;

    struct inet_connection_sock *icsk = inet_csk(sk);
    cong_status_t cong_status;
    bpf_probe_read_kernel(&cong_status, sizeof(cong_status),
        (void *)((long)&icsk->icsk_retransmits) - 1);
    data_val_t *datap, data = {0};
    STATE_KEY
    bpf_probe_read_kernel(&family, sizeof(family),
        &sk->__sk_common.skc_family);
    if (family == AF_INET) {
        ipv4_flow_val_t *val4 = start_ipv4.lookup(&key);
        if (val4 == 0) {
            SOCK_STORE_DEL
            return 0; //missed
        }
        ipv4_flow_key_t keyv4 = {0};
        bpf_probe_read_kernel(&keyv4, sizeof(ipv4_flow_key_t),
            &(val4->ipv4_key));
        dport = keyv4.dport;
        lport = keyv4.lport;
        FILTER_LPORT
        FILTER_DPORT
        datap = ipv4_stat.lookup(&keyv4);
        if (datap == 0) {
            data.last_ts = bpf_ktime_get_ns();
            data.last_cong_stat = val4->cong_state;
            ipv4_stat.update(&keyv4, &data);
        } else {
            last_cong_state = val4->cong_state;
            if ((cong_status.cong_stat + 1) != last_cong_state) {
                ts1 = bpf_ktime_get_ns();
                ts = ts1 - datap->last_ts;
                datap->last_ts = ts1;
                datap->last_cong_stat = cong_status.cong_stat + 1;
                ts /= 1000;
                STORE
            }
        }
        start_ipv4.delete(&key);
    } else if (family == AF_INET6) {
        ipv6_flow_val_t *val6 = start_ipv6.lookup(&key);
        if (val6 == 0) {
            SOCK_STORE_DEL
            return 0; //missed
        }
        ipv6_flow_key_t keyv6 = {0};
        bpf_probe_read_kernel(&keyv6, sizeof(ipv6_flow_key_t),
            &(val6->ipv6_key));
        dport = keyv6.dport;
        lport = keyv6.lport;
        FILTER_LPORT
        FILTER_DPORT
        datap = ipv6_stat.lookup(&keyv6);
        if (datap == 0) {
            data.last_ts = bpf_ktime_get_ns();
            data.last_cong_stat = val6->cong_state;
            ipv6_stat.update(&keyv6, &data);
        } else {
            last_cong_state = val6->cong_state;
            if ((cong_status.cong_stat + 1) != last_cong_state) {
                ts1 = bpf_ktime_get_ns();
                ts = ts1 - datap->last_ts;
                datap->last_ts = ts1;
                datap->last_cong_stat = (cong_status.cong_stat + 1);
                ts /= 1000;
                STORE
            }
        }
        start_ipv6.delete(&key);
    }
    SOCK_STORE_DEL
    return 0;
}
"""
278
# kprobe/kretprobe entry points, appended to bpf_text when kfunc probes are not
# available.  The return probe has no access to the traced function's
# arguments, so it looks up the socket pointer in the sock_store hash under the
# (comm, tid) key and returns early when no entry is found.
kprobe_program = """
int entry_func(struct pt_regs *ctx, struct sock *sk)
{
    return entry_state_update_func(sk);
}

int ret_func(struct pt_regs *ctx)
{
    u32 tid = bpf_get_current_pid_tgid();
    process_key_t key = {0};
    bpf_get_current_comm(&key.comm, sizeof(key.comm));
    key.tid = tid;
    struct sock **sockpp;
    sockpp = sock_store.lookup(&key);
    if (sockpp == 0) {
        return 0; //miss the entry
    }
    struct sock *sk = *sockpp;
    return ret_state_update_func(sk);
}
"""
300
# kfunc/kretfunc entry points, appended to bpf_text when the kernel supports
# kfunc probes.  Every traced kernel function gets the same entry/return pair,
# so the text is generated from one template rather than written out five
# times.
_kfunc_probe_template = """
KFUNC_PROBE(%(fn)s, struct sock *sk)
{
    return entry_state_update_func(sk);
}

KRETFUNC_PROBE(%(fn)s, struct sock *sk)
{
    return ret_state_update_func(sk);
}
"""

kfunc_program = "".join(
    _kfunc_probe_template % {"fn": fn}
    for fn in (
        "tcp_fastretrans_alert",
        "tcp_enter_cwr",
        "tcp_enter_loss",
        "tcp_enter_recovery",
        "tcp_process_tlp_ack",
    )
)
352
# code replace
#
# Select the probe flavour.  With kfunc probes both the entry and the return
# probe receive the socket argument, so the SOCK_STORE_* placeholders expand to
# nothing.  With kprobes the entry probe saves the socket pointer in the
# sock_store hash for the return probe to pick up.
#
# These SOCK_STORE_* substitutions must stay ahead of the later replacement of
# the 'STORE' placeholder, because 'STORE' is a substring of each of them.
is_support_kfunc = BPF.support_kfunc()
if is_support_kfunc:
    bpf_text += kfunc_program
    bpf_text = bpf_text.replace('SOCK_STORE_DEF', '')
    bpf_text = bpf_text.replace('SOCK_STORE_ADD', '')
    bpf_text = bpf_text.replace('SOCK_STORE_DEL', '')
else:
    bpf_text += kprobe_program
    bpf_text = bpf_text.replace('SOCK_STORE_DEF',
        'BPF_HASH(sock_store, process_key_t, struct sock *);')
    bpf_text = bpf_text.replace('SOCK_STORE_ADD',
        'sock_store.update(&key, &sk);')
    bpf_text = bpf_text.replace('SOCK_STORE_DEL',
        'sock_store.delete(&key);')

# Optional port filters: the generated C returns early from the probe when the
# port lies outside the inclusive [start, end] range.
if args.localport:
    bpf_text = bpf_text.replace('FILTER_LPORT',
        'if (lport < %d || lport > %d) { return 0; }'
        % (start_lport, end_lport))
else:
    bpf_text = bpf_text.replace('FILTER_LPORT', '')

if args.remoteport:
    bpf_text = bpf_text.replace('FILTER_DPORT',
        'if (dport < %d || dport > %d) { return 0; }'
        % (start_rport, end_rport))
else:
    bpf_text = bpf_text.replace('FILTER_DPORT', '')
382
# C snippets substituted into bpf_text depending on the output mode.

# Per-flow accumulators added to data_val_t in table (non-histogram) mode.
table_def_text = """
    u64 open_dura;
    u64 loss_dura;
    u64 disorder_dura;
    u64 recover_dura;
    u64 cwr_dura;
    u64 total_changes;
"""

# Table mode: add the elapsed time `ts` to the accumulator of the state that
# was recorded at probe entry (states are stored as kernel value + 1).
store_text = """
    datap->total_changes += 1;
    if (last_cong_state == (TCP_CA_Open + 1)) {
        datap->open_dura += ts;
    } else if (last_cong_state == (TCP_CA_Disorder + 1)) {
        datap->disorder_dura += ts;
    } else if (last_cong_state == (TCP_CA_CWR + 1)) {
        datap->cwr_dura += ts;
    } else if (last_cong_state == (TCP_CA_Recovery + 1)) {
        datap->recover_dura += ts;
    } else if (last_cong_state == (TCP_CA_Loss + 1)) {
        datap->loss_dura += ts;
    }
"""

# Histogram mode: bucket `ts` by log2 under the state recorded at probe entry.
# TIME_UNIT is a further placeholder, replaced after this snippet has been
# inserted into bpf_text.
store_dist_text = """
    if (last_cong_state == (TCP_CA_Open + 1)) {
        key_s.state = TCP_CA_Open;
    } else if (last_cong_state == (TCP_CA_Disorder + 1)) {
        key_s.state = TCP_CA_Disorder;
    } else if (last_cong_state == (TCP_CA_CWR + 1)) {
        key_s.state = TCP_CA_CWR;
    } else if (last_cong_state == (TCP_CA_Recovery + 1)) {
        key_s.state = TCP_CA_Recovery;
    } else if (last_cong_state == (TCP_CA_Loss + 1)) {
        key_s.state = TCP_CA_Loss;
    }
    TIME_UNIT
    key_s.slot = bpf_log2l(ts);
    dist.atomic_increment(key_s);
"""

# Histogram mode: key type and histogram map definition.
hist_table_text = """
typedef struct congest_state_key {
    u32 state;
    u64 slot;
}congest_state_key_t;

BPF_HISTOGRAM(dist, congest_state_key_t);
"""
432
# Fill in the remaining placeholders for the selected output mode.
if args.dist:
    # Histogram mode: no per-state accumulators in data_val_t; each state
    # change is recorded in the `dist` histogram instead.
    bpf_text = bpf_text.replace('DEF_TEXT', '')
    bpf_text = bpf_text.replace('STORE', store_dist_text)
    bpf_text = bpf_text.replace('STATE_KEY',
        'congest_state_key_t key_s = {0};')
    bpf_text = bpf_text.replace('HIST_TABLE', hist_table_text)
    # The probe has already divided the nanosecond delta by 1000 before this
    # snippet runs; divide once more unless microseconds were requested.
    if args.microseconds:
        bpf_text = bpf_text.replace('TIME_UNIT', '')
    else:
        bpf_text = bpf_text.replace('TIME_UNIT', 'ts /= 1000;')
else:
    # Table mode: accumulate per-state durations inside data_val_t.
    bpf_text = bpf_text.replace('DEF_TEXT', table_def_text)
    bpf_text = bpf_text.replace('STORE', store_text)
    bpf_text = bpf_text.replace('STATE_KEY', '')
    bpf_text = bpf_text.replace('HIST_TABLE', '')


# Dump the generated program when debugging; --ebpf prints it and exits
# without loading anything.
if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()
454
# load BPF program
b = BPF(text=bpf_text)

if not is_support_kfunc:
    # all the tcp congestion control status update functions
    # are called by below 5 functions.
    for traced_fn in ("tcp_fastretrans_alert",
                      "tcp_enter_cwr",
                      "tcp_process_tlp_ack",
                      "tcp_enter_loss",
                      "tcp_enter_recovery"):
        b.attach_kprobe(event=traced_fn, fn_name="entry_func")
        b.attach_kretprobe(event=traced_fn, fn_name="ret_func")

print("Tracing tcp congestion control status duration... Hit Ctrl-C to end.")
473
474
def cong_state_to_name(state):
    """Return the display name for a TCP congestion-control state number.

    state: integer state value (0 = open ... 4 = loss).

    Returns the lower-case state name, or "unknown(<n>)" for a value outside
    the table, so an unexpected value cannot abort the report with an
    IndexError or be mislabelled through negative indexing.
    """
    # The order needs to match the kernel's TCP_CA_* state numbering.
    state_name = ("open", "disorder", "cwr", "recovery", "loss")
    if 0 <= state < len(state_name):
        return state_name[state]
    return "unknown(%d)" % state
479
# output
exiting = 0 if args.interval else 1
ipv6_stat = b.get_table("ipv6_stat")
ipv4_stat = b.get_table("ipv4_stat")
if args.dist:
    dist = b.get_table("dist")
label = "ms"
if args.microseconds:
    label = "us"


def _ipv4_text(addr):
    """Format the u32 address field of an IPv4 flow key as dotted-quad text."""
    return inet_ntop(AF_INET, pack("I", addr))


def _ipv6_text(addr):
    """Format the 128-bit address field of an IPv6 flow key as text."""
    return inet_ntop(AF_INET6, bytes(addr))


def _print_flow_stats(table, addr_to_text, addr_width, name_suffix):
    """Print the per-flow duration table for one address family.

    table: BPF hash of flow key -> per-state durations.
    addr_to_text: converts a key's saddr/daddr field to printable text.
    addr_width: width of the two address/port columns.
    name_suffix: appended to the "LAddrPort"/"RAddrPort" column titles.

    Prints nothing when the table is empty.  Rows are ordered by local port,
    and flows without any recorded state change are skipped.
    """
    if not table:
        return
    addr_cols = "%%-%ds %%-%ds " % (addr_width, addr_width)
    # Header and rows share the same column widths so they line up.
    print((addr_cols + "%-7s %-6s %-7s %-7s %-6s %-5s") % (
        "LAddrPort" + name_suffix, "RAddrPort" + name_suffix,
        "Open_" + label, "Dod_" + label, "Rcov_" + label,
        "Cwr_" + label, "Los_" + label, "Chgs"))
    # The accumulators are read as microseconds; scale to milliseconds unless
    # -u was given.  Integer division keeps the values as ints on Python 3.
    scale = 1 if args.microseconds else 1000
    for k, v in sorted(table.items(), key=lambda item: item[0].lport):
        if v.total_changes == 0:
            continue
        print((addr_cols + "%-7d %-6d %-7d %-7d %-6d %-5d") % (
            addr_to_text(k.saddr) + "/" + str(k.lport),
            addr_to_text(k.daddr) + "/" + str(k.dport),
            v.open_dura // scale, v.disorder_dura // scale,
            v.recover_dura // scale, v.cwr_dura // scale,
            v.loss_dura // scale, v.total_changes))


while True:
    try:
        sleep(int(args.interval))
    except KeyboardInterrupt:
        # Print one last report, then leave the loop below.
        exiting = 1

    print()
    if args.timestamp:
        print("%-8s\n" % strftime("%H:%M:%S"), end="")
    if args.dist:
        dist.print_log2_hist("usecs" if args.microseconds else "msecs",
            "tcp_congest_state", section_print_fn=cong_state_to_name)
        dist.clear()
    else:
        _print_flow_stats(ipv4_stat, _ipv4_text, 21, "")
        _print_flow_stats(ipv6_stat, _ipv6_text, 32, "6")
    # NOTE(review): the per-flow maps are cleared after every report in both
    # output modes -- confirm this matches the intended nesting.
    ipv4_stat.clear()
    ipv6_stat.clear()
    countdown -= 1
    if exiting or countdown == 0:
        exit()
559 exit()