blob: ac02a340e778f475eb8e7ca9c0d393de72681183 [file] [log] [blame]
Brendan Gregg797c3ec2016-10-19 18:55:10 -07001#!/usr/bin/python
2# @lint-avoid-python-3-compatibility-imports
3#
4# tcplife Trace the lifespan of TCP sessions and summarize.
5# For Linux, uses BCC, BPF. Embedded C.
6#
7# USAGE: tcplife [-h] [-C] [-S] [-p PID] [interval [count]]
8#
Brendan Gregge023bc82017-12-29 22:46:27 -08009# This uses the tcp:tcp_set_state tracepoint if it exists (added to
10# Linux 4.15), else it uses kernel dynamic tracing of tcp_set_state().
Brendan Gregg797c3ec2016-10-19 18:55:10 -070011#
12# While throughput counters are emitted, they are fetched in a low-overhead
13# manner: reading members of the tcp_info struct on TCP close. ie, we do not
14# trace send/receive.
15#
16# Copyright 2016 Netflix, Inc.
17# Licensed under the Apache License, Version 2.0 (the "License")
18#
19# IDEA: Julia Evans
20#
21# 18-Oct-2016 Brendan Gregg Created this.
Brendan Gregge023bc82017-12-29 22:46:27 -080022# 29-Dec-2017 " " Added tracepoint support.
Brendan Gregg797c3ec2016-10-19 18:55:10 -070023
24from __future__ import print_function
25from bcc import BPF
26import argparse
27from socket import inet_ntop, ntohs, AF_INET, AF_INET6
28from struct import pack
29import ctypes as ct
30from time import strftime
31
32# arguments
33examples = """examples:
34 ./tcplife # trace all TCP connect()s
35 ./tcplife -t # include time column (HH:MM:SS)
36 ./tcplife -w # wider colums (fit IPv6)
37 ./tcplife -stT # csv output, with times & timestamps
38 ./tcplife -p 181 # only trace PID 181
39 ./tcplife -L 80 # only trace local port 80
40 ./tcplife -L 80,81 # only trace local ports 80 and 81
41 ./tcplife -D 80 # only trace remote port 80
42"""
43parser = argparse.ArgumentParser(
44 description="Trace the lifespan of TCP sessions and summarize",
45 formatter_class=argparse.RawDescriptionHelpFormatter,
Nathan Scott1a197db2018-01-21 09:14:27 +110046 epilog=examples)
Brendan Gregg797c3ec2016-10-19 18:55:10 -070047parser.add_argument("-T", "--time", action="store_true",
48 help="include time column on output (HH:MM:SS)")
49parser.add_argument("-t", "--timestamp", action="store_true",
50 help="include timestamp on output (seconds)")
51parser.add_argument("-w", "--wide", action="store_true",
52 help="wide column output (fits IPv6 addresses)")
53parser.add_argument("-s", "--csv", action="store_true",
Edward Bettsfdf9b082017-10-10 21:13:28 +010054 help="comma separated values output")
Brendan Gregg797c3ec2016-10-19 18:55:10 -070055parser.add_argument("-p", "--pid",
56 help="trace this PID only")
57parser.add_argument("-L", "--localport",
58 help="comma-separated list of local ports to trace.")
59parser.add_argument("-D", "--remoteport",
60 help="comma-separated list of remote ports to trace.")
Nathan Scottf5fb9af2018-01-17 09:39:59 +110061parser.add_argument("--ebpf", action="store_true",
62 help=argparse.SUPPRESS)
Brendan Gregg797c3ec2016-10-19 18:55:10 -070063args = parser.parse_args()
64debug = 0
65
66# define BPF program
67bpf_text = """
68#include <uapi/linux/ptrace.h>
69#define KBUILD_MODNAME "foo"
70#include <linux/tcp.h>
71#include <net/sock.h>
72#include <bcc/proto.h>
73
74BPF_HASH(birth, struct sock *, u64);
75
76// separate data structs for ipv4 and ipv6
77struct ipv4_data_t {
78 // XXX: switch some to u32's when supported
79 u64 ts_us;
80 u64 pid;
81 u64 saddr;
82 u64 daddr;
83 u64 ports;
84 u64 rx_b;
85 u64 tx_b;
86 u64 span_us;
87 char task[TASK_COMM_LEN];
88};
89BPF_PERF_OUTPUT(ipv4_events);
90
91struct ipv6_data_t {
92 u64 ts_us;
93 u64 pid;
94 unsigned __int128 saddr;
95 unsigned __int128 daddr;
96 u64 ports;
97 u64 rx_b;
98 u64 tx_b;
99 u64 span_us;
100 char task[TASK_COMM_LEN];
101};
102BPF_PERF_OUTPUT(ipv6_events);
103
104struct id_t {
105 u32 pid;
106 char task[TASK_COMM_LEN];
107};
108BPF_HASH(whoami, struct sock *, struct id_t);
Brendan Gregge023bc82017-12-29 22:46:27 -0800109"""
Brendan Gregg797c3ec2016-10-19 18:55:10 -0700110
Brendan Gregge023bc82017-12-29 22:46:27 -0800111#
112# XXX: The following is temporary code for older kernels, Linux 4.14 and
113# older. It uses kprobes to instrument tcp_set_state(). On Linux 4.15 and
114# later, the tcp:tcp_set_state tracepoint should be used instead, as is
115# done by the code that follows this. In the distant future (2021?), this
116# kprobe code can be removed. This is why there is so much code
117# duplication: to make removal easier.
118#
119bpf_text_kprobe = """
Brendan Gregg797c3ec2016-10-19 18:55:10 -0700120int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
121{
122 u32 pid = bpf_get_current_pid_tgid() >> 32;
123
124 // lport is either used in a filter here, or later
125 u16 lport = sk->__sk_common.skc_num;
126 FILTER_LPORT
127
128 // dport is either used in a filter here, or later
129 u16 dport = sk->__sk_common.skc_dport;
130 FILTER_DPORT
131
132 /*
133 * This tool includes PID and comm context. It's best effort, and may
134 * be wrong in some situations. It currently works like this:
Brendan Gregg4fd7d322016-11-28 17:57:20 -0800135 * - record timestamp on any state < TCP_FIN_WAIT1
136 * - cache task context on:
137 * TCP_SYN_SENT: tracing from client
138 * TCP_LAST_ACK: client-closed from server
139 * - do output on TCP_CLOSE:
140 * fetch task context if cached, or use current task
141 */
Brendan Gregg797c3ec2016-10-19 18:55:10 -0700142
Brendan Gregg42d00a42016-11-30 16:55:45 -0800143 // capture birth time
144 if (state < TCP_FIN_WAIT1) {
145 /*
146 * Matching just ESTABLISHED may be sufficient, provided no code-path
147 * sets ESTABLISHED without a tcp_set_state() call. Until we know
148 * that for sure, match all early states to increase chances a
149 * timestamp is set.
150 * Note that this needs to be set before the PID filter later on,
151 * since the PID isn't reliable for these early stages, so we must
152 * save all timestamps and do the PID filter later when we can.
153 */
154 u64 ts = bpf_ktime_get_ns();
155 birth.update(&sk, &ts);
156 }
157
Brendan Gregg797c3ec2016-10-19 18:55:10 -0700158 // record PID & comm on SYN_SENT
Brendan Gregg4fd7d322016-11-28 17:57:20 -0800159 if (state == TCP_SYN_SENT || state == TCP_LAST_ACK) {
Brendan Gregg42d00a42016-11-30 16:55:45 -0800160 // now we can PID filter, both here and a little later on for CLOSE
Brendan Gregg797c3ec2016-10-19 18:55:10 -0700161 FILTER_PID
162 struct id_t me = {.pid = pid};
163 bpf_get_current_comm(&me.task, sizeof(me.task));
164 whoami.update(&sk, &me);
165 }
166
Brendan Gregg4fd7d322016-11-28 17:57:20 -0800167 if (state != TCP_CLOSE)
168 return 0;
169
Brendan Gregg797c3ec2016-10-19 18:55:10 -0700170 // calculate lifespan
171 u64 *tsp, delta_us;
172 tsp = birth.lookup(&sk);
173 if (tsp == 0) {
Brendan Gregg42d00a42016-11-30 16:55:45 -0800174 whoami.delete(&sk); // may not exist
175 return 0; // missed create
Brendan Gregg797c3ec2016-10-19 18:55:10 -0700176 }
177 delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
Brendan Gregg42d00a42016-11-30 16:55:45 -0800178 birth.delete(&sk);
179
180 // fetch possible cached data, and filter
181 struct id_t *mep;
182 mep = whoami.lookup(&sk);
183 if (mep != 0)
184 pid = mep->pid;
185 FILTER_PID
Brendan Gregg797c3ec2016-10-19 18:55:10 -0700186
187 // get throughput stats. see tcp_get_info().
188 u64 rx_b = 0, tx_b = 0, sport = 0;
189 struct tcp_sock *tp = (struct tcp_sock *)sk;
190 rx_b = tp->bytes_received;
191 tx_b = tp->bytes_acked;
192
193 u16 family = sk->__sk_common.skc_family;
194
195 if (family == AF_INET) {
196 struct ipv4_data_t data4 = {.span_us = delta_us,
197 .rx_b = rx_b, .tx_b = tx_b};
198 data4.ts_us = bpf_ktime_get_ns() / 1000;
199 data4.saddr = sk->__sk_common.skc_rcv_saddr;
200 data4.daddr = sk->__sk_common.skc_daddr;
201 // a workaround until data4 compiles with separate lport/dport
Brendan Gregg4fd7d322016-11-28 17:57:20 -0800202 data4.pid = pid;
Brendan Gregg797c3ec2016-10-19 18:55:10 -0700203 data4.ports = ntohs(dport) + ((0ULL + lport) << 32);
204 if (mep == 0) {
Brendan Gregg797c3ec2016-10-19 18:55:10 -0700205 bpf_get_current_comm(&data4.task, sizeof(data4.task));
Brendan Gregg797c3ec2016-10-19 18:55:10 -0700206 } else {
Brendan Gregg797c3ec2016-10-19 18:55:10 -0700207 bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task);
208 }
209 ipv4_events.perf_submit(ctx, &data4, sizeof(data4));
210
211 } else /* 6 */ {
212 struct ipv6_data_t data6 = {.span_us = delta_us,
213 .rx_b = rx_b, .tx_b = tx_b};
214 data6.ts_us = bpf_ktime_get_ns() / 1000;
215 bpf_probe_read(&data6.saddr, sizeof(data6.saddr),
216 sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
217 bpf_probe_read(&data6.daddr, sizeof(data6.daddr),
218 sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
219 // a workaround until data6 compiles with separate lport/dport
220 data6.ports = ntohs(dport) + ((0ULL + lport) << 32);
Brendan Gregg4fd7d322016-11-28 17:57:20 -0800221 data6.pid = pid;
Brendan Gregg797c3ec2016-10-19 18:55:10 -0700222 if (mep == 0) {
Brendan Gregg797c3ec2016-10-19 18:55:10 -0700223 bpf_get_current_comm(&data6.task, sizeof(data6.task));
224 } else {
Brendan Gregg797c3ec2016-10-19 18:55:10 -0700225 bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task);
226 }
227 ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
228 }
229
Brendan Gregg797c3ec2016-10-19 18:55:10 -0700230 if (mep != 0)
231 whoami.delete(&sk);
232
233 return 0;
234}
235"""
236
Brendan Gregge023bc82017-12-29 22:46:27 -0800237bpf_text_tracepoint = """
238TRACEPOINT_PROBE(tcp, tcp_set_state)
239{
240 u32 pid = bpf_get_current_pid_tgid() >> 32;
241 // sk is mostly used as a UUID, once for skc_family, and two tcp stats:
242 struct sock *sk = (struct sock *)args->skaddr;
243
244 // lport is either used in a filter here, or later
245 u16 lport = args->sport;
246 FILTER_LPORT
247
248 // dport is either used in a filter here, or later
249 u16 dport = args->dport;
250 FILTER_DPORT
251
252 /*
253 * This tool includes PID and comm context. It's best effort, and may
254 * be wrong in some situations. It currently works like this:
255 * - record timestamp on any state < TCP_FIN_WAIT1
256 * - cache task context on:
257 * TCP_SYN_SENT: tracing from client
258 * TCP_LAST_ACK: client-closed from server
259 * - do output on TCP_CLOSE:
260 * fetch task context if cached, or use current task
261 */
262
263 // capture birth time
264 if (args->newstate < TCP_FIN_WAIT1) {
265 /*
266 * Matching just ESTABLISHED may be sufficient, provided no code-path
267 * sets ESTABLISHED without a tcp_set_state() call. Until we know
268 * that for sure, match all early states to increase chances a
269 * timestamp is set.
270 * Note that this needs to be set before the PID filter later on,
271 * since the PID isn't reliable for these early stages, so we must
272 * save all timestamps and do the PID filter later when we can.
273 */
274 u64 ts = bpf_ktime_get_ns();
275 birth.update(&sk, &ts);
276 }
277
278 // record PID & comm on SYN_SENT
279 if (args->newstate == TCP_SYN_SENT || args->newstate == TCP_LAST_ACK) {
280 // now we can PID filter, both here and a little later on for CLOSE
281 FILTER_PID
282 struct id_t me = {.pid = pid};
283 bpf_get_current_comm(&me.task, sizeof(me.task));
284 whoami.update(&sk, &me);
285 }
286
287 if (args->newstate != TCP_CLOSE)
288 return 0;
289
290 // calculate lifespan
291 u64 *tsp, delta_us;
292 tsp = birth.lookup(&sk);
293 if (tsp == 0) {
294 whoami.delete(&sk); // may not exist
295 return 0; // missed create
296 }
297 delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
298 birth.delete(&sk);
299
300 // fetch possible cached data, and filter
301 struct id_t *mep;
302 mep = whoami.lookup(&sk);
303 if (mep != 0)
304 pid = mep->pid;
305 FILTER_PID
306
307 // get throughput stats. see tcp_get_info().
308 u64 rx_b = 0, tx_b = 0, sport = 0;
309 struct tcp_sock *tp = (struct tcp_sock *)sk;
310 bpf_probe_read(&rx_b, sizeof(rx_b), &tp->bytes_received);
311 bpf_probe_read(&tx_b, sizeof(tx_b), &tp->bytes_acked);
312
313 u16 family = 0;
314 bpf_probe_read(&family, sizeof(family), &sk->__sk_common.skc_family);
315
316 if (family == AF_INET) {
317 struct ipv4_data_t data4 = {.span_us = delta_us,
318 .rx_b = rx_b, .tx_b = tx_b};
319 data4.ts_us = bpf_ktime_get_ns() / 1000;
320 bpf_probe_read(&data4.saddr, sizeof(u32), args->saddr);
321 bpf_probe_read(&data4.daddr, sizeof(u32), args->daddr);
322 // a workaround until data4 compiles with separate lport/dport
323 data4.ports = dport + ((0ULL + lport) << 32);
324 data4.pid = pid;
325
326 if (mep == 0) {
327 bpf_get_current_comm(&data4.task, sizeof(data4.task));
328 } else {
329 bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task);
330 }
331 ipv4_events.perf_submit(args, &data4, sizeof(data4));
332
333 } else /* 6 */ {
334 struct ipv6_data_t data6 = {.span_us = delta_us,
335 .rx_b = rx_b, .tx_b = tx_b};
336 data6.ts_us = bpf_ktime_get_ns() / 1000;
337 bpf_probe_read(&data6.saddr, sizeof(data6.saddr), args->saddr_v6);
338 bpf_probe_read(&data6.daddr, sizeof(data6.daddr), args->saddr_v6);
339 // a workaround until data6 compiles with separate lport/dport
340 data6.ports = dport + ((0ULL + lport) << 32);
341 data6.pid = pid;
342 if (mep == 0) {
343 bpf_get_current_comm(&data6.task, sizeof(data6.task));
344 } else {
345 bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task);
346 }
347 ipv6_events.perf_submit(args, &data6, sizeof(data6));
348 }
349
350 if (mep != 0)
351 whoami.delete(&sk);
352
353 return 0;
354}
355"""
356
Brendan Greggefa6ee92017-12-30 12:48:48 -0800357if (BPF.tracepoint_exists("tcp", "tcp_set_state")):
Brendan Gregge023bc82017-12-29 22:46:27 -0800358 bpf_text += bpf_text_tracepoint
359else:
360 bpf_text += bpf_text_kprobe
361
Brendan Gregg797c3ec2016-10-19 18:55:10 -0700362# code substitutions
363if args.pid:
364 bpf_text = bpf_text.replace('FILTER_PID',
365 'if (pid != %s) { return 0; }' % args.pid)
366if args.remoteport:
367 dports = [int(dport) for dport in args.remoteport.split(',')]
368 dports_if = ' && '.join(['dport != %d' % ntohs(dport) for dport in dports])
369 bpf_text = bpf_text.replace('FILTER_DPORT',
370 'if (%s) { birth.delete(&sk); return 0; }' % dports_if)
371if args.localport:
372 lports = [int(lport) for lport in args.localport.split(',')]
373 lports_if = ' && '.join(['lport != %d' % lport for lport in lports])
374 bpf_text = bpf_text.replace('FILTER_LPORT',
375 'if (%s) { birth.delete(&sk); return 0; }' % lports_if)
376bpf_text = bpf_text.replace('FILTER_PID', '')
377bpf_text = bpf_text.replace('FILTER_DPORT', '')
378bpf_text = bpf_text.replace('FILTER_LPORT', '')
379
Nathan Scottca4ba552018-01-16 11:02:58 +1100380if debug or args.ebpf:
Brendan Gregg797c3ec2016-10-19 18:55:10 -0700381 print(bpf_text)
Nathan Scottca4ba552018-01-16 11:02:58 +1100382 if args.ebpf:
383 exit()
Brendan Gregg797c3ec2016-10-19 18:55:10 -0700384
385# event data
386TASK_COMM_LEN = 16 # linux/sched.h
387
388class Data_ipv4(ct.Structure):
389 _fields_ = [
390 ("ts_us", ct.c_ulonglong),
391 ("pid", ct.c_ulonglong),
392 ("saddr", ct.c_ulonglong),
393 ("daddr", ct.c_ulonglong),
394 ("ports", ct.c_ulonglong),
395 ("rx_b", ct.c_ulonglong),
396 ("tx_b", ct.c_ulonglong),
397 ("span_us", ct.c_ulonglong),
398 ("task", ct.c_char * TASK_COMM_LEN)
399 ]
400
401class Data_ipv6(ct.Structure):
402 _fields_ = [
403 ("ts_us", ct.c_ulonglong),
404 ("pid", ct.c_ulonglong),
405 ("saddr", (ct.c_ulonglong * 2)),
406 ("daddr", (ct.c_ulonglong * 2)),
407 ("ports", ct.c_ulonglong),
408 ("rx_b", ct.c_ulonglong),
409 ("tx_b", ct.c_ulonglong),
410 ("span_us", ct.c_ulonglong),
411 ("task", ct.c_char * TASK_COMM_LEN)
412 ]
413
414#
415# Setup output formats
416#
417# Don't change the default output (next 2 lines): this fits in 80 chars. I
418# know it doesn't have NS or UIDs etc. I know. If you really, really, really
419# need to add columns, columns that solve real actual problems, I'd start by
420# adding an extended mode (-x) to included those columns.
421#
422header_string = "%-5s %-10.10s %s%-15s %-5s %-15s %-5s %5s %5s %s"
423format_string = "%-5d %-10.10s %s%-15s %-5d %-15s %-5d %5d %5d %.2f"
424if args.wide:
425 header_string = "%-5s %-16.16s %-2s %-26s %-5s %-26s %-5s %6s %6s %s"
426 format_string = "%-5d %-16.16s %-2s %-26s %-5s %-26s %-5d %6d %6d %.2f"
427if args.csv:
428 header_string = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"
429 format_string = "%d,%s,%s,%s,%s,%s,%d,%d,%d,%.2f"
430
431# process event
432def print_ipv4_event(cpu, data, size):
433 event = ct.cast(data, ct.POINTER(Data_ipv4)).contents
434 global start_ts
435 if args.time:
436 if args.csv:
437 print("%s," % strftime("%H:%M:%S"), end="")
438 else:
439 print("%-8s " % strftime("%H:%M:%S"), end="")
440 if args.timestamp:
441 if start_ts == 0:
442 start_ts = event.ts_us
443 delta_s = (float(event.ts_us) - start_ts) / 1000000
444 if args.csv:
445 print("%.6f," % delta_s, end="")
446 else:
447 print("%-9.6f " % delta_s, end="")
Rafael F78948e42017-03-26 14:54:25 +0200448 print(format_string % (event.pid, event.task.decode(),
Brendan Gregg797c3ec2016-10-19 18:55:10 -0700449 "4" if args.wide or args.csv else "",
450 inet_ntop(AF_INET, pack("I", event.saddr)), event.ports >> 32,
451 inet_ntop(AF_INET, pack("I", event.daddr)), event.ports & 0xffffffff,
452 event.tx_b / 1024, event.rx_b / 1024, float(event.span_us) / 1000))
453
454def print_ipv6_event(cpu, data, size):
455 event = ct.cast(data, ct.POINTER(Data_ipv6)).contents
456 global start_ts
457 if args.time:
458 if args.csv:
459 print("%s," % strftime("%H:%M:%S"), end="")
460 else:
461 print("%-8s " % strftime("%H:%M:%S"), end="")
462 if args.timestamp:
463 if start_ts == 0:
464 start_ts = event.ts_us
465 delta_s = (float(event.ts_us) - start_ts) / 1000000
466 if args.csv:
467 print("%.6f," % delta_s, end="")
468 else:
469 print("%-9.6f " % delta_s, end="")
Rafael F78948e42017-03-26 14:54:25 +0200470 print(format_string % (event.pid, event.task.decode(),
Brendan Gregg797c3ec2016-10-19 18:55:10 -0700471 "6" if args.wide or args.csv else "",
472 inet_ntop(AF_INET6, event.saddr), event.ports >> 32,
473 inet_ntop(AF_INET6, event.daddr), event.ports & 0xffffffff,
474 event.tx_b / 1024, event.rx_b / 1024, float(event.span_us) / 1000))
475
476# initialize BPF
477b = BPF(text=bpf_text)
478
479# header
480if args.time:
481 if args.csv:
482 print("%s," % ("TIME"), end="")
483 else:
484 print("%-8s " % ("TIME"), end="")
485if args.timestamp:
486 if args.csv:
487 print("%s," % ("TIME(s)"), end="")
488 else:
489 print("%-9s " % ("TIME(s)"), end="")
490print(header_string % ("PID", "COMM",
491 "IP" if args.wide or args.csv else "", "LADDR",
492 "LPORT", "RADDR", "RPORT", "TX_KB", "RX_KB", "MS"))
493
494start_ts = 0
495
496# read events
Mark Drayton5f5687e2017-02-20 18:13:03 +0000497b["ipv4_events"].open_perf_buffer(print_ipv4_event, page_cnt=64)
498b["ipv6_events"].open_perf_buffer(print_ipv6_event, page_cnt=64)
Brendan Gregg797c3ec2016-10-19 18:55:10 -0700499while 1:
500 b.kprobe_poll()