#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# tcplife   Trace the lifespan of TCP sessions and summarize.
#           For Linux, uses BCC, BPF. Embedded C.
#
# USAGE: tcplife [-h] [-T] [-t] [-w] [-s] [-p PID] [-L LOCALPORT]
#                [-D REMOTEPORT]
#
# This uses the sock:inet_sock_set_state tracepoint if it exists (added to
# Linux 4.16, and replacing the earlier tcp:tcp_set_state), else it uses
# kernel dynamic tracing of tcp_set_state().
#
# While throughput counters are emitted, they are fetched in a low-overhead
# manner: reading members of the tcp_sock struct on TCP close. ie, we do not
# trace send/receive.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# IDEA: Julia Evans
#
# 18-Oct-2016   Brendan Gregg   Created this.
# 29-Dec-2017      "      "     Added tracepoint support.

25from __future__ import print_function
26from bcc import BPF
27import argparse
28from socket import inet_ntop, ntohs, AF_INET, AF_INET6
29from struct import pack
30import ctypes as ct
31from time import strftime
32
# arguments
#
# The examples are shown verbatim as the --help epilog, so the flags used
# here must match the options registered below (-T is the HH:MM:SS column,
# -t is the seconds-since-first-event column).
examples = """examples:
    ./tcplife           # trace all TCP connect()s
    ./tcplife -T        # include time column (HH:MM:SS)
    ./tcplife -w        # wider columns (fit IPv6)
    ./tcplife -stT      # csv output, with times & timestamps
    ./tcplife -p 181    # only trace PID 181
    ./tcplife -L 80     # only trace local port 80
    ./tcplife -L 80,81  # only trace local ports 80 and 81
    ./tcplife -D 80     # only trace remote port 80
"""
parser = argparse.ArgumentParser(
    description="Trace the lifespan of TCP sessions and summarize",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-T", "--time", action="store_true",
    help="include time column on output (HH:MM:SS)")
parser.add_argument("-t", "--timestamp", action="store_true",
    help="include timestamp on output (seconds)")
parser.add_argument("-w", "--wide", action="store_true",
    help="wide column output (fits IPv6 addresses)")
parser.add_argument("-s", "--csv", action="store_true",
    help="comma separated values output")
parser.add_argument("-p", "--pid",
    help="trace this PID only")
parser.add_argument("-L", "--localport",
    help="comma-separated list of local ports to trace.")
parser.add_argument("-D", "--remoteport",
    help="comma-separated list of remote ports to trace.")
# hidden option: print the generated BPF program and exit
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
# set to a true value to print the generated BPF program before running
debug = 0

# define BPF program
#
# Common prologue shared by both probe implementations: headers, the two
# per-socket hash maps (birth timestamp and cached task identity) and the
# event records pushed to user space. The field order and widths of
# ipv4_data_t / ipv6_data_t must stay in step with the ctypes structures
# this script uses to decode the events.
bpf_text = """
#include <uapi/linux/ptrace.h>
#define KBUILD_MODNAME "foo"
#include <linux/tcp.h>
#include <net/sock.h>
#include <bcc/proto.h>

BPF_HASH(birth, struct sock *, u64);

// separate data structs for ipv4 and ipv6
struct ipv4_data_t {
    // XXX: switch some to u32's when supported
    u64 ts_us;
    u64 pid;
    u64 saddr;
    u64 daddr;
    u64 ports;
    u64 rx_b;
    u64 tx_b;
    u64 span_us;
    char task[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(ipv4_events);

struct ipv6_data_t {
    u64 ts_us;
    u64 pid;
    unsigned __int128 saddr;
    unsigned __int128 daddr;
    u64 ports;
    u64 rx_b;
    u64 tx_b;
    u64 span_us;
    char task[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(ipv6_events);

struct id_t {
    u32 pid;
    char task[TASK_COMM_LEN];
};
BPF_HASH(whoami, struct sock *, struct id_t);
"""

#
# XXX: The following is temporary code for older kernels, Linux 4.14 and
# older. It uses kprobes to instrument tcp_set_state(). On Linux 4.16 and
# later, the sock:inet_sock_set_state tracepoint should be used instead, as
# is done by the code that follows this. In the distant future (2021?), this
# kprobe code can be removed. This is why there is so much code
# duplication: to make removal easier.
#
# The FILTER_* tokens are placeholders: the script textually replaces them
# with C "if" statements (or with nothing) before compiling. In this
# variant dport comes straight from skc_dport and is only converted with
# ntohs() when the event is emitted.
#
bpf_text_kprobe = """
int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
{
    u32 pid = bpf_get_current_pid_tgid() >> 32;

    // lport is either used in a filter here, or later
    u16 lport = sk->__sk_common.skc_num;
    FILTER_LPORT

    // dport is either used in a filter here, or later
    u16 dport = sk->__sk_common.skc_dport;
    FILTER_DPORT

    /*
     * This tool includes PID and comm context. It's best effort, and may
     * be wrong in some situations. It currently works like this:
     * - record timestamp on any state < TCP_FIN_WAIT1
     * - cache task context on:
     *       TCP_SYN_SENT: tracing from client
     *       TCP_LAST_ACK: client-closed from server
     * - do output on TCP_CLOSE:
     *       fetch task context if cached, or use current task
     */

    // capture birth time
    if (state < TCP_FIN_WAIT1) {
        /*
         * Matching just ESTABLISHED may be sufficient, provided no code-path
         * sets ESTABLISHED without a tcp_set_state() call. Until we know
         * that for sure, match all early states to increase chances a
         * timestamp is set.
         * Note that this needs to be set before the PID filter later on,
         * since the PID isn't reliable for these early stages, so we must
         * save all timestamps and do the PID filter later when we can.
         */
        u64 ts = bpf_ktime_get_ns();
        birth.update(&sk, &ts);
    }

    // record PID & comm on SYN_SENT or LAST_ACK
    if (state == TCP_SYN_SENT || state == TCP_LAST_ACK) {
        // now we can PID filter, both here and a little later on for CLOSE
        FILTER_PID
        struct id_t me = {.pid = pid};
        bpf_get_current_comm(&me.task, sizeof(me.task));
        whoami.update(&sk, &me);
    }

    if (state != TCP_CLOSE)
        return 0;

    // calculate lifespan
    u64 *tsp, delta_us;
    tsp = birth.lookup(&sk);
    if (tsp == 0) {
        whoami.delete(&sk);     // may not exist
        return 0;               // missed create
    }
    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
    birth.delete(&sk);

    // fetch possible cached data, and filter
    struct id_t *mep;
    mep = whoami.lookup(&sk);
    if (mep != 0)
        pid = mep->pid;
    FILTER_PID

    // get throughput stats. see tcp_get_info().
    u64 rx_b = 0, tx_b = 0;
    struct tcp_sock *tp = (struct tcp_sock *)sk;
    rx_b = tp->bytes_received;
    tx_b = tp->bytes_acked;

    u16 family = sk->__sk_common.skc_family;

    if (family == AF_INET) {
        struct ipv4_data_t data4 = {.span_us = delta_us,
            .rx_b = rx_b, .tx_b = tx_b};
        data4.ts_us = bpf_ktime_get_ns() / 1000;
        data4.saddr = sk->__sk_common.skc_rcv_saddr;
        data4.daddr = sk->__sk_common.skc_daddr;
        data4.pid = pid;
        // a workaround until data4 compiles with separate lport/dport
        data4.ports = ntohs(dport) + ((0ULL + lport) << 32);
        if (mep == 0) {
            bpf_get_current_comm(&data4.task, sizeof(data4.task));
        } else {
            bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task);
        }
        ipv4_events.perf_submit(ctx, &data4, sizeof(data4));

    } else /* 6 */ {
        struct ipv6_data_t data6 = {.span_us = delta_us,
            .rx_b = rx_b, .tx_b = tx_b};
        data6.ts_us = bpf_ktime_get_ns() / 1000;
        bpf_probe_read(&data6.saddr, sizeof(data6.saddr),
            sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
        bpf_probe_read(&data6.daddr, sizeof(data6.daddr),
            sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
        // a workaround until data6 compiles with separate lport/dport
        data6.ports = ntohs(dport) + ((0ULL + lport) << 32);
        data6.pid = pid;
        if (mep == 0) {
            bpf_get_current_comm(&data6.task, sizeof(data6.task));
        } else {
            bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task);
        }
        ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
    }

    if (mep != 0)
        whoami.delete(&sk);

    return 0;
}
"""

# Tracepoint implementation (sock:inet_sock_set_state). Mirrors the kprobe
# program but takes ports, addresses, family and state from the tracepoint
# arguments; the ports are used as-is (no ntohs) when the event is built.
# The FILTER_* tokens are placeholders that the script textually replaces
# with C "if" statements (or with nothing) before compiling.
bpf_text_tracepoint = """
TRACEPOINT_PROBE(sock, inet_sock_set_state)
{
    if (args->protocol != IPPROTO_TCP)
        return 0;

    u32 pid = bpf_get_current_pid_tgid() >> 32;
    // sk is mostly used as a UUID, and for two tcp stats:
    struct sock *sk = (struct sock *)args->skaddr;

    // lport is either used in a filter here, or later
    u16 lport = args->sport;
    FILTER_LPORT

    // dport is either used in a filter here, or later
    u16 dport = args->dport;
    FILTER_DPORT

    /*
     * This tool includes PID and comm context. It's best effort, and may
     * be wrong in some situations. It currently works like this:
     * - record timestamp on any state < TCP_FIN_WAIT1
     * - cache task context on:
     *       TCP_SYN_SENT: tracing from client
     *       TCP_LAST_ACK: client-closed from server
     * - do output on TCP_CLOSE:
     *       fetch task context if cached, or use current task
     */

    // capture birth time
    if (args->newstate < TCP_FIN_WAIT1) {
        /*
         * Matching just ESTABLISHED may be sufficient, provided no code-path
         * sets ESTABLISHED without a tcp_set_state() call. Until we know
         * that for sure, match all early states to increase chances a
         * timestamp is set.
         * Note that this needs to be set before the PID filter later on,
         * since the PID isn't reliable for these early stages, so we must
         * save all timestamps and do the PID filter later when we can.
         */
        u64 ts = bpf_ktime_get_ns();
        birth.update(&sk, &ts);
    }

    // record PID & comm on SYN_SENT or LAST_ACK
    if (args->newstate == TCP_SYN_SENT || args->newstate == TCP_LAST_ACK) {
        // now we can PID filter, both here and a little later on for CLOSE
        FILTER_PID
        struct id_t me = {.pid = pid};
        bpf_get_current_comm(&me.task, sizeof(me.task));
        whoami.update(&sk, &me);
    }

    if (args->newstate != TCP_CLOSE)
        return 0;

    // calculate lifespan
    u64 *tsp, delta_us;
    tsp = birth.lookup(&sk);
    if (tsp == 0) {
        whoami.delete(&sk);     // may not exist
        return 0;               // missed create
    }
    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
    birth.delete(&sk);

    // fetch possible cached data, and filter
    struct id_t *mep;
    mep = whoami.lookup(&sk);
    if (mep != 0)
        pid = mep->pid;
    FILTER_PID

    // get throughput stats. see tcp_get_info().
    u64 rx_b = 0, tx_b = 0;
    struct tcp_sock *tp = (struct tcp_sock *)sk;
    bpf_probe_read(&rx_b, sizeof(rx_b), &tp->bytes_received);
    bpf_probe_read(&tx_b, sizeof(tx_b), &tp->bytes_acked);

    if (args->family == AF_INET) {
        struct ipv4_data_t data4 = {.span_us = delta_us,
            .rx_b = rx_b, .tx_b = tx_b};
        data4.ts_us = bpf_ktime_get_ns() / 1000;
        bpf_probe_read(&data4.saddr, sizeof(u32), args->saddr);
        bpf_probe_read(&data4.daddr, sizeof(u32), args->daddr);
        // a workaround until data4 compiles with separate lport/dport
        data4.ports = dport + ((0ULL + lport) << 32);
        data4.pid = pid;

        if (mep == 0) {
            bpf_get_current_comm(&data4.task, sizeof(data4.task));
        } else {
            bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task);
        }
        ipv4_events.perf_submit(args, &data4, sizeof(data4));

    } else /* 6 */ {
        struct ipv6_data_t data6 = {.span_us = delta_us,
            .rx_b = rx_b, .tx_b = tx_b};
        data6.ts_us = bpf_ktime_get_ns() / 1000;
        // local and remote addresses come from distinct tracepoint fields
        bpf_probe_read(&data6.saddr, sizeof(data6.saddr), args->saddr_v6);
        bpf_probe_read(&data6.daddr, sizeof(data6.daddr), args->daddr_v6);
        // a workaround until data6 compiles with separate lport/dport
        data6.ports = dport + ((0ULL + lport) << 32);
        data6.pid = pid;
        if (mep == 0) {
            bpf_get_current_comm(&data6.task, sizeof(data6.task));
        } else {
            bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task);
        }
        ipv6_events.perf_submit(args, &data6, sizeof(data6));
    }

    if (mep != 0)
        whoami.delete(&sk);

    return 0;
}
"""

# Pick the probe implementation: prefer the tracepoint, fall back to the
# kprobe on kernels that do not have it.
use_tracepoint = BPF.tracepoint_exists("sock", "inet_sock_set_state")
if use_tracepoint:
    bpf_text += bpf_text_tracepoint
else:
    bpf_text += bpf_text_kprobe

# code substitutions
#
# Each FILTER_* placeholder in the C text becomes an early-return "if" when
# the matching option was given, and is removed otherwise.
if args.pid:
    # int() rejects anything that is not a plain number before it is
    # spliced into the C source (same treatment as the port lists below)
    bpf_text = bpf_text.replace('FILTER_PID',
        'if (pid != %d) { return 0; }' % int(args.pid))
if args.remoteport:
    dports = [int(dport) for dport in args.remoteport.split(',')]
    if not use_tracepoint:
        # the kprobe compares against the raw skc_dport value, which it
        # only converts with ntohs() on output, so pre-swap the constants;
        # the tracepoint program uses dport without any conversion
        dports = [ntohs(dport) for dport in dports]
    dports_if = ' && '.join(['dport != %d' % dport for dport in dports])
    bpf_text = bpf_text.replace('FILTER_DPORT',
        'if (%s) { birth.delete(&sk); return 0; }' % dports_if)
if args.localport:
    lports = [int(lport) for lport in args.localport.split(',')]
    lports_if = ' && '.join(['lport != %d' % lport for lport in lports])
    bpf_text = bpf_text.replace('FILTER_LPORT',
        'if (%s) { birth.delete(&sk); return 0; }' % lports_if)
bpf_text = bpf_text.replace('FILTER_PID', '')
bpf_text = bpf_text.replace('FILTER_DPORT', '')
bpf_text = bpf_text.replace('FILTER_LPORT', '')

# optionally dump the final program; --ebpf prints it and stops
if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()

# event data
TASK_COMM_LEN = 16      # linux/sched.h

class Data_ipv4(ct.Structure):
    """User-space view of the BPF program's struct ipv4_data_t.

    Field order and widths mirror the C definition; every numeric member
    is a 64-bit unsigned value and task is a fixed TASK_COMM_LEN buffer.
    """
    _fields_ = [
        ("ts_us", ct.c_ulonglong),
        ("pid", ct.c_ulonglong),
        ("saddr", ct.c_ulonglong),
        ("daddr", ct.c_ulonglong),
        ("ports", ct.c_ulonglong),      # lport in the high 32 bits, dport low
        ("rx_b", ct.c_ulonglong),
        ("tx_b", ct.c_ulonglong),
        ("span_us", ct.c_ulonglong),
        ("task", ct.c_char * TASK_COMM_LEN)
    ]

class Data_ipv6(ct.Structure):
    """User-space view of the BPF program's struct ipv6_data_t.

    Same layout as Data_ipv4 except that each address is 128 bits wide,
    represented here as a pair of 64-bit words.
    """
    _fields_ = [
        ("ts_us", ct.c_ulonglong),
        ("pid", ct.c_ulonglong),
        ("saddr", (ct.c_ulonglong * 2)),
        ("daddr", (ct.c_ulonglong * 2)),
        ("ports", ct.c_ulonglong),      # lport in the high 32 bits, dport low
        ("rx_b", ct.c_ulonglong),
        ("tx_b", ct.c_ulonglong),
        ("span_us", ct.c_ulonglong),
        ("task", ct.c_char * TASK_COMM_LEN)
    ]

#
# Setup output formats
#
# Don't change the default output (next 2 lines): this fits in 80 chars. I
# know it doesn't have NS or UIDs etc. I know. If you really, really, really
# need to add columns, columns that solve real actual problems, I'd start by
# adding an extended mode (-x) to include those columns.
#
# Column order in every variant: PID, COMM, IP version, LADDR, LPORT,
# RADDR, RPORT, TX_KB, RX_KB, MS. Later options override earlier ones, so
# csv takes precedence over wide.
#
header_string = "%-5s %-10.10s %s%-15s %-5s %-15s %-5s %5s %5s %s"
format_string = "%-5d %-10.10s %s%-15s %-5d %-15s %-5d %5d %5d %.2f"
if args.wide:
    header_string = "%-5s %-16.16s %-2s %-26s %-5s %-26s %-5s %6s %6s %s"
    format_string = "%-5d %-16.16s %-2s %-26s %-5d %-26s %-5d %6d %6d %.2f"
if args.csv:
    header_string = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"
    format_string = "%d,%s,%s,%s,%s,%s,%d,%d,%d,%.2f"

# process event
def print_ipv4_event(cpu, data, size):
    """Perf-buffer callback: print one output row for an IPv4 session.

    data is cast to a Data_ipv4 record; cpu and size are part of the
    callback signature and are not used. Optional time columns are printed
    first (controlled by args.time / args.timestamp), followed by the row
    rendered with the module-level format_string.
    """
    global start_ts
    event = ct.cast(data, ct.POINTER(Data_ipv4)).contents
    if args.time:
        if args.csv:
            print("%s," % strftime("%H:%M:%S"), end="")
        else:
            print("%-8s " % strftime("%H:%M:%S"), end="")
    if args.timestamp:
        # timestamps are relative to the first event seen by either callback
        if start_ts == 0:
            start_ts = event.ts_us
        delta_s = (float(event.ts_us) - start_ts) / 1000000
        if args.csv:
            print("%.6f," % delta_s, end="")
        else:
            print("%-9.6f " % delta_s, end="")
    # comm is raw bytes chosen by the traced process; substitute undecodable
    # bytes instead of raising and killing the event loop
    comm = event.task.decode('utf-8', 'replace')
    print(format_string % (event.pid, comm,
        "4" if args.wide or args.csv else "",
        inet_ntop(AF_INET, pack("I", event.saddr)), event.ports >> 32,
        inet_ntop(AF_INET, pack("I", event.daddr)), event.ports & 0xffffffff,
        event.tx_b / 1024, event.rx_b / 1024, float(event.span_us) / 1000))

def print_ipv6_event(cpu, data, size):
    """Perf-buffer callback: print one output row for an IPv6 session.

    data is cast to a Data_ipv6 record; cpu and size are part of the
    callback signature and are not used. Optional time columns are printed
    first (controlled by args.time / args.timestamp), followed by the row
    rendered with the module-level format_string.
    """
    global start_ts
    event = ct.cast(data, ct.POINTER(Data_ipv6)).contents
    if args.time:
        if args.csv:
            print("%s," % strftime("%H:%M:%S"), end="")
        else:
            print("%-8s " % strftime("%H:%M:%S"), end="")
    if args.timestamp:
        # timestamps are relative to the first event seen by either callback
        if start_ts == 0:
            start_ts = event.ts_us
        delta_s = (float(event.ts_us) - start_ts) / 1000000
        if args.csv:
            print("%.6f," % delta_s, end="")
        else:
            print("%-9.6f " % delta_s, end="")
    # comm is raw bytes chosen by the traced process; substitute undecodable
    # bytes instead of raising and killing the event loop
    comm = event.task.decode('utf-8', 'replace')
    print(format_string % (event.pid, comm,
        "6" if args.wide or args.csv else "",
        inet_ntop(AF_INET6, event.saddr), event.ports >> 32,
        inet_ntop(AF_INET6, event.daddr), event.ports & 0xffffffff,
        event.tx_b / 1024, event.rx_b / 1024, float(event.span_us) / 1000))

# initialize BPF
b = BPF(text=bpf_text)

# header
# The optional time columns are printed first, in the same order and
# widths the event callbacks use for each row.
if args.time:
    if args.csv:
        print("%s," % ("TIME"), end="")
    else:
        print("%-8s " % ("TIME"), end="")
if args.timestamp:
    if args.csv:
        print("%s," % ("TIME(s)"), end="")
    else:
        print("%-9s " % ("TIME(s)"), end="")
print(header_string % ("PID", "COMM",
    "IP" if args.wide or args.csv else "", "LADDR",
    "LPORT", "RADDR", "RPORT", "TX_KB", "RX_KB", "MS"))

# timestamp (us) of the first event printed; 0 means "not seen yet"
start_ts = 0

# read events
b["ipv4_events"].open_perf_buffer(print_ipv4_event, page_cnt=64)
b["ipv6_events"].open_perf_buffer(print_ipv6_event, page_cnt=64)
while 1:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop tracing: leave without a traceback
        exit()