blob: 5d323c80bffb8ebecd31af6e414c57615b1b085a [file] [log] [blame]
Brendan Greggafc97252016-02-19 16:07:36 -08001#!/usr/bin/python
2# @lint-avoid-python-3-compatibility-imports
3#
4# tcpconnlat Trace TCP active connection latency (connect).
5# For Linux, uses BCC, eBPF. Embedded C.
6#
7# USAGE: tcpconnlat [-h] [-t] [-p PID]
8#
9# This uses dynamic tracing of kernel functions, and will need to be updated
10# to match kernel changes.
11#
12# IPv4 addresses are printed as dotted quads. For IPv6 addresses, the last four
13# bytes are printed after "..."; check for future versions with better IPv6
14# support.
15#
16# Copyright 2016 Netflix, Inc.
17# Licensed under the Apache License, Version 2.0 (the "License")
18#
19# 19-Feb-2016 Brendan Gregg Created this.
20
21from __future__ import print_function
22from bcc import BPF
23import argparse
24import ctypes as ct
25
26# arguments
27examples = """examples:
28 ./tcpconnlat # trace all TCP connect()s
29 ./tcpconnlat -t # include timestamps
30 ./tcpconnlat -p 181 # only trace PID 181
31"""
32parser = argparse.ArgumentParser(
33 description="Trace TCP connects and show connection latency",
34 formatter_class=argparse.RawDescriptionHelpFormatter,
35 epilog=examples)
36parser.add_argument("-t", "--timestamp", action="store_true",
37 help="include timestamp on output")
38parser.add_argument("-p", "--pid",
39 help="trace this PID only")
40args = parser.parse_args()
41debug = 0
42
43# define BPF program
44bpf_text = """
45#include <uapi/linux/ptrace.h>
46#include <net/sock.h>
47#include <bcc/proto.h>
48
49struct info_t {
50 u64 ts;
51 u64 pid;
52 char task[TASK_COMM_LEN];
53};
54BPF_HASH(start, struct sock *, struct info_t);
55
56// separate data structs for ipv4 and ipv6
57struct ipv4_data_t {
58 // XXX: switch some to u32's when supported
59 u64 ts_us;
60 u64 pid;
61 u64 ip;
62 u64 saddr;
63 u64 daddr;
64 u64 dport;
65 u64 delta_us;
66 char task[TASK_COMM_LEN];
67};
68BPF_PERF_OUTPUT(ipv4_events);
69
70struct ipv6_data_t {
71 // XXX: update to transfer full ipv6 addrs
72 u64 ts_us;
73 u64 pid;
74 u64 ip;
75 u64 saddr;
76 u64 daddr;
77 u64 dport;
78 u64 delta_us;
79 char task[TASK_COMM_LEN];
80};
81BPF_PERF_OUTPUT(ipv6_events);
82
83int trace_connect(struct pt_regs *ctx, struct sock *sk)
84{
85 u32 pid = bpf_get_current_pid_tgid();
86 FILTER
87 struct info_t info = {.pid = pid};
88 info.ts = bpf_ktime_get_ns();
89 bpf_get_current_comm(&info.task, sizeof(info.task));
90 start.update(&sk, &info);
91 return 0;
92};
93
94// See tcp_v4_do_rcv() and tcp_v6_do_rcv(). So TCP_ESTBALISHED and TCP_LISTEN
95// are fast path and processed elsewhere, and leftovers are processed by
Brendan Gregg2482c1c2016-02-19 16:14:01 -080096// tcp_rcv_state_process(). We can trace this for handshake completion.
97// This should all be switched to static tracepoints when available.
Brendan Greggafc97252016-02-19 16:07:36 -080098int trace_tcp_rcv_state_process(struct pt_regs *ctx, struct sock *sk)
99{
100 u32 pid = bpf_get_current_pid_tgid();
101
102 // check start and calculate delta
103 struct info_t *infop = start.lookup(&sk);
104 if (infop == 0) {
105 return 0; // missed entry or filtered
106 }
107 u64 ts = infop->ts;
108 u64 now = bpf_ktime_get_ns();
109
110 // pull in details
111 u16 family = 0, dport = 0;
112 struct sock *skp = NULL;
113 bpf_probe_read(&skp, sizeof(skp), &sk);
114 bpf_probe_read(&family, sizeof(family), &skp->__sk_common.skc_family);
115 bpf_probe_read(&dport, sizeof(dport), &skp->__sk_common.skc_dport);
116
117 // emit to appropriate data path
118 if (family == AF_INET) {
119 struct ipv4_data_t data4 = {.pid = infop->pid, .ip = 4};
120 data4.ts_us = now / 1000;
121 bpf_probe_read(&data4.saddr, sizeof(u32),
122 &skp->__sk_common.skc_rcv_saddr);
123 bpf_probe_read(&data4.daddr, sizeof(u32),
124 &skp->__sk_common.skc_daddr);
125 data4.dport = ntohs(dport);
126 data4.delta_us = (now - ts) / 1000;
127 __builtin_memcpy(&data4.task, infop->task, sizeof(data4.task));
128 ipv4_events.perf_submit(ctx, &data4, sizeof(data4));
129
130 } else /* AF_INET6 */ {
131 struct ipv6_data_t data6 = {.pid = infop->pid, .ip = 6};
132 data6.ts_us = now / 1000;
133 // just grab the last 4 bytes for now
134 u32 saddr = 0, daddr = 0;
135 bpf_probe_read(&saddr, sizeof(saddr),
136 &skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[3]);
137 bpf_probe_read(&daddr, sizeof(daddr),
138 &skp->__sk_common.skc_v6_daddr.in6_u.u6_addr32[3]);
139 data6.saddr = bpf_ntohl(saddr);
140 data6.daddr = bpf_ntohl(daddr);
141 data6.dport = ntohs(dport);
142 data6.delta_us = (now - ts) / 1000;
143 __builtin_memcpy(&data6.task, infop->task, sizeof(data6.task));
144 ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
145 }
146
147 start.delete(&sk);
148
149 return 0;
150}
151"""
152
153# code substitutions
154if args.pid:
155 bpf_text = bpf_text.replace('FILTER',
156 'if (pid != %s) { return 0; }' % args.pid)
157else:
158 bpf_text = bpf_text.replace('FILTER', '')
159if debug:
160 print(bpf_text)
161
162# initialize BPF
163b = BPF(text=bpf_text)
164b.attach_kprobe(event="tcp_v4_connect", fn_name="trace_connect")
165b.attach_kprobe(event="tcp_v6_connect", fn_name="trace_connect")
166b.attach_kprobe(event="tcp_rcv_state_process",
167 fn_name="trace_tcp_rcv_state_process")
168
169# event data
170TASK_COMM_LEN = 16 # linux/sched.h
171class Data_ipv4(ct.Structure):
172 _fields_ = [
173 ("ts_us", ct.c_ulonglong),
174 ("pid", ct.c_ulonglong),
175 ("ip", ct.c_ulonglong),
176 ("saddr", ct.c_ulonglong),
177 ("daddr", ct.c_ulonglong),
178 ("dport", ct.c_ulonglong),
179 ("delta_us", ct.c_ulonglong),
180 ("task", ct.c_char * TASK_COMM_LEN)
181 ]
182class Data_ipv6(ct.Structure):
183 _fields_ = [
184 ("ts_us", ct.c_ulonglong),
185 ("pid", ct.c_ulonglong),
186 ("ip", ct.c_ulonglong),
187 ("saddr", ct.c_ulonglong),
188 ("daddr", ct.c_ulonglong),
189 ("dport", ct.c_ulonglong),
190 ("delta_us", ct.c_ulonglong),
191 ("task", ct.c_char * TASK_COMM_LEN)
192 ]
193
194# functions
195def inet_ntoa(addr):
196 dq = ''
197 for i in range(0, 4):
198 dq = dq + str(addr & 0xff)
199 if (i != 3):
200 dq = dq + '.'
201 addr = addr >> 8
202 return dq
203
204# process event
205start_ts = 0
206def print_ipv4_event(cpu, data, size):
207 event = ct.cast(data, ct.POINTER(Data_ipv4)).contents
208 global start_ts
209 if args.timestamp:
210 if start_ts == 0:
211 start_ts = event.ts_us
212 print("%-9.3f" % ((event.ts_us - start_ts) / 100000), end="")
213 print("%-6d %-12.12s %-2d %-16s %-16s %-5d %.2f" % (event.pid, event.task,
214 event.ip, inet_ntoa(event.saddr), inet_ntoa(event.daddr),
215 event.dport, float(event.delta_us) / 1000))
216def print_ipv6_event(cpu, data, size):
217 event = ct.cast(data, ct.POINTER(Data_ipv6)).contents
218 global start_ts
219 if args.timestamp:
220 if start_ts == 0:
221 start_ts = event.ts_us
222 print("%-9.3f" % ((event.ts_us - start_ts) / 100000), end="")
223 print("%-6d %-12.12s %-2d ...%-13x ...%-13x %-5d %.2f" % (event.pid,
224 event.task, event.ip, event.saddr, event.daddr, event.dport,
225 float(event.delta_us) / 1000))
226
227# header
228if args.timestamp:
229 print("%-9s" % ("TIME(s)"), end="")
230print("%-6s %-12s %-2s %-16s %-16s %-5s %s" % ("PID", "COMM", "IP", "SADDR",
231 "DADDR", "DPORT", "LAT(ms)"))
232
233# read events
234b["ipv4_events"].open_perf_buffer(print_ipv4_event)
235b["ipv6_events"].open_perf_buffer(print_ipv6_event)
236while 1:
237 b.kprobe_poll()