blob: 26add2ac17233d095f3c51207524a5765e8480b6 [file] [log] [blame]
Brendan Greggafc97252016-02-19 16:07:36 -08001#!/usr/bin/python
2# @lint-avoid-python-3-compatibility-imports
3#
4# tcpconnlat Trace TCP active connection latency (connect).
5# For Linux, uses BCC, eBPF. Embedded C.
6#
7# USAGE: tcpconnlat [-h] [-t] [-p PID]
8#
9# This uses dynamic tracing of kernel functions, and will need to be updated
10# to match kernel changes.
11#
12# IPv4 addresses are printed as dotted quads. For IPv6 addresses, the last four
13# bytes are printed after "..."; check for future versions with better IPv6
14# support.
15#
16# Copyright 2016 Netflix, Inc.
17# Licensed under the Apache License, Version 2.0 (the "License")
18#
19# 19-Feb-2016 Brendan Gregg Created this.
20
21from __future__ import print_function
22from bcc import BPF
23import argparse
24import ctypes as ct
25
26# arguments
27examples = """examples:
28 ./tcpconnlat # trace all TCP connect()s
29 ./tcpconnlat -t # include timestamps
30 ./tcpconnlat -p 181 # only trace PID 181
31"""
32parser = argparse.ArgumentParser(
33 description="Trace TCP connects and show connection latency",
34 formatter_class=argparse.RawDescriptionHelpFormatter,
35 epilog=examples)
36parser.add_argument("-t", "--timestamp", action="store_true",
37 help="include timestamp on output")
38parser.add_argument("-p", "--pid",
39 help="trace this PID only")
40args = parser.parse_args()
41debug = 0
42
43# define BPF program
44bpf_text = """
45#include <uapi/linux/ptrace.h>
46#include <net/sock.h>
47#include <bcc/proto.h>
48
49struct info_t {
50 u64 ts;
51 u64 pid;
52 char task[TASK_COMM_LEN];
53};
54BPF_HASH(start, struct sock *, struct info_t);
55
56// separate data structs for ipv4 and ipv6
57struct ipv4_data_t {
58 // XXX: switch some to u32's when supported
59 u64 ts_us;
60 u64 pid;
61 u64 ip;
62 u64 saddr;
63 u64 daddr;
64 u64 dport;
65 u64 delta_us;
66 char task[TASK_COMM_LEN];
67};
68BPF_PERF_OUTPUT(ipv4_events);
69
70struct ipv6_data_t {
71 // XXX: update to transfer full ipv6 addrs
72 u64 ts_us;
73 u64 pid;
74 u64 ip;
75 u64 saddr;
76 u64 daddr;
77 u64 dport;
78 u64 delta_us;
79 char task[TASK_COMM_LEN];
80};
81BPF_PERF_OUTPUT(ipv6_events);
82
83int trace_connect(struct pt_regs *ctx, struct sock *sk)
84{
85 u32 pid = bpf_get_current_pid_tgid();
86 FILTER
87 struct info_t info = {.pid = pid};
88 info.ts = bpf_ktime_get_ns();
89 bpf_get_current_comm(&info.task, sizeof(info.task));
90 start.update(&sk, &info);
91 return 0;
92};
93
94// See tcp_v4_do_rcv() and tcp_v6_do_rcv(). So TCP_ESTBALISHED and TCP_LISTEN
95// are fast path and processed elsewhere, and leftovers are processed by
Brendan Gregg2482c1c2016-02-19 16:14:01 -080096// tcp_rcv_state_process(). We can trace this for handshake completion.
97// This should all be switched to static tracepoints when available.
Brendan Greggafc97252016-02-19 16:07:36 -080098int trace_tcp_rcv_state_process(struct pt_regs *ctx, struct sock *sk)
99{
Brendan Gregg4db6d442016-02-19 16:56:47 -0800100 // will be in TCP_SYN_SENT for handshake
101 if (sk->__sk_common.skc_state != TCP_SYN_SENT)
102 return 0;
Brendan Greggafc97252016-02-19 16:07:36 -0800103
104 // check start and calculate delta
105 struct info_t *infop = start.lookup(&sk);
106 if (infop == 0) {
107 return 0; // missed entry or filtered
108 }
109 u64 ts = infop->ts;
110 u64 now = bpf_ktime_get_ns();
111
112 // pull in details
113 u16 family = 0, dport = 0;
114 struct sock *skp = NULL;
115 bpf_probe_read(&skp, sizeof(skp), &sk);
116 bpf_probe_read(&family, sizeof(family), &skp->__sk_common.skc_family);
117 bpf_probe_read(&dport, sizeof(dport), &skp->__sk_common.skc_dport);
118
119 // emit to appropriate data path
120 if (family == AF_INET) {
121 struct ipv4_data_t data4 = {.pid = infop->pid, .ip = 4};
122 data4.ts_us = now / 1000;
123 bpf_probe_read(&data4.saddr, sizeof(u32),
124 &skp->__sk_common.skc_rcv_saddr);
125 bpf_probe_read(&data4.daddr, sizeof(u32),
126 &skp->__sk_common.skc_daddr);
127 data4.dport = ntohs(dport);
128 data4.delta_us = (now - ts) / 1000;
129 __builtin_memcpy(&data4.task, infop->task, sizeof(data4.task));
130 ipv4_events.perf_submit(ctx, &data4, sizeof(data4));
131
132 } else /* AF_INET6 */ {
133 struct ipv6_data_t data6 = {.pid = infop->pid, .ip = 6};
134 data6.ts_us = now / 1000;
135 // just grab the last 4 bytes for now
136 u32 saddr = 0, daddr = 0;
137 bpf_probe_read(&saddr, sizeof(saddr),
138 &skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32[3]);
139 bpf_probe_read(&daddr, sizeof(daddr),
140 &skp->__sk_common.skc_v6_daddr.in6_u.u6_addr32[3]);
141 data6.saddr = bpf_ntohl(saddr);
142 data6.daddr = bpf_ntohl(daddr);
143 data6.dport = ntohs(dport);
144 data6.delta_us = (now - ts) / 1000;
145 __builtin_memcpy(&data6.task, infop->task, sizeof(data6.task));
146 ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
147 }
148
149 start.delete(&sk);
150
151 return 0;
152}
153"""
154
155# code substitutions
156if args.pid:
157 bpf_text = bpf_text.replace('FILTER',
158 'if (pid != %s) { return 0; }' % args.pid)
159else:
160 bpf_text = bpf_text.replace('FILTER', '')
161if debug:
162 print(bpf_text)
163
164# initialize BPF
165b = BPF(text=bpf_text)
166b.attach_kprobe(event="tcp_v4_connect", fn_name="trace_connect")
167b.attach_kprobe(event="tcp_v6_connect", fn_name="trace_connect")
168b.attach_kprobe(event="tcp_rcv_state_process",
169 fn_name="trace_tcp_rcv_state_process")
170
171# event data
172TASK_COMM_LEN = 16 # linux/sched.h
173class Data_ipv4(ct.Structure):
174 _fields_ = [
175 ("ts_us", ct.c_ulonglong),
176 ("pid", ct.c_ulonglong),
177 ("ip", ct.c_ulonglong),
178 ("saddr", ct.c_ulonglong),
179 ("daddr", ct.c_ulonglong),
180 ("dport", ct.c_ulonglong),
181 ("delta_us", ct.c_ulonglong),
182 ("task", ct.c_char * TASK_COMM_LEN)
183 ]
184class Data_ipv6(ct.Structure):
185 _fields_ = [
186 ("ts_us", ct.c_ulonglong),
187 ("pid", ct.c_ulonglong),
188 ("ip", ct.c_ulonglong),
189 ("saddr", ct.c_ulonglong),
190 ("daddr", ct.c_ulonglong),
191 ("dport", ct.c_ulonglong),
192 ("delta_us", ct.c_ulonglong),
193 ("task", ct.c_char * TASK_COMM_LEN)
194 ]
195
196# functions
197def inet_ntoa(addr):
198 dq = ''
199 for i in range(0, 4):
200 dq = dq + str(addr & 0xff)
201 if (i != 3):
202 dq = dq + '.'
203 addr = addr >> 8
204 return dq
205
206# process event
207start_ts = 0
208def print_ipv4_event(cpu, data, size):
209 event = ct.cast(data, ct.POINTER(Data_ipv4)).contents
210 global start_ts
211 if args.timestamp:
212 if start_ts == 0:
213 start_ts = event.ts_us
214 print("%-9.3f" % ((event.ts_us - start_ts) / 100000), end="")
215 print("%-6d %-12.12s %-2d %-16s %-16s %-5d %.2f" % (event.pid, event.task,
216 event.ip, inet_ntoa(event.saddr), inet_ntoa(event.daddr),
217 event.dport, float(event.delta_us) / 1000))
218def print_ipv6_event(cpu, data, size):
219 event = ct.cast(data, ct.POINTER(Data_ipv6)).contents
220 global start_ts
221 if args.timestamp:
222 if start_ts == 0:
223 start_ts = event.ts_us
224 print("%-9.3f" % ((event.ts_us - start_ts) / 100000), end="")
225 print("%-6d %-12.12s %-2d ...%-13x ...%-13x %-5d %.2f" % (event.pid,
226 event.task, event.ip, event.saddr, event.daddr, event.dport,
227 float(event.delta_us) / 1000))
228
229# header
230if args.timestamp:
231 print("%-9s" % ("TIME(s)"), end="")
232print("%-6s %-12s %-2s %-16s %-16s %-5s %s" % ("PID", "COMM", "IP", "SADDR",
233 "DADDR", "DPORT", "LAT(ms)"))
234
235# read events
236b["ipv4_events"].open_perf_buffer(print_ipv4_event)
237b["ipv6_events"].open_perf_buffer(print_ipv6_event)
238while 1:
239 b.kprobe_poll()