introduce {attach|detach}_raw_tracepoint API
The motivation comes from pull request #1689.
It attached a kprobe bpf program to kernel function
ttwu_do_wakeup for more accurate tracing.
Unfortunately, it broke runqlat.py in my
4.17 environment since ttwu_do_wakeup function
is inlined in my kernel with gcc 7.3.1.
4.17 introduced raw_tracepoint and this patch
added the relevant API to bcc. With this,
we can use tracepoints
sched:{sched_wakeup, sched_wakeup_new, sched_switch}
to measure runq latency more reliably.
Signed-off-by: Yonghong Song <yhs@fb.com>
diff --git a/docs/reference_guide.md b/docs/reference_guide.md
index e24ac8a..63fb73c 100644
--- a/docs/reference_guide.md
+++ b/docs/reference_guide.md
@@ -14,6 +14,7 @@
- [4. uprobes](#4-uprobes)
- [5. uretprobes](#5-uretprobes)
- [6. USDT probes](#6-usdt-probes)
+ - [7. Raw Tracepoints](#7-raw-tracepoints)
- [Data](#data)
- [1. bpf_probe_read()](#1-bpf_probe_read)
- [2. bpf_probe_read_str()](#2-bpf_probe_read_str)
@@ -61,6 +62,7 @@
- [4. attach_uprobe()](#4-attach_uprobe)
- [5. attach_uretprobe()](#5-attach_uretprobe)
- [6. USDT.enable_probe()](#6-usdtenable_probe)
+ - [7. attach_raw_tracepoint()](#7-attach_raw_tracepoint)
- [Debug Output](#debug-output)
- [1. trace_print()](#1-trace_print)
- [2. trace_fields()](#2-trace_fields)
@@ -237,6 +239,35 @@
[search /examples](https://github.com/iovisor/bcc/search?q=bpf_usdt_readarg+path%3Aexamples&type=Code),
[search /tools](https://github.com/iovisor/bcc/search?q=bpf_usdt_readarg+path%3Atools&type=Code)
+### 7. Raw Tracepoints
+
+Syntax: RAW_TRACEPOINT_PROBE(*event*)
+
+This is a macro that instruments the raw tracepoint defined by *event*.
+
+The argument is a pointer to struct ```bpf_raw_tracepoint_args```, which is defined in [bpf.h](https://github.com/iovisor/bcc/blob/master/src/cc/compat/linux/bpf.h). The struct field ```args``` contains all parameters of the raw tracepoint, which you can find in the linux tree [include/trace/events](https://github.com/torvalds/linux/tree/master/include/trace/events)
+directory.
+
+For example:
+```C
+RAW_TRACEPOINT_PROBE(sched_switch)
+{
+ // TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next)
+ struct task_struct *prev = (struct task_struct *)ctx->args[1];
+ struct task_struct *next= (struct task_struct *)ctx->args[2];
+ s32 prev_tgid, next_tgid;
+
+ bpf_probe_read(&prev_tgid, sizeof(prev->tgid), &prev->tgid);
+ bpf_probe_read(&next_tgid, sizeof(next->tgid), &next->tgid);
+ bpf_trace_printk("%d -> %d\\n", prev_tgid, next_tgid);
+}
+```
+
+This instruments the sched:sched_switch tracepoint, and prints the prev and next tgid.
+
+Examples in situ:
+[search /tools](https://github.com/iovisor/bcc/search?q=RAW_TRACEPOINT_PROBE+path%3Atools&type=Code)
+
## Data
### 1. bpf_probe_read()
@@ -993,6 +1024,23 @@
[search /examples](https://github.com/iovisor/bcc/search?q=enable_probe+path%3Aexamples+language%3Apython&type=Code),
[search /tools](https://github.com/iovisor/bcc/search?q=enable_probe+path%3Atools+language%3Apython&type=Code)
+### 7. attach_raw_tracepoint()
+
+Syntax: ```BPF.attach_raw_tracepoint(tp="tracepoint", fn_name="name")```
+
+Instruments the kernel raw tracepoint described by ```tracepoint``` (```event``` only, no ```category```), and when hit, runs the BPF function ```name()```.
+
+This is an explicit way to instrument tracepoints. The ```RAW_TRACEPOINT_PROBE``` syntax, covered in the earlier raw tracepoints section, is an alternate method.
+
+For example:
+
+```Python
+b.attach_raw_tracepoint("sched_switch", "do_trace")
+```
+
+Examples in situ:
+[search /tools](https://github.com/iovisor/bcc/search?q=attach_raw_tracepoint+path%3Atools+language%3Apython&type=Code)
+
## Debug Output
### 1. trace_print()
diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h
index 2ed5ae1..11044cf 100644
--- a/src/cc/export/helpers.h
+++ b/src/cc/export/helpers.h
@@ -683,6 +683,9 @@
#define TRACEPOINT_PROBE(category, event) \
int tracepoint__##category##__##event(struct tracepoint__##category##__##event *args)
+#define RAW_TRACEPOINT_PROBE(event) \
+int raw_tracepoint__##event(struct bpf_raw_tracepoint_args *ctx)
+
#define TP_DATA_LOC_READ_CONST(dst, field, length) \
do { \
unsigned short __offset = args->data_loc_##field & 0xFFFF; \
diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c
index dc50270..722350e 100644
--- a/src/cc/libbpf.c
+++ b/src/cc/libbpf.c
@@ -1061,6 +1061,21 @@
return 0;
}
+int bpf_attach_raw_tracepoint(int progfd, char *tp_name)
+{
+ union bpf_attr attr;
+ int ret;
+
+ bzero(&attr, sizeof(attr));
+ attr.raw_tracepoint.name = ptr_to_u64(tp_name);
+ attr.raw_tracepoint.prog_fd = progfd;
+
+ ret = syscall(__NR_bpf, BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
+ if (ret < 0)
+ fprintf(stderr, "bpf_attach_raw_tracepoint (%s): %s\n", tp_name, strerror(errno));
+ return ret;
+}
+
void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb,
perf_reader_lost_cb lost_cb, void *cb_cookie,
int pid, int cpu, int page_cnt) {
diff --git a/src/cc/libbpf.h b/src/cc/libbpf.h
index e59d48a..589006f 100644
--- a/src/cc/libbpf.h
+++ b/src/cc/libbpf.h
@@ -81,6 +81,8 @@
const char *tp_name);
int bpf_detach_tracepoint(const char *tp_category, const char *tp_name);
+int bpf_attach_raw_tracepoint(int progfd, char *tp_name);
+
void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb,
perf_reader_lost_cb lost_cb, void *cb_cookie,
int pid, int cpu, int page_cnt);
diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py
index a1552e7..5dccd65 100644
--- a/src/python/bcc/__init__.py
+++ b/src/python/bcc/__init__.py
@@ -135,6 +135,17 @@
TRACEPOINT = 5
XDP = 6
PERF_EVENT = 7
+ CGROUP_SKB = 8
+ CGROUP_SOCK = 9
+ LWT_IN = 10
+ LWT_OUT = 11
+ LWT_XMIT = 12
+ SOCK_OPS = 13
+ SK_SKB = 14
+ CGROUP_DEVICE = 15
+ SK_MSG = 16
+ RAW_TRACEPOINT = 17
+ CGROUP_SOCK_ADDR = 18
# from xdp_action uapi/linux/bpf.h
XDP_ABORTED = 0
@@ -267,6 +278,7 @@
self.kprobe_fds = {}
self.uprobe_fds = {}
self.tracepoint_fds = {}
+ self.raw_tracepoint_fds = {}
self.perf_buffers = {}
self.open_perf_events = {}
self.tracefile = None
@@ -310,7 +322,8 @@
for usdt_context in usdt_contexts:
usdt_context.attach_uprobes(self)
- # If any "kprobe__" or "tracepoint__" prefixed functions were defined,
+ # If any "kprobe__" or "tracepoint__" or "raw_tracepoint__"
+ # prefixed functions were defined,
# they will be loaded and attached here.
self._trace_autoload()
@@ -725,6 +738,52 @@
self.tracepoint_fds[tp] = fd
return self
+ def attach_raw_tracepoint(self, tp=b"", fn_name=b""):
+ """attach_raw_tracepoint(self, tp=b"", fn_name=b"")
+
+ Run the bpf function denoted by fn_name every time the kernel tracepoint
+ specified by 'tp' is hit. The bpf function should be loaded as a
+ RAW_TRACEPOINT type. The fn_name is the kernel tracepoint name,
+ e.g., sched_switch, sys_enter_bind, etc.
+
+ Examples:
+ BPF(text).attach_raw_tracepoint(tp="sched_switch", fn_name="on_switch")
+ """
+
+ tp = _assert_is_bytes(tp)
+ if tp in self.raw_tracepoint_fds:
+ raise Exception("Raw tracepoint %s has been attached" % tp)
+
+ fn_name = _assert_is_bytes(fn_name)
+ fn = self.load_func(fn_name, BPF.RAW_TRACEPOINT)
+ fd = lib.bpf_attach_raw_tracepoint(fn.fd, tp)
+ if fd < 0:
+ raise Exception("Failed to attach BPF to raw tracepoint")
+ self.raw_tracepoint_fds[tp] = fd;
+ return self
+
+ def detach_raw_tracepoint(self, tp=b""):
+ """detach_raw_tracepoint(tp="")
+
+ Stop running the bpf function that is attached to the kernel tracepoint
+ specified by 'tp'.
+
+ Example: bpf.detach_raw_tracepoint("sched_switch")
+ """
+
+ tp = _assert_is_bytes(tp)
+ if tp not in self.raw_tracepoint_fds:
+ raise Exception("Raw tracepoint %s is not attached" % tp)
+ os.close(self.raw_tracepoint_fds[tp])
+ del self.raw_tracepoint_fds[tp]
+
+ @staticmethod
+ def support_raw_tracepoint():
+ # kernel symbol "bpf_find_raw_tracepoint" indicates raw_tracepoint support
+ if BPF.ksymname("bpf_find_raw_tracepoint") != -1:
+ return True
+ return False
+
def detach_tracepoint(self, tp=b""):
"""detach_tracepoint(tp="")
@@ -954,6 +1013,10 @@
fn = self.load_func(func_name, BPF.TRACEPOINT)
tp = fn.name[len(b"tracepoint__"):].replace(b"__", b":")
self.attach_tracepoint(tp=tp, fn_name=fn.name)
+ elif func_name.startswith(b"raw_tracepoint__"):
+ fn = self.load_func(func_name, BPF.RAW_TRACEPOINT)
+ tp = fn.name[len(b"raw_tracepoint__"):]
+ self.attach_raw_tracepoint(tp=tp, fn_name=fn.name)
def trace_open(self, nonblocking=False):
"""trace_open(nonblocking=False)
@@ -1154,6 +1217,8 @@
self.detach_uprobe_event(k)
for k, v in list(self.tracepoint_fds.items()):
self.detach_tracepoint(k)
+ for k, v in list(self.raw_tracepoint_fds.items()):
+ self.detach_raw_tracepoint(k)
# Clean up opened perf ring buffer and perf events
table_keys = list(self.tables.keys())
diff --git a/src/python/bcc/libbcc.py b/src/python/bcc/libbcc.py
index fd9f72b..e61227e 100644
--- a/src/python/bcc/libbcc.py
+++ b/src/python/bcc/libbcc.py
@@ -100,6 +100,8 @@
lib.bpf_attach_tracepoint.argtypes = [ct.c_int, ct.c_char_p, ct.c_char_p]
lib.bpf_detach_tracepoint.restype = ct.c_int
lib.bpf_detach_tracepoint.argtypes = [ct.c_char_p, ct.c_char_p]
+lib.bpf_attach_raw_tracepoint.restype = ct.c_int
+lib.bpf_attach_raw_tracepoint.argtypes = [ct.c_int, ct.c_char_p]
lib.bpf_open_perf_buffer.restype = ct.c_void_p
lib.bpf_open_perf_buffer.argtypes = [_RAW_CB_TYPE, _LOST_CB_TYPE, ct.py_object, ct.c_int, ct.c_int, ct.c_int]
lib.bpf_open_perf_event.restype = ct.c_int
diff --git a/tools/runqlat.py b/tools/runqlat.py
index 95657cd..ebda11d 100755
--- a/tools/runqlat.py
+++ b/tools/runqlat.py
@@ -95,7 +95,9 @@
start.update(&pid, &ts);
return 0;
}
+"""
+bpf_text_kprobe = """
int trace_wake_up_new_task(struct pt_regs *ctx, struct task_struct *p)
{
return trace_enqueue(p->tgid, p->pid);
@@ -144,6 +146,76 @@
}
"""
+bpf_text_raw_tp = """
+RAW_TRACEPOINT_PROBE(sched_wakeup)
+{
+ // TP_PROTO(struct task_struct *p)
+ struct task_struct *p = (struct task_struct *)ctx->args[0];
+ u32 tgid, pid;
+
+ bpf_probe_read(&tgid, sizeof(tgid), &p->tgid);
+ bpf_probe_read(&pid, sizeof(pid), &p->pid);
+ return trace_enqueue(tgid, pid);
+}
+
+RAW_TRACEPOINT_PROBE(sched_wakeup_new)
+{
+ // TP_PROTO(struct task_struct *p)
+ struct task_struct *p = (struct task_struct *)ctx->args[0];
+ u32 tgid, pid;
+
+ bpf_probe_read(&tgid, sizeof(tgid), &p->tgid);
+ bpf_probe_read(&pid, sizeof(pid), &p->pid);
+ return trace_enqueue(tgid, pid);
+}
+
+RAW_TRACEPOINT_PROBE(sched_switch)
+{
+ // TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next)
+ struct task_struct *prev = (struct task_struct *)ctx->args[1];
+ struct task_struct *next= (struct task_struct *)ctx->args[2];
+ u32 pid, tgid;
+ long state;
+
+ // ivcsw: treat like an enqueue event and store timestamp
+ bpf_probe_read(&state, sizeof(long), &prev->state);
+ if (state == TASK_RUNNING) {
+ bpf_probe_read(&tgid, sizeof(prev->tgid), &prev->tgid);
+ bpf_probe_read(&pid, sizeof(prev->pid), &prev->pid);
+ if (!(FILTER)) {
+ u64 ts = bpf_ktime_get_ns();
+ start.update(&pid, &ts);
+ }
+ }
+
+ bpf_probe_read(&tgid, sizeof(next->tgid), &next->tgid);
+ bpf_probe_read(&pid, sizeof(next->pid), &next->pid);
+ if (FILTER)
+ return 0;
+ u64 *tsp, delta;
+
+ // fetch timestamp and calculate delta
+ tsp = start.lookup(&pid);
+ if (tsp == 0) {
+ return 0; // missed enqueue
+ }
+ delta = bpf_ktime_get_ns() - *tsp;
+ FACTOR
+
+ // store as histogram
+ STORE
+
+ start.delete(&pid);
+ return 0;
+}
+"""
+
+is_support_raw_tp = BPF.support_raw_tracepoint()
+if is_support_raw_tp:
+ bpf_text += bpf_text_raw_tp
+else:
+ bpf_text += bpf_text_kprobe
+
# code substitutions
if args.pid:
# pid from userspace point of view is thread group from kernel pov
@@ -186,9 +258,10 @@
# load BPF program
b = BPF(text=bpf_text)
-b.attach_kprobe(event="ttwu_do_wakeup", fn_name="trace_ttwu_do_wakeup")
-b.attach_kprobe(event="wake_up_new_task", fn_name="trace_wake_up_new_task")
-b.attach_kprobe(event="finish_task_switch", fn_name="trace_run")
+if not is_support_raw_tp:
+ b.attach_kprobe(event="ttwu_do_wakeup", fn_name="trace_ttwu_do_wakeup")
+ b.attach_kprobe(event="wake_up_new_task", fn_name="trace_wake_up_new_task")
+ b.attach_kprobe(event="finish_task_switch", fn_name="trace_run")
print("Tracing run queue latency... Hit Ctrl-C to end.")