tools: add filtering by mount namespace
In previous patches, I added the --cgroupmap option to filter events
belonging to a set of cgroup-v2 cgroups. Although this approach works
fine with systemd services and containers when cgroup-v2 is enabled, it
does not work with containers when only cgroup-v1 is enabled, because
bpf_get_current_cgroup_id() only works with cgroup-v2. It also requires
Linux 4.18 or later, since that is when this BPF helper was introduced.
This patch adds an additional way to filter by containers, using mount
namespaces.
Note that this does not help with systemd services since they normally
don't create a new mount namespace (unless you set some options like
'ReadOnlyPaths=', see "man 5 systemd.exec").
My goal with this patch is to filter Kubernetes pods, even on
distributions with an older kernel (<4.18) or without cgroup-v2 enabled.
- This is only implemented for tools that already support filtering by
cgroup id (bindsnoop, capable, execsnoop, profile, tcpaccept, tcpconnect,
tcptop and tcptracer).
- I picked the mount namespace because the other namespaces could be
disabled in Kubernetes (e.g. HostNetwork, HostPID, HostIPC).
It can be tested by following the example in docs/special_filtering added
in this commit. To avoid compiling locally, the following commands can be
used:
```
sudo bpftool map create /sys/fs/bpf/mnt_ns_set type hash key 8 value 4 \
entries 128 name mnt_ns_set flags 0
docker run -ti --rm --privileged \
-v /usr/src:/usr/src -v /lib/modules:/lib/modules \
-v /sys/fs/bpf:/sys/fs/bpf --pid=host kinvolk/bcc:alban-containers-filters \
/usr/share/bcc/tools/execsnoop --mntnsmap /sys/fs/bpf/mnt_ns_set
```
Co-authored-by: Alban Crequy <alban@kinvolk.io>
Co-authored-by: Mauricio Vásquez <mauricio@kinvolk.io>
diff --git a/tools/tcpaccept.py b/tools/tcpaccept.py
index 03b05e0..4aa7fd7 100755
--- a/tools/tcpaccept.py
+++ b/tools/tcpaccept.py
@@ -16,6 +16,7 @@
# 14-Feb-2016 " " Switch to bpf_perf_output.
from __future__ import print_function
+from bcc.containers import filter_by_containers
from bcc import BPF
from socket import inet_ntop, AF_INET, AF_INET6
from struct import pack
@@ -29,7 +30,8 @@
./tcpaccept -t # include timestamps
./tcpaccept -P 80,81 # only trace port 80 and 81
./tcpaccept -p 181 # only trace PID 181
- ./tcpaccept --cgroupmap ./mappath # only trace cgroups in this BPF map
+ ./tcpaccept --cgroupmap mappath # only trace cgroups in this BPF map
+ ./tcpaccept --mntnsmap mappath # only trace mount namespaces in the map
"""
parser = argparse.ArgumentParser(
description="Trace TCP accepts",
@@ -45,6 +47,8 @@
help="comma-separated list of local ports to trace")
parser.add_argument("--cgroupmap",
help="trace cgroups in this BPF map only")
+parser.add_argument("--mntnsmap",
+ help="trace mount namespaces in this BPF map only")
parser.add_argument("--ebpf", action="store_true",
help=argparse.SUPPRESS)
args = parser.parse_args()
@@ -80,11 +84,6 @@
char task[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(ipv6_events);
-
-#if CGROUPSET
-BPF_TABLE_PINNED("hash", u64, u64, cgroupset, 1024, "CGROUPPATH");
-#endif
-
"""
#
@@ -97,12 +96,9 @@
bpf_text_kprobe = """
int kretprobe__inet_csk_accept(struct pt_regs *ctx)
{
-#if CGROUPSET
- u64 cgroupid = bpf_get_current_cgroup_id();
- if (cgroupset.lookup(&cgroupid) == NULL) {
+ if (container_should_be_filtered()) {
return 0;
}
-#endif
struct sock *newsk = (struct sock *)PT_REGS_RC(ctx);
u32 pid = bpf_get_current_pid_tgid() >> 32;
@@ -115,21 +111,21 @@
// check this is TCP
u8 protocol = 0;
// workaround for reading the sk_protocol bitfield:
-
+
// Following comments add by Joe Yin:
// Unfortunately,it can not work since Linux 4.10,
// because the sk_wmem_queued is not following the bitfield of sk_protocol.
// And the following member is sk_gso_max_segs.
// So, we can use this:
// bpf_probe_read(&protocol, 1, (void *)((u64)&newsk->sk_gso_max_segs) - 3);
- // In order to diff the pre-4.10 and 4.10+ ,introduce the variables gso_max_segs_offset,sk_lingertime,
- // sk_lingertime is closed to the gso_max_segs_offset,and
- // the offset between the two members is 4
+ // In order to diff the pre-4.10 and 4.10+ ,introduce the variables gso_max_segs_offset,sk_lingertime,
+ // sk_lingertime is closed to the gso_max_segs_offset,and
+ // the offset between the two members is 4
int gso_max_segs_offset = offsetof(struct sock, sk_gso_max_segs);
int sk_lingertime_offset = offsetof(struct sock, sk_lingertime);
- if (sk_lingertime_offset - gso_max_segs_offset == 4)
+ if (sk_lingertime_offset - gso_max_segs_offset == 4)
// 4.10+ with little endian
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
protocol = *(u8 *)((u64)&newsk->sk_gso_max_segs - 3);
@@ -199,11 +195,7 @@
lports_if = ' && '.join(['lport != %d' % lport for lport in lports])
bpf_text = bpf_text.replace('##FILTER_PORT##',
'if (%s) { return 0; }' % lports_if)
-if args.cgroupmap:
- bpf_text = bpf_text.replace('CGROUPSET', '1')
- bpf_text = bpf_text.replace('CGROUPPATH', args.cgroupmap)
-else:
- bpf_text = bpf_text.replace('CGROUPSET', '0')
+bpf_text = filter_by_containers(args) + bpf_text
if debug or args.ebpf:
print(bpf_text)
if args.ebpf: