// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021 Google LLC.
 *
 * Based on klockstat from BCC by Jiri Olsa and others
 * 2021-10-26 Barret Rhoden Created this.
 */
#include "vmlinux.h"
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "klockstat.h"
#include "bits.bpf.h"
const volatile pid_t targ_tgid = 0;
const volatile pid_t targ_pid = 0;
struct mutex *const volatile targ_lock = NULL;
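
/* Call stacks of the lock acquisition sites, keyed by stack id. */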
struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(max_entries, MAX_ENTRIES);
	__uint(key_size, sizeof(u32));
	__uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64));
} stack_map SEC(".maps");

/*
 * Uniquely identifies a task grabbing a particular lock; a task can only hold
 * the same lock once (non-recursive mutexes).
 */
struct task_lock {
	u64 task_id;
	u64 lock_ptr;
};
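
/* Timestamps for one in-flight acquisition by one task. */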
struct lockholder_info {
	s32 stack_id;
	u64 task_id;
	u64 try_at;
	u64 acq_at;
	u64 rel_at;
};
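
/* In-flight acquisitions, keyed by (task, lock). */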
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, MAX_ENTRIES);
	__type(key, struct task_lock);
	__type(value, struct lockholder_info);
} lockholder_map SEC(".maps");

/*
 * Keyed by stack_id.
 *
 * Multiple call sites may have the same underlying lock, but we only know the
 * stats for a particular stack frame. Multiple tasks may have the same
 * stack frame.
 */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, MAX_ENTRIES);
	__type(key, s32);
	__type(value, struct lock_stat);
} stat_map SEC(".maps");
static bool tracing_task(u64 task_id)
{
	u32 tgid = task_id >> 32;
	u32 pid = task_id;

	if (targ_tgid && targ_tgid != tgid)
		return false;
	if (targ_pid && targ_pid != pid)
		return false;
	return true;
}
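
/* Record that the current task started trying to take @lock: stash the call
 * stack and the "try" timestamp in lockholder_map. */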
static void lock_contended(void *ctx, struct mutex *lock)
{
	u64 task_id;
	struct lockholder_info li[1] = {0};
	struct task_lock tl = {};

	if (targ_lock && targ_lock != lock)
		return;
	task_id = bpf_get_current_pid_tgid();
	if (!tracing_task(task_id))
		return;

	li->task_id = task_id;
	/*
	 * Skip 4 frames, e.g.:
	 *	__this_module+0x34ef
	 *	__this_module+0x34ef
	 *	__this_module+0x8c44
	 *	mutex_lock+0x5
	 *
	 * Note: if you make major changes to this bpf program, double check
	 * that you aren't skipping too many frames.
	 */
	li->stack_id = bpf_get_stackid(ctx, &stack_map,
				       4 | BPF_F_FAST_STACK_CMP);
	/* Legit failures include EEXIST */
	if (li->stack_id < 0)
		return;
	li->try_at = bpf_ktime_get_ns();

	tl.task_id = task_id;
	tl.lock_ptr = (u64)lock;
	bpf_map_update_elem(&lockholder_map, &tl, li, BPF_ANY);
}
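
/* Forget a pending acquisition that was interrupted or killed. */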
static void lock_aborted(struct mutex *lock)
{
	u64 task_id;
	struct task_lock tl = {};

	if (targ_lock && targ_lock != lock)
		return;
	task_id = bpf_get_current_pid_tgid();
	if (!tracing_task(task_id))
		return;
	tl.task_id = task_id;
	tl.lock_ptr = (u64)lock;
	bpf_map_delete_elem(&lockholder_map, &tl);
}
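
/* Record the time at which the current task acquired @lock. */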
static void lock_acquired(struct mutex *lock)
{
	u64 task_id;
	struct lockholder_info *li;
	struct task_lock tl = {};

	if (targ_lock && targ_lock != lock)
		return;
	task_id = bpf_get_current_pid_tgid();
	if (!tracing_task(task_id))
		return;
	tl.task_id = task_id;
	tl.lock_ptr = (u64)lock;
	li = bpf_map_lookup_elem(&lockholder_map, &tl);
	if (!li)
		return;
	li->acq_at = bpf_ktime_get_ns();
}
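
/* Fold one completed acquire/hold cycle into the per-stack statistics. */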
static void account(struct lockholder_info *li)
{
	struct lock_stat *ls;
	u64 delta;

	/*
	 * Multiple threads may have the same stack_id. Even though we are
	 * holding the lock, dynamically allocated mutexes can have the same
	 * callgraph but represent different locks. They will be accounted as
	 * the same lock, which is what we want, but we need to use atomics to
	 * avoid corruption, especially for the total_time variables.
	 */
	ls = bpf_map_lookup_elem(&stat_map, &li->stack_id);
	if (!ls) {
		struct lock_stat fresh = {0};

		bpf_map_update_elem(&stat_map, &li->stack_id, &fresh, BPF_ANY);
		ls = bpf_map_lookup_elem(&stat_map, &li->stack_id);
		if (!ls)
			return;
	}

	delta = li->acq_at - li->try_at;
	__sync_fetch_and_add(&ls->acq_count, 1);
	__sync_fetch_and_add(&ls->acq_total_time, delta);
	if (delta > READ_ONCE(ls->acq_max_time)) {
		WRITE_ONCE(ls->acq_max_time, delta);
		WRITE_ONCE(ls->acq_max_id, li->task_id);
		/*
		 * Potentially racy, if multiple threads think they are the max,
		 * so you may get a clobbered write.
		 */
		bpf_get_current_comm(ls->acq_max_comm, TASK_COMM_LEN);
	}

	delta = li->rel_at - li->acq_at;
	__sync_fetch_and_add(&ls->hld_count, 1);
	__sync_fetch_and_add(&ls->hld_total_time, delta);
	if (delta > READ_ONCE(ls->hld_max_time)) {
		WRITE_ONCE(ls->hld_max_time, delta);
		WRITE_ONCE(ls->hld_max_id, li->task_id);
		bpf_get_current_comm(ls->hld_max_comm, TASK_COMM_LEN);
	}
}
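
/* Record the release time, account the completed cycle, and drop the entry. */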
static void lock_released(struct mutex *lock)
{
	u64 task_id;
	struct lockholder_info *li;
	struct task_lock tl = {};

	if (targ_lock && targ_lock != lock)
		return;
	task_id = bpf_get_current_pid_tgid();
	if (!tracing_task(task_id))
		return;
	tl.task_id = task_id;
	tl.lock_ptr = (u64)lock;
	li = bpf_map_lookup_elem(&lockholder_map, &tl);
	if (!li)
		return;
	li->rel_at = bpf_ktime_get_ns();
	account(li);
	bpf_map_delete_elem(&lockholder_map, &tl);
}
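
/*
 * fentry/fexit attachments on the mutex entry points. The fentry hook marks
 * the start of the wait; the fexit hook marks acquisition, or an aborted
 * attempt for the interruptible/killable variants.
 */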
SEC("fentry/mutex_lock")
int BPF_PROG(mutex_lock, struct mutex *lock)
{
lock_contended(ctx, lock);
return 0;
}
SEC("fexit/mutex_lock")
int BPF_PROG(mutex_lock_exit, struct mutex *lock, long ret)
{
lock_acquired(lock);
return 0;
}
SEC("fexit/mutex_trylock")
int BPF_PROG(mutex_trylock_exit, struct mutex *lock, long ret)
{
if (ret) {
lock_contended(ctx, lock);
lock_acquired(lock);
}
return 0;
}
SEC("fentry/mutex_lock_interruptible")
int BPF_PROG(mutex_lock_interruptible, struct mutex *lock)
{
lock_contended(ctx, lock);
return 0;
}
SEC("fexit/mutex_lock_interruptible")
int BPF_PROG(mutex_lock_interruptible_exit, struct mutex *lock, long ret)
{
if (ret)
lock_aborted(lock);
else
lock_acquired(lock);
return 0;
}
SEC("fentry/mutex_lock_killable")
int BPF_PROG(mutex_lock_killable, struct mutex *lock)
{
lock_contended(ctx, lock);
return 0;
}
SEC("fexit/mutex_lock_killable")
int BPF_PROG(mutex_lock_killable_exit, struct mutex *lock, long ret)
{
if (ret)
lock_aborted(lock);
else
lock_acquired(lock);
return 0;
}
SEC("fentry/mutex_unlock")
int BPF_PROG(mutex_unlock, struct mutex *lock)
{
lock_released(lock);
return 0;
}
char LICENSE[] SEC("license") = "GPL";