Blame - tools/zfsslower.py - platform/external/bcc

blob: 4337ead4202964625f1d18be82d0821c4d45328d [file] [log] [blame]

Brendan Gregg	bc54bb6	2016-02-14 23:13:13 -0800	[diff] [blame^]	1	#!/usr/bin/python
				2	# @lint-avoid-python-3-compatibility-imports
				3	#
				4	# zfsslower Trace slow ZFS operations.
				5	# For Linux, uses BCC, eBPF.
				6	#
				7	# USAGE: zfsslower [-h] [-j] [-p PID] [min_ms]
				8	#
				9	# This script traces common ZFS file operations: reads, writes, opens, and
				10	# syncs. It measures the time spent in these operations, and prints details
				11	# for each that exceeded a threshold.
				12	#
				13	# WARNING: This adds low-overhead instrumentation to these ZFS operations,
				14	# including reads and writes from the file system cache. Such reads and writes
				15	# can be very frequent (depending on the workload; eg, 1M/sec), at which
				16	# point the overhead of this tool (even if it prints no "slower" events) can
				17	# begin to become significant.
				18	#
				19	# This works by using kernel dynamic tracing of the ZPL interface, and will
				20	# need updates to match any changes to this interface.
				21	#
				22	# By default, a minimum millisecond threshold of 10 is used.
				23	#
				24	# Copyright 2016 Netflix, Inc.
				25	# Licensed under the Apache License, Version 2.0 (the "License")
				26	#
				27	# 14-Feb-2016 Brendan Gregg Created this.
				28
				29	from __future__ import print_function
				30	from bcc import BPF
				31	import argparse
				32	from time import strftime
				33	import ctypes as ct
				34
				# arguments
				# Help epilog; assembled from separate lines so the text does not depend on
				# how this source file happens to be indented.
				examples = "\n".join((
				    "examples:",
				    "./zfsslower # trace operations slower than 10 ms (default)",
				    "./zfsslower 1 # trace operations slower than 1 ms",
				    "./zfsslower -j 1 # ... 1 ms, parsable output (csv)",
				    "./zfsslower 0 # trace all operations (warning: verbose)",
				    "./zfsslower -p 185 # trace PID 185 only",
				    "",
				))
				parser = argparse.ArgumentParser(
				    description="Trace common ZFS file operations slower than a threshold",
				    formatter_class=argparse.RawDescriptionHelpFormatter,
				    epilog=examples)
				parser.add_argument("-j", "--csv", action="store_true",
				    help="just print fields: comma-separated values")
				parser.add_argument("-p", "--pid",
				    help="trace this PID only")
				parser.add_argument("min_ms", nargs="?", default='10',
				    help="minimum I/O duration to trace, in ms (default 10)")
				args = parser.parse_args()

				# Validate up front and fail with a normal usage error instead of a bare
				# traceback (or, worse, a silently useless trace).
				try:
				    min_ms = int(args.min_ms)
				except ValueError:
				    parser.error("min_ms must be an integer, got %r" % args.min_ms)
				if min_ms < 0:
				    # a negative threshold would be compared against an unsigned latency
				    # in the BPF program and filter out every event
				    parser.error("min_ms must not be negative")
				# The PID text is pasted verbatim into the BPF C source further down, so
				# accept nothing but a non-empty run of ASCII digits.
				if args.pid is not None and (not args.pid or args.pid.strip("0123456789")):
				    parser.error("PID must be a non-negative integer, got %r" % args.pid)
				pid = args.pid
				csv = args.csv
				debug = 0
				58
				59	# define BPF program
				60	bpf_text = """
				61	#include <uapi/linux/ptrace.h>
				62	#include <linux/fs.h>
				63	#include <linux/sched.h>
				64	#include <linux/dcache.h>
				65
				66	// XXX: switch these to char's when supported
				67	#define TRACE_READ 0
				68	#define TRACE_WRITE 1
				69	#define TRACE_OPEN 2
				70	#define TRACE_FSYNC 3
				71
				72	struct val_t {
				73	u64 ts;
				74	u64 offset;
				75	struct file *fp;
				76	};
				77
				78	struct data_t {
				79	// XXX: switch some to u32's when supported
				80	u64 ts_us;
				81	u64 type;
				82	u64 size;
				83	u64 offset;
				84	u64 delta_us;
				85	u64 pid;
				86	char task[TASK_COMM_LEN];
				87	char file[DNAME_INLINE_LEN];
				88	};
				89
				90	BPF_HASH(entryinfo, pid_t, struct val_t);
				91	BPF_PERF_OUTPUT(events);
				92
				93	//
				94	// Store timestamp and size on entry
				95	//
				96
				97	// zpl_read(), zpl_write():
				98	int trace_rw_entry(struct pt_regs ctx, struct file filp, char __user *buf,
				99	size_t len, loff_t *ppos)
				100	{
				101	u32 pid;
				102	pid = bpf_get_current_pid_tgid();
				103	if (FILTER_PID)
				104	return 0;
				105
				106	// store filep and timestamp by pid
				107	struct val_t val = {};
				108	val.ts = bpf_ktime_get_ns();
				109	val.fp = filp;
				110	val.offset = *ppos;
				111	if (val.fp)
				112	entryinfo.update(&pid, &val);
				113
				114	return 0;
				115	}
				116
				117	// zpl_open():
				118	int trace_open_entry(struct pt_regs ctx, struct inode inode,
				119	struct file *filp)
				120	{
				121	u32 pid;
				122	pid = bpf_get_current_pid_tgid();
				123	if (FILTER_PID)
				124	return 0;
				125
				126	// store filep and timestamp by pid
				127	struct val_t val = {};
				128	val.ts = bpf_ktime_get_ns();
				129	val.fp = filp;
				130	val.offset = 0;
				131	if (val.fp)
				132	entryinfo.update(&pid, &val);
				133
				134	return 0;
				135	}
				136
				137	// zpl_fsync():
				138	int trace_fsync_entry(struct pt_regs ctx, struct file filp)
				139	{
				140	u32 pid;
				141	pid = bpf_get_current_pid_tgid();
				142	if (FILTER_PID)
				143	return 0;
				144
				145	// store filp and timestamp by pid
				146	struct val_t val = {};
				147	val.ts = bpf_ktime_get_ns();
				148	val.fp = filp;
				149	val.offset = 0;
				150	if (val.fp)
				151	entryinfo.update(&pid, &val);
				152
				153	return 0;
				154	}
				155
				156	//
				157	// Output
				158	//
				159
				160	static int trace_return(struct pt_regs *ctx, int type)
				161	{
				162	struct val_t *valp;
				163	u32 pid = bpf_get_current_pid_tgid();
				164
				165	valp = entryinfo.lookup(&pid);
				166	if (valp == 0) {
				167	// missed tracing issue or filtered
				168	return 0;
				169	}
				170
				171	// calculate delta
				172	u64 ts = bpf_ktime_get_ns();
				173	u64 delta_us = (ts - valp->ts) / 1000;
				174	entryinfo.delete(&pid);
				175	if (FILTER_US)
				176	return 0;
				177
				178	// workaround (rewriter should handle file to d_iname in one step):
				179	struct dentry *de = NULL;
				180	bpf_probe_read(&de, sizeof(de), &valp->fp->f_path.dentry);
				181
				182	// populate output struct
				183	u32 size = ctx->ax;
				184	struct data_t data = {.type = type, .size = size, .delta_us = delta_us,
				185	.pid = pid};
				186	data.ts_us = ts / 1000;
				187	data.offset = valp->offset;
				188	bpf_probe_read(&data.file, sizeof(data.file), de->d_iname);
				189	bpf_get_current_comm(&data.task, sizeof(data.task));
				190
				191	events.perf_submit(ctx, &data, sizeof(data));
				192
				193	return 0;
				194	}
				195
				196	int trace_read_return(struct pt_regs *ctx)
				197	{
				198	return trace_return(ctx, TRACE_READ);
				199	}
				200
				201	int trace_write_return(struct pt_regs *ctx)
				202	{
				203	return trace_return(ctx, TRACE_WRITE);
				204	}
				205
				206	int trace_open_return(struct pt_regs *ctx)
				207	{
				208	return trace_return(ctx, TRACE_OPEN);
				209	}
				210
				211	int trace_fsync_return(struct pt_regs *ctx)
				212	{
				213	return trace_return(ctx, TRACE_FSYNC);
				214	}
				215
				216	"""
				# Specialise the BPF source for this run by swapping the two filter
				# placeholders for concrete C conditions ('0' means "never skip").
				us_filter = '0' if min_ms == 0 else ('delta_us <= %s' % str(min_ms * 1000))
				pid_filter = ('pid != %s' % pid) if args.pid else '0'
				bpf_text = bpf_text.replace('FILTER_US', us_filter)
				bpf_text = bpf_text.replace('FILTER_PID', pid_filter)
				if debug:
				    print(bpf_text)
				228
				# kernel->user event data: struct data_t
				DNAME_INLINE_LEN = 32   # linux/dcache.h
				TASK_COMM_LEN = 16      # linux/sched.h
				_U64 = ct.c_ulonglong   # every numeric field of the record is a C u64


				class Data(ct.Structure):
				    """ctypes view of one event record handed to the perf buffer callback.

				    Field names and order follow struct data_t in the BPF program.
				    """
				    _fields_ = [
				        ("ts_us", _U64),
				        ("type", _U64),
				        ("size", _U64),
				        ("offset", _U64),
				        ("delta_us", _U64),
				        ("pid", _U64),
				        ("task", ct.c_char * TASK_COMM_LEN),
				        ("file", ct.c_char * DNAME_INLINE_LEN),
				    ]
				243
				# process event
				def print_event(cpu, data, size):
				    """Perf-buffer callback: decode one event record and print it.

				    cpu and size are passed in by the perf buffer reader and are not used
				    here; data points at a record laid out like the Data structure. Output
				    is one CSV line when the csv flag is set, otherwise one table row.
				    """
				    event = ct.cast(data, ct.POINTER(Data)).contents

				    # Map the numeric operation code to its one-letter tag; anything that is
				    # not write/open/sync is shown as a read, as before. (Named op rather
				    # than type so the builtin is not shadowed.)
				    op = {1: 'W', 2: 'O', 3: 'S'}.get(event.type, 'R')

				    # c_char array fields come back as bytes; decode them so Python 3 prints
				    # the name itself rather than its b'...' repr.
				    task = event.task.decode('utf-8', 'replace')
				    filename = event.file.decode('utf-8', 'replace')

				    if csv:
				        print("%d,%s,%d,%s,%d,%d,%d,%s" % (
				            event.ts_us, task, event.pid, op, event.size,
				            event.offset, event.delta_us, filename))
				        return
				    # offset is shown in whole KB; // keeps that integral on Python 2 and 3
				    print("%-8s %-14.14s %-6s %1s %-7s %-8d %7.2f %s" % (strftime("%H:%M:%S"),
				        task, event.pid, op, event.size, event.offset // 1024,
				        float(event.delta_us) / 1000, filename))
				264
				# initialize BPF
				b = BPF(text=bpf_text)

				# common file functions: all entry probes first, then the return probes
				for func, handler in (
				        ("zpl_read", "trace_rw_entry"),
				        ("zpl_write", "trace_rw_entry"),
				        ("zpl_open", "trace_open_entry"),
				        ("zpl_fsync", "trace_fsync_entry")):
				    b.attach_kprobe(event=func, fn_name=handler)
				for func, handler in (
				        ("zpl_read", "trace_read_return"),
				        ("zpl_write", "trace_write_return"),
				        ("zpl_open", "trace_open_return"),
				        ("zpl_fsync", "trace_fsync_return")):
				    b.attach_kretprobe(event=func, fn_name=handler)
				277
				# header: CSV mode gets a single field-name line; table mode gets a banner
				# describing the threshold followed by the column titles
				if csv:
				    print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE")
				else:
				    banner = "Tracing ZFS operations"
				    if min_ms != 0:
				        banner = "Tracing ZFS operations slower than %d ms" % min_ms
				    print(banner)
				    columns = ("TIME", "COMM", "PID", "T", "BYTES", "OFF_KB", "LAT(ms)",
				        "FILENAME")
				    print("%-8s %-14s %-6s %1s %-7s %-8s %7s %s" % columns)
				288
				# read events
				b["events"].open_perf_buffer(print_event)
				# Prefer perf_buffer_poll() where the installed bcc provides it, and fall
				# back to the older kprobe_poll() name otherwise, so either release works.
				poll = getattr(b, "perf_buffer_poll", None) or b.kprobe_poll
				while 1:
				    try:
				        poll()
				    except KeyboardInterrupt:
				        # Ctrl-C is the normal way to stop tracing: leave quietly instead
				        # of dumping a traceback
				        exit()