| #!/usr/bin/env python |
| # |
| # deadlock_detector Detects potential deadlocks (lock order inversions) |
| # on a running process. For Linux, uses BCC, eBPF. |
| # |
| # USAGE: deadlock_detector.py [-h] [--binary BINARY] [--dump-graph DUMP_GRAPH] |
| # [--verbose] [--lock-symbols LOCK_SYMBOLS] |
| # [--unlock-symbols UNLOCK_SYMBOLS] |
| # pid |
| # |
| # This traces pthread mutex lock and unlock calls to build a directed graph |
| # representing the mutex wait graph: |
| # |
| # - Nodes in the graph represent mutexes. |
| # - Edge (A, B) exists if there exists some thread T where lock(A) was called |
| # and lock(B) was called before unlock(A) was called. |
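#
# For example, if thread T1 runs lock(A); lock(B); unlock(B); unlock(A), and
# thread T2 runs lock(B); lock(A), the graph contains both edge (A, B) and
# edge (B, A), forming a cycle that indicates a potential lock order
# inversion.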
| # |
| # If the program finds a potential lock order inversion, the program will dump |
| # the cycle of mutexes and the stack traces where each mutex was acquired, and |
| # then exit. |
| # |
| # This program can only find potential deadlocks that occur while the program |
| # is tracing the process. It cannot find deadlocks that may have occurred |
| # before the program was attached to the process. |
| # |
| # Since this traces all mutex lock and unlock events and all thread creation |
| # events on the traced process, the overhead of this bpf program can be very |
| # high if the process has many threads and mutexes. You should only run this on |
| # a process where the slowdown is acceptable. |
| # |
| # Note: This tool does not work for shared mutexes or recursive mutexes. |
| # |
# For shared (read-write) mutexes, a deadlock requires a cycle in the wait
# graph where at least one of the mutexes in the cycle is acquired for
# exclusive (write) ownership.
| # |
| # For recursive mutexes, lock() is called multiple times on the same mutex. |
| # However, there is no way to determine if a mutex is a recursive mutex |
| # after the mutex has been created. As a result, this tool will not find |
| # potential deadlocks that involve only one mutex. |
| # |
| # Copyright 2017 Facebook, Inc. |
| # Licensed under the Apache License, Version 2.0 (the "License") |
| # |
| # 01-Feb-2017 Kenny Yu Created this. |
| |
| from __future__ import ( |
| absolute_import, division, unicode_literals, print_function |
| ) |
| from bcc import BPF |
| from collections import defaultdict |
| import argparse |
| import json |
| import os |
| import subprocess |
| import sys |
| import time |
| |
| |
| class DiGraph(object): |
| ''' |
| Adapted from networkx: http://networkx.github.io/ |
| Represents a directed graph. Edges can store (key, value) attributes. |
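
    Example (illustrative):
        graph = DiGraph()
        graph.add_edge('a', 'b', weight=1)
        graph.neighbors('a')        # {'b'}
        graph.attributes('a', 'b')  # {'weight': 1}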
| ''' |
| |
| def __init__(self): |
| # Map of node -> set of nodes |
| self.adjacency_map = {} |
        # Map of (node1, node2) -> dict of attribute name -> attribute value.
        # These attributes are not copied in subgraph().
| self.attributes_map = {} |
| |
| def neighbors(self, node): |
| return self.adjacency_map.get(node, set()) |
| |
| def edges(self): |
| edges = [] |
| for node, neighbors in self.adjacency_map.items(): |
| for neighbor in neighbors: |
| edges.append((node, neighbor)) |
| return edges |
| |
| def nodes(self): |
| return self.adjacency_map.keys() |
| |
| def attributes(self, node1, node2): |
| return self.attributes_map[(node1, node2)] |
| |
| def add_edge(self, node1, node2, **kwargs): |
| if node1 not in self.adjacency_map: |
| self.adjacency_map[node1] = set() |
| if node2 not in self.adjacency_map: |
| self.adjacency_map[node2] = set() |
| self.adjacency_map[node1].add(node2) |
| self.attributes_map[(node1, node2)] = kwargs |
| |
| def remove_node(self, node): |
| self.adjacency_map.pop(node, None) |
        for neighbors in self.adjacency_map.values():
| neighbors.discard(node) |
| |
| def subgraph(self, nodes): |
| graph = DiGraph() |
| for node in nodes: |
| for neighbor in self.neighbors(node): |
| if neighbor in nodes: |
| graph.add_edge(node, neighbor) |
| return graph |
| |
| def node_link_data(self): |
| ''' |
| Returns the graph as a dictionary in a format that can be |
| serialized. |
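
        For example, for a graph with a single edge ('a', 'b') carrying
        attribute x=1, this returns roughly:
            {'directed': True, 'multigraph': False, 'graph': {},
             'nodes': [{'id': 'a'}, {'id': 'b'}],
             'links': [{'x': 1, 'source': 0, 'target': 1}]}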
| ''' |
| data = { |
| 'directed': True, |
| 'multigraph': False, |
| 'graph': {}, |
| 'links': [], |
| 'nodes': [], |
| } |
| |
| # Do one pass to build a map of node -> position in nodes |
| node_to_number = {} |
| for node in self.adjacency_map.keys(): |
| node_to_number[node] = len(data['nodes']) |
| data['nodes'].append({'id': node}) |
| |
| # Do another pass to build the link information |
| for node, neighbors in self.adjacency_map.items(): |
| for neighbor in neighbors: |
| link = self.attributes_map[(node, neighbor)].copy() |
| link['source'] = node_to_number[node] |
| link['target'] = node_to_number[neighbor] |
| data['links'].append(link) |
| return data |
| |
| |
| def strongly_connected_components(G): |
| ''' |
| Adapted from networkx: http://networkx.github.io/ |
| Parameters |
| ---------- |
| G : DiGraph |
| Returns |
| ------- |
| comp : generator of sets |
| A generator of sets of nodes, one for each strongly connected |
| component of G. |
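
    Example (illustrative): for a graph with edges (1, 2), (2, 1), and
    (2, 3), the strongly connected components are {1, 2} and {3}.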
| ''' |
| preorder = {} |
| lowlink = {} |
| scc_found = {} |
| scc_queue = [] |
| i = 0 # Preorder counter |
| for source in G.nodes(): |
| if source not in scc_found: |
| queue = [source] |
| while queue: |
| v = queue[-1] |
| if v not in preorder: |
| i = i + 1 |
| preorder[v] = i |
| done = 1 |
| v_nbrs = G.neighbors(v) |
| for w in v_nbrs: |
| if w not in preorder: |
| queue.append(w) |
| done = 0 |
| break |
| if done == 1: |
| lowlink[v] = preorder[v] |
| for w in v_nbrs: |
| if w not in scc_found: |
| if preorder[w] > preorder[v]: |
| lowlink[v] = min([lowlink[v], lowlink[w]]) |
| else: |
| lowlink[v] = min([lowlink[v], preorder[w]]) |
| queue.pop() |
| if lowlink[v] == preorder[v]: |
| scc_found[v] = True |
| scc = {v} |
| while ( |
| scc_queue and preorder[scc_queue[-1]] > preorder[v] |
| ): |
| k = scc_queue.pop() |
| scc_found[k] = True |
| scc.add(k) |
| yield scc |
| else: |
| scc_queue.append(v) |
| |
| |
| def simple_cycles(G): |
| ''' |
| Adapted from networkx: http://networkx.github.io/ |
| Parameters |
| ---------- |
| G : DiGraph |
| Returns |
| ------- |
| cycle_generator: generator |
| A generator that produces elementary cycles of the graph. |
| Each cycle is represented by a list of nodes along the cycle. |
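
    Example (illustrative): for a graph with edges (1, 2), (2, 3), (3, 1),
    and (2, 1), the elementary cycles are [1, 2] and [1, 2, 3] (each possibly
    rotated or listed in a different order).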
| ''' |
| |
| def _unblock(thisnode, blocked, B): |
| stack = set([thisnode]) |
| while stack: |
| node = stack.pop() |
| if node in blocked: |
| blocked.remove(node) |
| stack.update(B[node]) |
| B[node].clear() |
| |
    # Johnson's algorithm requires some ordering of the nodes.
    # We assign the arbitrary ordering given by the strongly connected
    # components. There is no need to track the ordering, as each node is
    # removed once it is processed. We save a copy of the actual graph so
    # we can mutate it here; we only take the edges because we do not want
    # to copy edge and node attributes.
| subG = G.subgraph(G.nodes()) |
| sccs = list(strongly_connected_components(subG)) |
| while sccs: |
| scc = sccs.pop() |
| # order of scc determines ordering of nodes |
| startnode = scc.pop() |
| # Processing node runs 'circuit' routine from recursive version |
| path = [startnode] |
| blocked = set() # vertex: blocked from search? |
| closed = set() # nodes involved in a cycle |
| blocked.add(startnode) |
| B = defaultdict(set) # graph portions that yield no elementary circuit |
| stack = [(startnode, list(subG.neighbors(startnode)))] |
| while stack: |
| thisnode, nbrs = stack[-1] |
| if nbrs: |
| nextnode = nbrs.pop() |
| if nextnode == startnode: |
| yield path[:] |
| closed.update(path) |
| elif nextnode not in blocked: |
| path.append(nextnode) |
| stack.append((nextnode, list(subG.neighbors(nextnode)))) |
| closed.discard(nextnode) |
| blocked.add(nextnode) |
| continue |
| # done with nextnode... look for more neighbors |
| if not nbrs: # no more nbrs |
| if thisnode in closed: |
| _unblock(thisnode, blocked, B) |
| else: |
| for nbr in subG.neighbors(thisnode): |
| if thisnode not in B[nbr]: |
| B[nbr].add(thisnode) |
| stack.pop() |
| path.pop() |
| # done processing this node |
| subG.remove_node(startnode) |
| H = subG.subgraph(scc) # make smaller to avoid work in SCC routine |
| sccs.extend(list(strongly_connected_components(H))) |
| |
| |
| def find_cycle(graph): |
| ''' |
| Looks for a cycle in the graph. If found, returns the first cycle. |
| If nodes a1, a2, ..., an are in a cycle, then this returns: |
| [(a1,a2), (a2,a3), ... (an-1,an), (an, a1)] |
| Otherwise returns an empty list. |
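
    Example (illustrative): for a graph whose only cycle is 1 -> 2 -> 3 -> 1,
    this returns [(1, 2), (2, 3), (3, 1)] (possibly starting at a different
    node of the cycle).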
| ''' |
| cycles = list(simple_cycles(graph)) |
| if cycles: |
| nodes = cycles[0] |
| nodes.append(nodes[0]) |
| edges = [] |
| prev = nodes[0] |
| for node in nodes[1:]: |
| edges.append((prev, node)) |
| prev = node |
| return edges |
| else: |
| return [] |
| |
| |
| def print_cycle(binary, graph, edges, thread_info, print_stack_trace_fn): |
| ''' |
| Prints the cycle in the mutex graph in the following format: |
| |
| Potential Deadlock Detected! |
| |
| Cycle in lock order graph: M0 => M1 => M2 => M0 |
| |
| for (m, n) in cycle: |
| Mutex n acquired here while holding Mutex m in thread T: |
| [ stack trace ] |
| |
| Mutex m previously acquired by thread T here: |
| [ stack trace ] |
| |
| for T in all threads: |
| Thread T was created here: |
| [ stack trace ] |
| ''' |
| |
| # List of mutexes in the cycle, first and last repeated |
| nodes_in_order = [] |
| # Map mutex address -> readable alias |
| node_addr_to_name = {} |
| for counter, (m, n) in enumerate(edges): |
| nodes_in_order.append(m) |
| # For global or static variables, try to symbolize the mutex address. |
| symbol = symbolize_with_objdump(binary, m) |
| if symbol: |
| symbol += ' ' |
| node_addr_to_name[m] = 'Mutex M%d (%s0x%016x)' % (counter, symbol, m) |
| nodes_in_order.append(nodes_in_order[0]) |
| |
| print('----------------\nPotential Deadlock Detected!\n') |
| print( |
| 'Cycle in lock order graph: %s\n' % |
| (' => '.join([node_addr_to_name[n] for n in nodes_in_order])) |
| ) |
| |
| # Set of threads involved in the lock inversion |
| thread_pids = set() |
| |
| # For each edge in the cycle, print where the two mutexes were held |
| for (m, n) in edges: |
| thread_pid = graph.attributes(m, n)['thread_pid'] |
| thread_comm = graph.attributes(m, n)['thread_comm'] |
| first_mutex_stack_id = graph.attributes(m, n)['first_mutex_stack_id'] |
| second_mutex_stack_id = graph.attributes(m, n)['second_mutex_stack_id'] |
| thread_pids.add(thread_pid) |
| print( |
| '%s acquired here while holding %s in Thread %d (%s):' % ( |
| node_addr_to_name[n], node_addr_to_name[m], thread_pid, |
| thread_comm |
| ) |
| ) |
| print_stack_trace_fn(second_mutex_stack_id) |
| print('') |
| print( |
| '%s previously acquired by the same Thread %d (%s) here:' % |
| (node_addr_to_name[m], thread_pid, thread_comm) |
| ) |
| print_stack_trace_fn(first_mutex_stack_id) |
| print('') |
| |
| # Print where the threads were created, if available |
| for thread_pid in thread_pids: |
| parent_pid, stack_id, parent_comm = thread_info.get( |
| thread_pid, (None, None, None) |
| ) |
| if parent_pid: |
| print( |
| 'Thread %d created by Thread %d (%s) here: ' % |
| (thread_pid, parent_pid, parent_comm) |
| ) |
| print_stack_trace_fn(stack_id) |
| else: |
| print( |
| 'Could not find stack trace where Thread %d was created' % |
| thread_pid |
| ) |
| print('') |
| |
| |
| def symbolize_with_objdump(binary, addr): |
| ''' |
| Searches the binary for the address using objdump. Returns the symbol if |
| it is found, otherwise returns empty string. |
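
    For example, if the symbol table printed by objdump contains a line
    roughly like
        00000000006020c0 g     O .bss   0000000000000028  my_global_mutex
    for the given address, this returns 'my_global_mutex'.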
| ''' |
| try: |
| command = ( |
| 'objdump -tT %s | grep %x | awk {\'print $NF\'} | c++filt' % |
| (binary, addr) |
| ) |
| output = subprocess.check_output(command, shell=True) |
| return output.decode('utf-8').strip() |
| except subprocess.CalledProcessError: |
| return '' |
| |
| |
| def strlist(s): |
| '''Given a comma-separated string, returns a list of substrings''' |
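    # e.g. strlist('lock_a,lock_b') -> ['lock_a', 'lock_b']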
| return s.strip().split(',') |
| |
| |
| def main(): |
| examples = '''Examples: |
| deadlock_detector 181 # Analyze PID 181 |
| |
| deadlock_detector 181 --binary /lib/x86_64-linux-gnu/libpthread.so.0 |
| # Analyze PID 181 and locks from this binary. |
| # If tracing a process that is running from |
| # a dynamically-linked binary, this argument |
| # is required and should be the path to the |
| # pthread library. |
| |
| deadlock_detector 181 --verbose |
| # Analyze PID 181 and print statistics about |
| # the mutex wait graph. |
| |
| deadlock_detector 181 --lock-symbols my_mutex_lock1,my_mutex_lock2 \\ |
| --unlock-symbols my_mutex_unlock1,my_mutex_unlock2 |
| # Analyze PID 181 and trace custom mutex |
| # symbols instead of pthread mutexes. |
| |
| deadlock_detector 181 --dump-graph graph.json |
| # Analyze PID 181 and dump the mutex wait |
| # graph to graph.json. |
| ''' |
| parser = argparse.ArgumentParser( |
| description=( |
| 'Detect potential deadlocks (lock inversions) in a running binary.' |
| '\nMust be run as root.' |
| ), |
| formatter_class=argparse.RawDescriptionHelpFormatter, |
| epilog=examples, |
| ) |
| parser.add_argument('pid', type=int, help='Pid to trace') |
| # Binaries with `:` in the path will fail to attach uprobes on kernels |
| # running without this patch: https://lkml.org/lkml/2017/1/13/585. |
| # Symlinks to the binary without `:` in the path can get around this issue. |
| parser.add_argument( |
| '--binary', |
| type=str, |
| default='', |
| help='If set, trace the mutexes from the binary at this path. ' |
| 'For statically-linked binaries, this argument is not required. ' |
| 'For dynamically-linked binaries, this argument is required and ' |
| 'should be the path of the pthread library the binary is using. ' |
| 'Example: /lib/x86_64-linux-gnu/libpthread.so.0', |
| ) |
| parser.add_argument( |
| '--dump-graph', |
| type=str, |
| default='', |
| help='If set, this will dump the mutex graph to the specified file.', |
| ) |
| parser.add_argument( |
| '--verbose', |
| action='store_true', |
| help='Print statistics about the mutex wait graph.', |
| ) |
| parser.add_argument( |
| '--lock-symbols', |
| type=strlist, |
| default=['pthread_mutex_lock'], |
| help='Comma-separated list of lock symbols to trace. Default is ' |
| 'pthread_mutex_lock. These symbols cannot be inlined in the binary.', |
| ) |
| parser.add_argument( |
| '--unlock-symbols', |
| type=strlist, |
| default=['pthread_mutex_unlock'], |
| help='Comma-separated list of unlock symbols to trace. Default is ' |
| 'pthread_mutex_unlock. These symbols cannot be inlined in the binary.', |
| ) |
| args = parser.parse_args() |
| if not args.binary: |
| try: |
| args.binary = os.readlink('/proc/%d/exe' % args.pid) |
| except OSError as e: |
| print('%s. Is the process (pid=%d) running?' % (str(e), args.pid)) |
| sys.exit(1) |
| |
| bpf = BPF(src_file='deadlock_detector.c') |
| |
| # Trace where threads are created |
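    # Note: if the kernel does not expose the syscall entry point as
    # 'sys_clone' (newer kernels prefix it per-architecture), resolving the
    # name via bpf.get_syscall_fnname('clone') may be needed instead.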
| bpf.attach_kretprobe(event='sys_clone', fn_name='trace_clone') |
| |
    # We must attach the probe on unlock() first; otherwise, in the window
    # after the lock() probe has been attached but before the unlock() probe
    # has, a thread could acquire and release mutexes without the release
    # events being traced, resulting in noisy reports.
| for symbol in args.unlock_symbols: |
| try: |
| bpf.attach_uprobe( |
| name=args.binary, |
| sym=symbol, |
| fn_name='trace_mutex_release', |
| pid=args.pid, |
| ) |
| except Exception as e: |
| print('%s. Failed to attach to symbol: %s' % (str(e), symbol)) |
| sys.exit(1) |
| for symbol in args.lock_symbols: |
| try: |
| bpf.attach_uprobe( |
| name=args.binary, |
| sym=symbol, |
| fn_name='trace_mutex_acquire', |
| pid=args.pid, |
| ) |
| except Exception as e: |
| print('%s. Failed to attach to symbol: %s' % (str(e), symbol)) |
| sys.exit(1) |
| |
| def print_stack_trace(stack_id): |
| '''Closure that prints the symbolized stack trace.''' |
| for addr in bpf.get_table('stack_traces').walk(stack_id): |
| line = bpf.sym(addr, args.pid) |
| # Try to symbolize with objdump if we cannot with bpf. |
| if line == '[unknown]': |
| symbol = symbolize_with_objdump(args.binary, addr) |
| if symbol: |
| line = symbol |
| print('@ %016x %s' % (addr, line)) |
| |
| print('Tracing... Hit Ctrl-C to end.') |
| while True: |
| try: |
| # Map of child thread pid -> parent info |
| thread_info = { |
| child.value: (parent.parent_pid, parent.stack_id, parent.comm) |
| for child, parent in bpf.get_table('thread_to_parent').items() |
| } |
| |
| # Mutex wait directed graph. Nodes are mutexes. Edge (A,B) exists |
| # if there exists some thread T where lock(A) was called and |
| # lock(B) was called before unlock(A) was called. |
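            # The 'edges' table is populated by deadlock_detector.c; each key
            # holds the pair of mutex addresses, and each leaf holds the
            # thread pid and comm plus the stack ids where each mutex was
            # acquired.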
| graph = DiGraph() |
| for key, leaf in bpf.get_table('edges').items(): |
| graph.add_edge( |
| key.mutex1, |
| key.mutex2, |
| thread_pid=leaf.thread_pid, |
| thread_comm=leaf.comm.decode('utf-8'), |
| first_mutex_stack_id=leaf.mutex1_stack_id, |
| second_mutex_stack_id=leaf.mutex2_stack_id, |
| ) |
| if args.verbose: |
| print( |
| 'Mutexes: %d, Edges: %d' % |
| (len(graph.nodes()), len(graph.edges())) |
| ) |
| if args.dump_graph: |
| with open(args.dump_graph, 'w') as f: |
| data = graph.node_link_data() |
| f.write(json.dumps(data, indent=2)) |
| |
| cycle = find_cycle(graph) |
| if cycle: |
| print_cycle( |
| args.binary, graph, cycle, thread_info, print_stack_trace |
| ) |
| sys.exit(1) |
| |
| time.sleep(1) |
| except KeyboardInterrupt: |
| break |
| |
| |
| if __name__ == '__main__': |
| main() |