blob: cbc06912f52d5e5709253203e08389cd35c24a6d [file] [log] [blame]
Kenny Yu66fb4d22017-02-02 10:05:31 -08001#!/usr/bin/env python
2#
3# deadlock_detector Detects potential deadlocks (lock order inversions)
4# on a running process. For Linux, uses BCC, eBPF.
5#
Kenny Yue7dff432017-02-03 09:39:39 -08006# USAGE: deadlock_detector.py [-h] [--binary BINARY] [--dump-graph DUMP_GRAPH]
7# [--verbose] [--lock-symbols LOCK_SYMBOLS]
Kenny Yu66fb4d22017-02-02 10:05:31 -08008# [--unlock-symbols UNLOCK_SYMBOLS]
Kenny Yue7dff432017-02-03 09:39:39 -08009# pid
Kenny Yu66fb4d22017-02-02 10:05:31 -080010#
11# This traces pthread mutex lock and unlock calls to build a directed graph
12# representing the mutex wait graph:
13#
14# - Nodes in the graph represent mutexes.
15# - Edge (A, B) exists if there exists some thread T where lock(A) was called
16# and lock(B) was called before unlock(A) was called.
17#
18# If the program finds a potential lock order inversion, the program will dump
19# the cycle of mutexes and the stack traces where each mutex was acquired, and
20# then exit.
21#
Kenny Yue7dff432017-02-03 09:39:39 -080022# This program can only find potential deadlocks that occur while the program
23# is tracing the process. It cannot find deadlocks that may have occurred
24# before the program was attached to the process.
25#
26# Since this traces all mutex lock and unlock events and all thread creation
27# events on the traced process, the overhead of this bpf program can be very
28# high if the process has many threads and mutexes. You should only run this on
29# a process where the slowdown is acceptable.
30#
Kenny Yu66fb4d22017-02-02 10:05:31 -080031# Note: This tool does not work for shared mutexes or recursive mutexes.
32#
33# For shared (read-write) mutexes, a deadlock requires a cycle in the wait
34# graph where at least one of the mutexes in the cycle is acquiring exclusive
35# (write) ownership.
36#
37# For recursive mutexes, lock() is called multiple times on the same mutex.
38# However, there is no way to determine if a mutex is a recursive mutex
39# after the mutex has been created. As a result, this tool will not find
40# potential deadlocks that involve only one mutex.
41#
42# Copyright 2017 Facebook, Inc.
43# Licensed under the Apache License, Version 2.0 (the "License")
44#
45# 01-Feb-2017 Kenny Yu Created this.
46
47from __future__ import (
48 absolute_import, division, unicode_literals, print_function
49)
50from bcc import BPF
Kenny Yue7dff432017-02-03 09:39:39 -080051from collections import defaultdict
Kenny Yu66fb4d22017-02-02 10:05:31 -080052import argparse
53import json
Kenny Yue7dff432017-02-03 09:39:39 -080054import os
55import subprocess
Kenny Yu66fb4d22017-02-02 10:05:31 -080056import sys
57import time
58
59
Kenny Yue7dff432017-02-03 09:39:39 -080060class DiGraph(object):
61 '''
62 Adapted from networkx: http://networkx.github.io/
63 Represents a directed graph. Edges can store (key, value) attributes.
64 '''
65
66 def __init__(self):
67 # Map of node -> set of nodes
68 self.adjacency_map = {}
69 # Map of (node1, node2) -> map string -> arbitrary attribute
70 # This will not be copied in subgraph()
71 self.attributes_map = {}
72
73 def neighbors(self, node):
74 return self.adjacency_map.get(node, set())
75
76 def edges(self):
77 edges = []
78 for node, neighbors in self.adjacency_map.items():
79 for neighbor in neighbors:
80 edges.append((node, neighbor))
81 return edges
82
83 def nodes(self):
84 return self.adjacency_map.keys()
85
86 def attributes(self, node1, node2):
87 return self.attributes_map[(node1, node2)]
88
89 def add_edge(self, node1, node2, **kwargs):
90 if node1 not in self.adjacency_map:
91 self.adjacency_map[node1] = set()
92 if node2 not in self.adjacency_map:
93 self.adjacency_map[node2] = set()
94 self.adjacency_map[node1].add(node2)
95 self.attributes_map[(node1, node2)] = kwargs
96
97 def remove_node(self, node):
98 self.adjacency_map.pop(node, None)
99 for _, neighbors in self.adjacency_map.items():
100 neighbors.discard(node)
101
102 def subgraph(self, nodes):
103 graph = DiGraph()
104 for node in nodes:
105 for neighbor in self.neighbors(node):
106 if neighbor in nodes:
107 graph.add_edge(node, neighbor)
108 return graph
109
110 def node_link_data(self):
111 '''
112 Returns the graph as a dictionary in a format that can be
113 serialized.
114 '''
115 data = {
116 'directed': True,
117 'multigraph': False,
118 'graph': {},
119 'links': [],
120 'nodes': [],
121 }
122
123 # Do one pass to build a map of node -> position in nodes
124 node_to_number = {}
125 for node in self.adjacency_map.keys():
126 node_to_number[node] = len(data['nodes'])
127 data['nodes'].append({'id': node})
128
129 # Do another pass to build the link information
130 for node, neighbors in self.adjacency_map.items():
131 for neighbor in neighbors:
132 link = self.attributes_map[(node, neighbor)].copy()
133 link['source'] = node_to_number[node]
134 link['target'] = node_to_number[neighbor]
135 data['links'].append(link)
136 return data
137
138
139def strongly_connected_components(G):
140 '''
141 Adapted from networkx: http://networkx.github.io/
142 Parameters
143 ----------
144 G : DiGraph
145 Returns
146 -------
147 comp : generator of sets
148 A generator of sets of nodes, one for each strongly connected
149 component of G.
150 '''
151 preorder = {}
152 lowlink = {}
153 scc_found = {}
154 scc_queue = []
155 i = 0 # Preorder counter
156 for source in G.nodes():
157 if source not in scc_found:
158 queue = [source]
159 while queue:
160 v = queue[-1]
161 if v not in preorder:
162 i = i + 1
163 preorder[v] = i
164 done = 1
165 v_nbrs = G.neighbors(v)
166 for w in v_nbrs:
167 if w not in preorder:
168 queue.append(w)
169 done = 0
170 break
171 if done == 1:
172 lowlink[v] = preorder[v]
173 for w in v_nbrs:
174 if w not in scc_found:
175 if preorder[w] > preorder[v]:
176 lowlink[v] = min([lowlink[v], lowlink[w]])
177 else:
178 lowlink[v] = min([lowlink[v], preorder[w]])
179 queue.pop()
180 if lowlink[v] == preorder[v]:
181 scc_found[v] = True
182 scc = {v}
183 while (
184 scc_queue and preorder[scc_queue[-1]] > preorder[v]
185 ):
186 k = scc_queue.pop()
187 scc_found[k] = True
188 scc.add(k)
189 yield scc
190 else:
191 scc_queue.append(v)
192
193
194def simple_cycles(G):
195 '''
196 Adapted from networkx: http://networkx.github.io/
197 Parameters
198 ----------
199 G : DiGraph
200 Returns
201 -------
202 cycle_generator: generator
203 A generator that produces elementary cycles of the graph.
204 Each cycle is represented by a list of nodes along the cycle.
205 '''
206
207 def _unblock(thisnode, blocked, B):
208 stack = set([thisnode])
209 while stack:
210 node = stack.pop()
211 if node in blocked:
212 blocked.remove(node)
213 stack.update(B[node])
214 B[node].clear()
215
216 # Johnson's algorithm requires some ordering of the nodes.
217 # We assign the arbitrary ordering given by the strongly connected comps
218 # There is no need to track the ordering as each node removed as processed.
219 # save the actual graph so we can mutate it here
220 # We only take the edges because we do not want to
221 # copy edge and node attributes here.
222 subG = G.subgraph(G.nodes())
223 sccs = list(strongly_connected_components(subG))
224 while sccs:
225 scc = sccs.pop()
226 # order of scc determines ordering of nodes
227 startnode = scc.pop()
228 # Processing node runs 'circuit' routine from recursive version
229 path = [startnode]
230 blocked = set() # vertex: blocked from search?
231 closed = set() # nodes involved in a cycle
232 blocked.add(startnode)
233 B = defaultdict(set) # graph portions that yield no elementary circuit
234 stack = [(startnode, list(subG.neighbors(startnode)))]
235 while stack:
236 thisnode, nbrs = stack[-1]
237 if nbrs:
238 nextnode = nbrs.pop()
239 if nextnode == startnode:
240 yield path[:]
241 closed.update(path)
242 elif nextnode not in blocked:
243 path.append(nextnode)
244 stack.append((nextnode, list(subG.neighbors(nextnode))))
245 closed.discard(nextnode)
246 blocked.add(nextnode)
247 continue
248 # done with nextnode... look for more neighbors
249 if not nbrs: # no more nbrs
250 if thisnode in closed:
251 _unblock(thisnode, blocked, B)
252 else:
253 for nbr in subG.neighbors(thisnode):
254 if thisnode not in B[nbr]:
255 B[nbr].add(thisnode)
256 stack.pop()
257 path.pop()
258 # done processing this node
259 subG.remove_node(startnode)
260 H = subG.subgraph(scc) # make smaller to avoid work in SCC routine
261 sccs.extend(list(strongly_connected_components(H)))
262
263
Kenny Yu66fb4d22017-02-02 10:05:31 -0800264def find_cycle(graph):
265 '''
266 Looks for a cycle in the graph. If found, returns the first cycle.
267 If nodes a1, a2, ..., an are in a cycle, then this returns:
Kenny Yue7dff432017-02-03 09:39:39 -0800268 [(a1,a2), (a2,a3), ... (an-1,an), (an, a1)]
Kenny Yu66fb4d22017-02-02 10:05:31 -0800269 Otherwise returns an empty list.
270 '''
Kenny Yue7dff432017-02-03 09:39:39 -0800271 cycles = list(simple_cycles(graph))
Kenny Yu66fb4d22017-02-02 10:05:31 -0800272 if cycles:
273 nodes = cycles[0]
274 nodes.append(nodes[0])
275 edges = []
276 prev = nodes[0]
277 for node in nodes[1:]:
278 edges.append((prev, node))
279 prev = node
280 return edges
281 else:
282 return []
283
284
Kenny Yue7dff432017-02-03 09:39:39 -0800285def print_cycle(binary, graph, edges, thread_info, print_stack_trace_fn):
Kenny Yu66fb4d22017-02-02 10:05:31 -0800286 '''
Kenny Yue7dff432017-02-03 09:39:39 -0800287 Prints the cycle in the mutex graph in the following format:
Kenny Yu66fb4d22017-02-02 10:05:31 -0800288
289 Potential Deadlock Detected!
290
291 Cycle in lock order graph: M0 => M1 => M2 => M0
292
293 for (m, n) in cycle:
294 Mutex n acquired here while holding Mutex m in thread T:
295 [ stack trace ]
296
297 Mutex m previously acquired by thread T here:
298 [ stack trace ]
299
300 for T in all threads:
301 Thread T was created here:
302 [ stack trace ]
303 '''
Kenny Yu66fb4d22017-02-02 10:05:31 -0800304
305 # List of mutexes in the cycle, first and last repeated
306 nodes_in_order = []
307 # Map mutex address -> readable alias
308 node_addr_to_name = {}
309 for counter, (m, n) in enumerate(edges):
310 nodes_in_order.append(m)
Kenny Yue7dff432017-02-03 09:39:39 -0800311 # For global or static variables, try to symbolize the mutex address.
312 symbol = symbolize_with_objdump(binary, m)
313 if symbol:
314 symbol += ' '
315 node_addr_to_name[m] = 'Mutex M%d (%s0x%016x)' % (counter, symbol, m)
Kenny Yu66fb4d22017-02-02 10:05:31 -0800316 nodes_in_order.append(nodes_in_order[0])
317
318 print('----------------\nPotential Deadlock Detected!\n')
319 print(
320 'Cycle in lock order graph: %s\n' %
321 (' => '.join([node_addr_to_name[n] for n in nodes_in_order]))
322 )
323
324 # Set of threads involved in the lock inversion
325 thread_pids = set()
326
327 # For each edge in the cycle, print where the two mutexes were held
328 for (m, n) in edges:
Kenny Yue7dff432017-02-03 09:39:39 -0800329 thread_pid = graph.attributes(m, n)['thread_pid']
330 thread_comm = graph.attributes(m, n)['thread_comm']
331 first_mutex_stack_id = graph.attributes(m, n)['first_mutex_stack_id']
332 second_mutex_stack_id = graph.attributes(m, n)['second_mutex_stack_id']
Kenny Yu66fb4d22017-02-02 10:05:31 -0800333 thread_pids.add(thread_pid)
334 print(
335 '%s acquired here while holding %s in Thread %d (%s):' % (
336 node_addr_to_name[n], node_addr_to_name[m], thread_pid,
337 thread_comm
338 )
339 )
340 print_stack_trace_fn(second_mutex_stack_id)
341 print('')
342 print(
343 '%s previously acquired by the same Thread %d (%s) here:' %
344 (node_addr_to_name[m], thread_pid, thread_comm)
345 )
346 print_stack_trace_fn(first_mutex_stack_id)
347 print('')
348
349 # Print where the threads were created, if available
350 for thread_pid in thread_pids:
Kenny Yue7dff432017-02-03 09:39:39 -0800351 parent_pid, stack_id, parent_comm = thread_info.get(
Kenny Yu66fb4d22017-02-02 10:05:31 -0800352 thread_pid, (None, None, None)
353 )
354 if parent_pid:
355 print(
356 'Thread %d created by Thread %d (%s) here: ' %
357 (thread_pid, parent_pid, parent_comm)
358 )
359 print_stack_trace_fn(stack_id)
360 else:
361 print(
362 'Could not find stack trace where Thread %d was created' %
363 thread_pid
364 )
365 print('')
Kenny Yue7dff432017-02-03 09:39:39 -0800366
367
368def symbolize_with_objdump(binary, addr):
369 '''
Kenny Yud07b7592017-02-03 13:33:20 -0800370 Searches the binary for the address using objdump. Returns the symbol if
Kenny Yue7dff432017-02-03 09:39:39 -0800371 it is found, otherwise returns empty string.
372 '''
373 try:
374 command = (
375 'objdump -tT %s | grep %x | awk {\'print $NF\'} | c++filt' %
376 (binary, addr)
377 )
378 output = subprocess.check_output(command, shell=True)
379 return output.decode('utf-8').strip()
380 except subprocess.CalledProcessError:
381 return ''
Kenny Yu66fb4d22017-02-02 10:05:31 -0800382
383
384def strlist(s):
Kenny Yue7dff432017-02-03 09:39:39 -0800385 '''Given a comma-separated string, returns a list of substrings'''
Kenny Yu66fb4d22017-02-02 10:05:31 -0800386 return s.strip().split(',')
387
388
389def main():
Kenny Yuc44e92c2017-02-03 18:50:33 -0800390 examples = '''Examples:
391 deadlock_detector 181 # Analyze PID 181
392
393 deadlock_detector 181 --binary /lib/x86_64-linux-gnu/libpthread.so.0
394 # Analyze PID 181 and locks from this binary.
395 # If tracing a process that is running from
396 # a dynamically-linked binary, this argument
397 # is required and should be the path to the
398 # pthread library.
399
400 deadlock_detector 181 --verbose
401 # Analyze PID 181 and print statistics about
402 # the mutex wait graph.
403
404 deadlock_detector 181 --lock-symbols my_mutex_lock1,my_mutex_lock2 \\
405 --unlock-symbols my_mutex_unlock1,my_mutex_unlock2
406 # Analyze PID 181 and trace custom mutex
407 # symbols instead of pthread mutexes.
408
409 deadlock_detector 181 --dump-graph graph.json
410 # Analyze PID 181 and dump the mutex wait
411 # graph to graph.json.
412 '''
Kenny Yu66fb4d22017-02-02 10:05:31 -0800413 parser = argparse.ArgumentParser(
414 description=(
415 'Detect potential deadlocks (lock inversions) in a running binary.'
Kenny Yuc44e92c2017-02-03 18:50:33 -0800416 '\nMust be run as root.'
417 ),
418 formatter_class=argparse.RawDescriptionHelpFormatter,
419 epilog=examples,
Kenny Yu66fb4d22017-02-02 10:05:31 -0800420 )
Kenny Yu66fb4d22017-02-02 10:05:31 -0800421 parser.add_argument('pid', type=int, help='Pid to trace')
Kenny Yue7dff432017-02-03 09:39:39 -0800422 # Binaries with `:` in the path will fail to attach uprobes on kernels
423 # running without this patch: https://lkml.org/lkml/2017/1/13/585.
424 # Symlinks to the binary without `:` in the path can get around this issue.
425 parser.add_argument(
426 '--binary',
427 type=str,
428 default='',
Kenny Yuc44e92c2017-02-03 18:50:33 -0800429 help='If set, trace the mutexes from the binary at this path. '
430 'For statically-linked binaries, this argument is not required. '
431 'For dynamically-linked binaries, this argument is required and '
432 'should be the path of the pthread library the binary is using. '
433 'Example: /lib/x86_64-linux-gnu/libpthread.so.0',
Kenny Yue7dff432017-02-03 09:39:39 -0800434 )
Kenny Yu66fb4d22017-02-02 10:05:31 -0800435 parser.add_argument(
436 '--dump-graph',
437 type=str,
438 default='',
439 help='If set, this will dump the mutex graph to the specified file.',
440 )
441 parser.add_argument(
Kenny Yue7dff432017-02-03 09:39:39 -0800442 '--verbose',
443 action='store_true',
444 help='Print statistics about the mutex wait graph.',
445 )
446 parser.add_argument(
Kenny Yu66fb4d22017-02-02 10:05:31 -0800447 '--lock-symbols',
448 type=strlist,
449 default=['pthread_mutex_lock'],
450 help='Comma-separated list of lock symbols to trace. Default is '
Kenny Yuc44e92c2017-02-03 18:50:33 -0800451 'pthread_mutex_lock. These symbols cannot be inlined in the binary.',
Kenny Yu66fb4d22017-02-02 10:05:31 -0800452 )
453 parser.add_argument(
454 '--unlock-symbols',
455 type=strlist,
456 default=['pthread_mutex_unlock'],
457 help='Comma-separated list of unlock symbols to trace. Default is '
Kenny Yuc44e92c2017-02-03 18:50:33 -0800458 'pthread_mutex_unlock. These symbols cannot be inlined in the binary.',
Kenny Yu66fb4d22017-02-02 10:05:31 -0800459 )
460 args = parser.parse_args()
Kenny Yue7dff432017-02-03 09:39:39 -0800461 if not args.binary:
462 try:
463 args.binary = os.readlink('/proc/%d/exe' % args.pid)
464 except OSError as e:
465 print('%s. Is the process (pid=%d) running?' % (str(e), args.pid))
466 sys.exit(1)
467
Yonghong Song64335692018-04-25 00:40:13 -0700468 bpf = BPF(src_file=b'deadlock_detector.c')
Kenny Yu66fb4d22017-02-02 10:05:31 -0800469
470 # Trace where threads are created
Yonghong Song64335692018-04-25 00:40:13 -0700471 bpf.attach_kretprobe(event=bpf.get_syscall_fnname('clone'), fn_name='trace_clone')
Kenny Yu66fb4d22017-02-02 10:05:31 -0800472
473 # We must trace unlock first, otherwise in the time we attached the probe
Kenny Yue7dff432017-02-03 09:39:39 -0800474 # on lock() and have not yet attached the probe on unlock(), a thread can
475 # acquire mutexes and release them, but the release events will not be
476 # traced, resulting in noisy reports.
Kenny Yu66fb4d22017-02-02 10:05:31 -0800477 for symbol in args.unlock_symbols:
478 try:
479 bpf.attach_uprobe(
480 name=args.binary,
481 sym=symbol,
482 fn_name='trace_mutex_release',
483 pid=args.pid,
484 )
485 except Exception as e:
Kenny Yue7dff432017-02-03 09:39:39 -0800486 print('%s. Failed to attach to symbol: %s' % (str(e), symbol))
Kenny Yu66fb4d22017-02-02 10:05:31 -0800487 sys.exit(1)
488 for symbol in args.lock_symbols:
489 try:
490 bpf.attach_uprobe(
491 name=args.binary,
492 sym=symbol,
493 fn_name='trace_mutex_acquire',
494 pid=args.pid,
495 )
496 except Exception as e:
Kenny Yue7dff432017-02-03 09:39:39 -0800497 print('%s. Failed to attach to symbol: %s' % (str(e), symbol))
Kenny Yu66fb4d22017-02-02 10:05:31 -0800498 sys.exit(1)
499
500 def print_stack_trace(stack_id):
Kenny Yue7dff432017-02-03 09:39:39 -0800501 '''Closure that prints the symbolized stack trace.'''
Kenny Yu66fb4d22017-02-02 10:05:31 -0800502 for addr in bpf.get_table('stack_traces').walk(stack_id):
503 line = bpf.sym(addr, args.pid)
Kenny Yue7dff432017-02-03 09:39:39 -0800504 # Try to symbolize with objdump if we cannot with bpf.
505 if line == '[unknown]':
506 symbol = symbolize_with_objdump(args.binary, addr)
507 if symbol:
508 line = symbol
Kenny Yu66fb4d22017-02-02 10:05:31 -0800509 print('@ %016x %s' % (addr, line))
510
511 print('Tracing... Hit Ctrl-C to end.')
512 while True:
513 try:
Kenny Yue7dff432017-02-03 09:39:39 -0800514 # Map of child thread pid -> parent info
515 thread_info = {
516 child.value: (parent.parent_pid, parent.stack_id, parent.comm)
517 for child, parent in bpf.get_table('thread_to_parent').items()
518 }
519
Kenny Yu66fb4d22017-02-02 10:05:31 -0800520 # Mutex wait directed graph. Nodes are mutexes. Edge (A,B) exists
521 # if there exists some thread T where lock(A) was called and
522 # lock(B) was called before unlock(A) was called.
Kenny Yue7dff432017-02-03 09:39:39 -0800523 graph = DiGraph()
Kenny Yu66fb4d22017-02-02 10:05:31 -0800524 for key, leaf in bpf.get_table('edges').items():
525 graph.add_edge(
526 key.mutex1,
527 key.mutex2,
528 thread_pid=leaf.thread_pid,
529 thread_comm=leaf.comm.decode('utf-8'),
530 first_mutex_stack_id=leaf.mutex1_stack_id,
531 second_mutex_stack_id=leaf.mutex2_stack_id,
532 )
Kenny Yue7dff432017-02-03 09:39:39 -0800533 if args.verbose:
534 print(
535 'Mutexes: %d, Edges: %d' %
536 (len(graph.nodes()), len(graph.edges()))
537 )
Kenny Yu66fb4d22017-02-02 10:05:31 -0800538 if args.dump_graph:
539 with open(args.dump_graph, 'w') as f:
Kenny Yue7dff432017-02-03 09:39:39 -0800540 data = graph.node_link_data()
Kenny Yu66fb4d22017-02-02 10:05:31 -0800541 f.write(json.dumps(data, indent=2))
542
Kenny Yue7dff432017-02-03 09:39:39 -0800543 cycle = find_cycle(graph)
544 if cycle:
545 print_cycle(
546 args.binary, graph, cycle, thread_info, print_stack_trace
547 )
Kenny Yu66fb4d22017-02-02 10:05:31 -0800548 sys.exit(1)
549
550 time.sleep(1)
551 except KeyboardInterrupt:
552 break
553
554
555if __name__ == '__main__':
556 main()