blob: 2441e45598f12f64c1dfca99ed3e9218ef9a3edb [file] [log] [blame]
John Brawn3546c2f2016-05-26 11:16:43 +00001#!/usr/bin/env python
2
"""A tool for extracting a list of symbols to export

When exporting symbols from a dll or exe we either need to mark the symbols in
the source code as __declspec(dllexport) or supply a list of symbols to the
linker. This program automates the latter by inspecting the symbol tables of a
list of link inputs and deciding which of those symbols need to be exported.

We can't just export all the defined symbols, as there's a limit of 65535
exported symbols and in clang we go way over that, particularly in a debug
build. Therefore a large part of the work is pruning symbols which either can't
be imported, or which we think are things that have definitions in public header
files (i.e. template instantiations) and so would get defined in the thing
importing these symbols anyway.
"""
17
import argparse
import multiprocessing
import os
import re
import subprocess
import sys
24
# Define functions which extract a list of symbols from a library using several
# different tools. We use subprocess.Popen and yield a symbol at a time instead
# of using subprocess.check_output and returning a list as, especially on
# Windows, waiting for the entire output to be ready can take a significant
# amount of time.
30
def dumpbin_get_symbols(lib):
    """Yield the defined external symbols that `dumpbin /symbols` lists for lib.

    Args:
        lib: Path of the library or object file passed to dumpbin.

    Yields:
        The symbol name captured from each output line that mentions a
        section (SECT) and is marked External.
    """
    # universal_newlines=True puts the pipe in text mode, so lines are str on
    # both Python 2 and 3 and '\r\n' is normalised to '\n'.
    process = subprocess.Popen(['dumpbin', '/symbols', lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    process.stdin.close()
    try:
        for line in process.stdout:
            # Look for external symbols that are defined in some section
            match = re.match(r"^.+SECT.+External\s+\|\s+(\S+).*$", line)
            if match:
                yield match.group(1)
    finally:
        # Runs on normal exhaustion and when the generator is abandoned early,
        # so the pipe is closed and the child is always reaped.
        process.stdout.close()
        process.wait()
41
def nm_get_symbols(lib):
    """Yield the defined external symbols that `nm` lists for lib.

    Args:
        lib: Path of the library or object file passed to nm.

    Yields:
        The symbol name from each output line whose type letter is one of
        B, D, G, R, S, T, V or W.
    """
    # universal_newlines=True puts the pipe in text mode, so lines are str on
    # both Python 2 and 3 and never end in '\r\n' (a trailing '\r' would stop
    # the '$'-anchored pattern below from matching).
    process = subprocess.Popen(['nm', lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    process.stdin.close()
    try:
        for line in process.stdout:
            # Look for external symbols that are defined in some section
            match = re.match(r"^\S+\s+[BDGRSTVW]\s+(\S+)$", line)
            if match:
                yield match.group(1)
    finally:
        # Runs on normal exhaustion and when the generator is abandoned early,
        # so the pipe is closed and the child is always reaped.
        process.stdout.close()
        process.wait()
52
def readobj_get_symbols(lib):
    """Yield the defined external symbols that llvm-readobj lists for lib.

    Args:
        lib: Path of the library or object file passed to llvm-readobj.

    Yields:
        The most recently seen Name each time a 'StorageClass: External' line
        is reached and the most recently seen Section is neither
        IMAGE_SYM_ABSOLUTE nor IMAGE_SYM_UNDEFINED.
    """
    # '--symbols' (two dashes) is used because recent llvm-readobj releases no
    # longer accept long options spelled with a single dash.
    # universal_newlines=True puts the pipe in text mode so lines are str on
    # both Python 2 and 3.
    process = subprocess.Popen(['llvm-readobj', '--symbols', lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    process.stdin.close()
    # Initialised so that an unexpected StorageClass line appearing before any
    # Name/Section line is skipped instead of raising UnboundLocalError.
    name = None
    section = None
    try:
        for line in process.stdout:
            # When looking through the output of llvm-readobj we expect to see
            # Name, Section, then StorageClass, so record Name and Section
            # when we see them and decide if this is a defined external symbol
            # when we see StorageClass.
            match = re.search(r'Name: (\S+)', line)
            if match:
                name = match.group(1)
            match = re.search(r'Section: (\S+)', line)
            if match:
                section = match.group(1)
            match = re.search(r'StorageClass: (\S+)', line)
            if match:
                storageclass = match.group(1)
                if name is not None and section is not None and \
                   section != 'IMAGE_SYM_ABSOLUTE' and \
                   section != 'IMAGE_SYM_UNDEFINED' and \
                   storageclass == 'External':
                    yield name
    finally:
        # Runs on normal exhaustion and when the generator is abandoned early,
        # so the pipe is closed and the child is always reaped.
        process.stdout.close()
        process.wait()
76
# Define functions which determine if the target is 32-bit Windows (as that's
# where calling convention name decoration happens).
79
def dumpbin_is_32bit_windows(lib):
    """Return True if `dumpbin /headers` reports lib's machine as x86.

    Args:
        lib: Path of the library or object file passed to dumpbin.

    Returns:
        True if the first 'machine (...)' line names x86; False if it names
        anything else or if no such line is seen.
    """
    # dumpbin /headers can output a huge amount of data (>100MB in a debug
    # build) so we read only up to the 'machine' line then close the output.
    # universal_newlines=True puts the pipe in text mode so lines are str on
    # both Python 2 and 3.
    process = subprocess.Popen(['dumpbin', '/headers', lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    process.stdin.close()
    retval = False
    try:
        for line in process.stdout:
            match = re.match(r'.+machine \((\S+)\)', line)
            if match:
                retval = (match.group(1) == 'x86')
                break
    finally:
        # Close the pipe before waiting so the child is not left blocked
        # writing the rest of its output.
        process.stdout.close()
        process.wait()
    return retval
95
def objdump_is_32bit_windows(lib):
    """Return True if `objdump -f` reports lib's file format as pe-i386.

    Args:
        lib: Path of the library or object file passed to objdump.

    Returns:
        True if the first 'file format' line names pe-i386; False if it
        names anything else or if no such line is seen.

    Raises:
        subprocess.CalledProcessError: If objdump exits with a non-zero status.
    """
    # universal_newlines=True makes check_output return str on both Python 2
    # and 3.
    output = subprocess.check_output(['objdump', '-f', lib],
                                     universal_newlines=True)
    # Iterating over the output string directly would visit one character at a
    # time, so the pattern could never match; split it into lines first.
    for line in output.splitlines():
        match = re.match(r'.+file format (\S+)', line)
        if match:
            return match.group(1) == 'pe-i386'
    return False
103
def readobj_is_32bit_windows(lib):
    """Return True if llvm-readobj reports lib's format as COFF-i386.

    Args:
        lib: Path of the library or object file passed to llvm-readobj.

    Returns:
        True if the first line starting with 'Format:' names COFF-i386; False
        if it names anything else or if no such line is seen.

    Raises:
        subprocess.CalledProcessError: If llvm-readobj exits with a non-zero
            status.
    """
    # NOTE(review): the single-dash '-file-headers' spelling is kept from the
    # original; confirm it is accepted by the llvm-readobj release in use.
    # universal_newlines=True makes check_output return str on both Python 2
    # and 3.
    output = subprocess.check_output(['llvm-readobj', '-file-headers', lib],
                                     universal_newlines=True)
    # Iterating over the output string directly would visit one character at a
    # time, so the pattern could never match; split it into lines first.
    for line in output.splitlines():
        match = re.match(r'Format: (\S+)', line)
        if match:
            return match.group(1) == 'COFF-i386'
    return False
111
# MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the
# identifier/type mangling we can decide which symbols could possibly be
# required and which we can discard.
def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
    """Decide whether a symbol using Microsoft mangling should be exported.

    Args:
        symbol: Symbol name as read from the symbol table.
        calling_convention_decoration: True if unmangled names carry calling
            convention decoration that must be stripped.

    Returns:
        The name to export (the symbol itself, or an unmangled name with its
        decoration removed), or None if the symbol should be discarded.
    """
    # Keep unmangled (i.e. extern "C") names
    if '?' not in symbol:
        if calling_convention_decoration:
            # Remove calling convention decoration from names: drop a leading
            # '_' or '@' and anything from the next '@' onwards.
            match = re.match(r'[_@]([^@]+)', symbol)
            if match:
                return match.group(1)
        return symbol
    # Function template instantiations start with ??$, discard them as it's
    # assumed that the definition is public
    elif symbol.startswith('??$'):
        return None
    # Deleting destructors start with ??_G or ??_E and can be discarded because
    # link.exe gives you a warning telling you they can't be exported if you
    # don't
    elif symbol.startswith('??_G') or symbol.startswith('??_E'):
        return None
    # Constructors (?0) and destructors (?1) of templates (?$) are assumed to be
    # defined in headers and not required to be kept
    elif symbol.startswith('??0?$') or symbol.startswith('??1?$'):
        return None
    # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
    # that mentions an anonymous namespace can be discarded, as the anonymous
    # namespace doesn't exist outside of that translation unit.
    elif re.search(r'\?A(0x\w+)?@', symbol):
        return None
    # Keep mangled llvm:: and clang:: function symbols. How we detect these is a
    # bit of a mess and imprecise, but that avoids having to completely demangle
    # the symbol name. The outermost namespace is at the end of the identifier
    # mangling, and the identifier mangling is followed by the type mangling, so
    # we look for (llvm|clang)@@ followed by something that looks like a
    # function type mangling. To spot a function type we use (this is derived
    # from clang/lib/AST/MicrosoftMangle.cpp):
    # <function-type> ::= <function-class> <this-cvr-qualifiers>
    #                     <calling-convention> <return-type>
    #                     <argument-list> <throw-spec>
    # <function-class> ::= [A-Z]
    # <this-cvr-qualifiers> ::= [A-Z0-9_]*
    # <calling-convention> ::= [A-JQ]
    # <return-type> ::= .+
    # <argument-list> ::= X (void)
    #                 ::= .+@ (list of types)
    #                 ::= .*Z (list of types, varargs)
    # <throw-spec> ::= exceptions are not allowed
    elif re.search(r'(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$',
                   symbol):
        return symbol
    return None
163
# Itanium manglings are of the form _Z<identifier_mangling><type_mangling>. We
# demangle the identifier mangling to identify symbols that can be safely
# discarded.
def should_keep_itanium_symbol(symbol, calling_convention_decoration):
    """Decide whether a symbol using Itanium mangling should be exported.

    Args:
        symbol: Symbol name as read from the symbol table.
        calling_convention_decoration: True if symbols carry a leading '_'
            decoration that must be stripped.

    Returns:
        The name to export (with any decoration removed), or None if the
        symbol should be discarded.
    """
    # Start by removing any calling convention decoration (which we expect to
    # see on all symbols, even mangled C++ symbols)
    if calling_convention_decoration and symbol.startswith('_'):
        symbol = symbol[1:]
    # Keep unmangled names
    if not symbol.startswith('_') and not symbol.startswith('.'):
        return symbol
    # Discard manglings that aren't nested names
    match = re.match(r'_Z(T[VTIS])?(N.+)', symbol)
    if not match:
        return None
    # Demangle the name. If the name is too complex then we don't need to keep
    # it, but if the demangling fails then keep the symbol just in case.
    try:
        names, _ = parse_itanium_nested_name(match.group(2))
    except TooComplexName:
        return None
    if not names:
        return symbol
    # Constructors and destructors of template classes are assumed to be
    # defined in headers and not required to be kept. The length check avoids
    # an IndexError when the nested name has only a single component, in which
    # case there is no enclosing class to inspect.
    if (len(names) >= 2 and re.match(r'[CD][123]', names[-1][0])
            and names[-2][1]):
        return None
    # Discard function template instantiations as it's assumed that the
    # definition is public
    elif names[-1][1]:
        return None
    # Keep llvm:: and clang:: names
    elif names[0][0] == '4llvm' or names[0][0] == '5clang':
        return symbol
    # Discard everything else
    else:
        return None
201
# Certain kinds of complex manglings we assume cannot be part of a public
# interface, and we handle them by raising an exception.
class TooComplexName(Exception):
    """Raised when a mangled name is too complex to be worth demangling."""
206
# Parse an itanium mangled name from the start of a string and return a
# (name, rest of string) pair.
def parse_itanium_name(arg):
    """Parse one Itanium mangled name component from the start of arg.

    Args:
        arg: The remaining mangled string.

    Returns:
        A (name, rest) pair. For a length-prefixed name, name keeps its
        length prefix (e.g. '4llvm'). If nothing can be parsed, returns
        (None, arg).
    """
    # Check for a normal name: <length><identifier>
    match = re.match(r'(\d+)(.+)', arg)
    if match:
        n = int(match.group(1))
        name = match.group(1) + match.group(2)[:n]
        rest = match.group(2)[n:]
        return name, rest
    # Check for constructor/destructor names
    match = re.match(r'([CD][123])(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Assume that a sequence of characters that doesn't end a nesting is an
    # operator (this is very imprecise, but appears to be good enough)
    match = re.match(r'([^E]+)(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Anything else: we can't handle it
    return None, arg
228
# Parse an itanium mangled template argument list from the start of a string
# and throw it away, returning the rest of the string.
def skip_itanium_template(arg):
    """Skip an Itanium mangled template argument list at the start of arg.

    Args:
        arg: The remaining mangled string; must start with 'I'.

    Returns:
        The string following the 'E' that closes the argument list, or None
        if the string runs out before the list is closed.

    Raises:
        TooComplexName: If an argument starts with 'L' or 'X' (an
            expression), which is assumed to be too complicated to handle.
    """
    # A template argument list starts with I
    assert arg.startswith('I'), arg
    tmp = arg[1:]
    while tmp:
        # Check for names
        match = re.match(r'(\d+)(.+)', tmp)
        if match:
            n = int(match.group(1))
            tmp = match.group(2)[n:]
            continue
        # Check for substitutions
        match = re.match(r'S[A-Z0-9]*_(.+)', tmp)
        if match:
            tmp = match.group(1)
        # Start of a template
        elif tmp.startswith('I'):
            tmp = skip_itanium_template(tmp)
        # Start of a nested name
        elif tmp.startswith('N'):
            _, tmp = parse_itanium_nested_name(tmp)
        # Start of an expression: assume that it's too complicated
        elif tmp.startswith('L') or tmp.startswith('X'):
            raise TooComplexName
        # End of the template
        elif tmp.startswith('E'):
            return tmp[1:]
        # Something else: probably a type, skip it
        else:
            tmp = tmp[1:]
    # Ran out of input (or a nested parse failed) before the closing E
    return None
262
# Parse an itanium mangled nested name and transform it into a list of pairs of
# (name, is_template), returning (list, rest of string).
def parse_itanium_nested_name(arg):
    """Parse an Itanium mangled nested name from the start of arg.

    Args:
        arg: The remaining mangled string; must start with 'N'.

    Returns:
        A (names, rest) pair, where names is a list of (name, is_template)
        pairs from outermost to innermost component and rest is the string
        following the closing 'E'. Returns (None, None) if the name cannot
        be parsed.

    Raises:
        TooComplexName: Propagated from skip_itanium_template when a template
            argument is too complicated to handle.
    """
    # A nested name starts with N
    assert arg.startswith('N'), arg
    ret = []

    # Skip past the N, and possibly a substitution
    match = re.match(r'NS[A-Z0-9]*_(.+)', arg)
    if match:
        tmp = match.group(1)
    else:
        tmp = arg[1:]

    # Skip past CV-qualifiers and ref qualifiers
    match = re.match(r'[rVKRO]*(.+)', tmp)
    if match:
        tmp = match.group(1)

    # Repeatedly parse names from the string until we reach the end of the
    # nested name
    while tmp:
        # An E ends the nested name
        if tmp.startswith('E'):
            return ret, tmp[1:]
        # Parse a name
        name_part, tmp = parse_itanium_name(tmp)
        if not name_part:
            # If we failed then we don't know how to demangle this
            return None, None
        is_template = False
        # If this name is a template record that, then skip the template
        # arguments
        if tmp.startswith('I'):
            tmp = skip_itanium_template(tmp)
            is_template = True
        # Add the name to the list
        ret.append((name_part, is_template))

    # If we get here then something went wrong
    return None, None
304
def extract_symbols(arg):
    """Count the symbols worth keeping in a single library.

    Takes one tuple argument so that it can be used as the mapping function
    of a multiprocessing pool.

    Args:
        arg: A (get_symbols, should_keep_symbol,
            calling_convention_decoration, lib) tuple. get_symbols(lib)
            iterates over symbol names, and
            should_keep_symbol(symbol, calling_convention_decoration)
            returns the name to keep or a false value to discard it.

    Returns:
        A dict mapping each kept name to the number of times it was seen.
    """
    get_symbols, should_keep_symbol, calling_convention_decoration, lib = arg
    symbols = dict()
    for symbol in get_symbols(lib):
        symbol = should_keep_symbol(symbol, calling_convention_decoration)
        if symbol:
            symbols[symbol] = symbols.get(symbol, 0) + 1
    return symbols
313
if __name__ == '__main__':
    # Command-line driver: choose the tools to use, resolve the library file
    # names, extract the candidate symbols from every library in parallel and
    # finally write out the symbols that should be exported.
    tool_exes = ['dumpbin', 'nm', 'objdump', 'llvm-readobj']
    parser = argparse.ArgumentParser(
        description='Extract symbols to export from libraries')
    parser.add_argument('--mangling', choices=['itanium', 'microsoft'],
                        required=True, help='expected symbol mangling scheme')
    parser.add_argument('--tools', choices=tool_exes, nargs='*',
                        help='tools to use to extract symbols and determine the'
                        ' target')
    parser.add_argument('libs', metavar='lib', type=str, nargs='+',
                        help='libraries to extract symbols from')
    parser.add_argument('-o', metavar='file', type=str, help='output to file')
    args = parser.parse_args()

    # Determine the function to use to get the list of symbols from the inputs,
    # and the function to use to determine if the target is 32-bit windows.
    tools = {
        'dumpbin': (dumpbin_get_symbols, dumpbin_is_32bit_windows),
        'nm': (nm_get_symbols, None),
        'objdump': (None, objdump_is_32bit_windows),
        'llvm-readobj': (readobj_get_symbols, readobj_is_32bit_windows),
    }
    get_symbols = None
    is_32bit_windows = None
    # If we have a tools argument then use that for the list of tools to check
    if args.tools:
        tool_exes = args.tools
    # Find a tool to use by trying each in turn until we find one that exists
    # (subprocess.Popen will throw OSError when the program does not exist)
    for exe in tool_exes:
        try:
            # Close std streams as we don't want any output and we don't
            # want the process to wait for something on stdin.
            p = subprocess.Popen([exe], stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 stdin=subprocess.PIPE)
            p.stdout.close()
            p.stderr.close()
            p.stdin.close()
            p.wait()
        except OSError:
            continue
        # Keep going until we have a tool to use for both get_symbols and
        # is_32bit_windows
        if not get_symbols:
            get_symbols = tools[exe][0]
        if not is_32bit_windows:
            is_32bit_windows = tools[exe][1]
        if get_symbols and is_32bit_windows:
            break
    if not get_symbols:
        sys.stderr.write("Couldn't find a program to read symbols with\n")
        sys.exit(1)
    if not is_32bit_windows:
        sys.stderr.write("Couldn't find a program to determine the target\n")
        sys.exit(1)

    # How we determine which symbols to keep and which to discard depends on
    # the mangling scheme
    if args.mangling == 'microsoft':
        should_keep_symbol = should_keep_microsoft_symbol
    else:
        should_keep_symbol = should_keep_itanium_symbol

    # Get the list of libraries to extract symbols from
    suffixes = ('.lib', '.a', '.obj', '.o')
    libs = list()
    for lib in args.libs:
        # When invoked by cmake the arguments are the cmake target names of the
        # libraries, so we need to add .lib/.a to the end and maybe lib to the
        # start to get the filename. Also allow objects.
        if not lib.endswith(suffixes):
            for s in suffixes:
                if os.path.exists(lib + s):
                    lib = lib + s
                    break
                if os.path.exists('lib' + lib + s):
                    lib = 'lib' + lib + s
                    break
        if not lib.endswith(suffixes):
            sys.stderr.write("Don't know what to do with argument " + lib +
                             "\n")
            sys.exit(1)
        libs.append(lib)

    # Check if calling convention decoration is used by inspecting the first
    # library in the list
    calling_convention_decoration = is_32bit_windows(libs[0])

    # Extract symbols from libraries in parallel. This is a huge time saver when
    # doing a debug build, as there are hundreds of thousands of symbols in each
    # library.
    pool = multiprocessing.Pool()
    try:
        # Only one argument can be passed to the mapping function, and we can't
        # use a lambda or local function definition as that doesn't work on
        # windows, so create a list of tuples which duplicates the arguments
        # that are the same in all calls.
        vals = [(get_symbols, should_keep_symbol,
                 calling_convention_decoration, x) for x in libs]
        # Do an async map then wait for the result to make sure that
        # KeyboardInterrupt gets caught correctly (see
        # http://bugs.python.org/issue8296)
        result = pool.map_async(extract_symbols, vals)
        pool.close()
        libs_symbols = result.get(3600)
    except KeyboardInterrupt:
        # On Ctrl-C terminate everything and exit
        pool.terminate()
        pool.join()
        sys.exit(1)

    # Merge everything into a single dict
    symbols = dict()
    for this_lib_symbols in libs_symbols:
        for k, v in this_lib_symbols.items():
            symbols[k] = v + symbols.get(k, 0)

    # Count instances of member functions of template classes, and map the
    # symbol name to the function+class. We do this under the assumption that if
    # a member function of a template class is instantiated many times it's
    # probably declared in a public header file.
    template_function_count = dict()
    template_function_mapping = dict()
    # Symbols that aren't template member functions map to "", whose count
    # stays at zero.
    template_function_count[""] = 0
    for k in symbols:
        name = None
        if args.mangling == 'microsoft':
            # Member functions of templates start with
            # ?<fn_name>@?$<class_name>@, so we map to <fn_name>@?$<class_name>.
            # As manglings go from the innermost scope to the outermost scope
            # this means:
            #  * When we have a function member of a subclass of a template
            #    class then <fn_name> will actually contain the mangling of
            #    both the subclass and the function member. This is fine.
            #  * When we have a function member of a template subclass of a
            #    (possibly template) class then it's the innermost template
            #    subclass that becomes <class_name>. This should be OK so long
            #    as we don't have multiple classes with a template subclass of
            #    the same name.
            match = re.search(r"^\?(\??\w+\@\?\$\w+)\@", k)
            if match:
                name = match.group(1)
        else:
            # Find member functions of templates by demangling the name and
            # checking if the second-to-last name in the list is a template.
            match = re.match(r'_Z(T[VTIS])?(N.+)', k)
            if match:
                try:
                    names, _ = parse_itanium_nested_name(match.group(2))
                    # The length check avoids an IndexError for nested names
                    # with a single component (no enclosing class).
                    if names and len(names) >= 2 and names[-2][1]:
                        name = ''.join([x for x, _ in names])
                except TooComplexName:
                    # Manglings that are too complex should already have been
                    # filtered out, but if we happen to somehow see one here
                    # just leave it as-is.
                    pass
        if name:
            old_count = template_function_count.get(name, 0)
            template_function_count[name] = old_count + 1
            template_function_mapping[k] = name
        else:
            template_function_mapping[k] = ""

    # Print symbols which both:
    #  * Appear in exactly one input, as symbols defined in multiple
    #    objects/libraries are assumed to have public definitions.
    #  * Aren't instances of member functions of templates which have been
    #    instantiated 100 times or more, which are assumed to have public
    #    definitions. (100 is an arbitrary guess here.)
    if args.o:
        outfile = open(args.o, 'w')
    else:
        outfile = sys.stdout
    try:
        for k, v in symbols.items():
            template_count = \
                template_function_count[template_function_mapping[k]]
            if v == 1 and template_count < 100:
                outfile.write(k + '\n')
    finally:
        # Only close files we opened ourselves, never sys.stdout
        if args.o:
            outfile.close()