blob: 6062e2ea8ca9fea22fc3b0e80993fcfd0b0c9934 [file] [log] [blame]
Laszlo Nagybc687582016-01-12 22:38:41 +00001# -*- coding: utf-8 -*-
2# The LLVM Compiler Infrastructure
3#
4# This file is distributed under the University of Illinois Open Source
5# License. See LICENSE.TXT for details.
6""" This module is responsible to capture the compiler invocation of any
7build process. The result of that should be a compilation database.
8
9This implementation is using the LD_PRELOAD or DYLD_INSERT_LIBRARIES
10mechanisms provided by the dynamic linker. The related library is implemented
11in C language and can be found under 'libear' directory.
12
13The 'libear' library is capturing all child process creation and logging the
14relevant information about it into separate files in a specified directory.
15The parameter of this process is the output directory name, where the report
16files shall be placed. This parameter is passed as an environment variable.
17
18The module also implements compiler wrappers to intercept the compiler calls.
19
The module implements the build command execution and the post-processing of
the output files, which will be condensed into a compilation database. """
22
23import sys
24import os
25import os.path
26import re
27import itertools
28import json
29import glob
30import argparse
31import logging
32import subprocess
33from libear import build_libear, TemporaryDirectory
34from libscanbuild import duplicate_check, tempdir, initialize_logging
35from libscanbuild import command_entry_point
36from libscanbuild.command import Action, classify_parameters
37from libscanbuild.shell import encode, decode
38
39__all__ = ['capture', 'intercept_build_main', 'intercept_build_wrapper']
40
# ASCII control characters used as delimiters inside the exec report files.
GS = '\x1d'  # group separator: terminates one complete exec report
RS = '\x1e'  # record separator: sits between the fields of a report
US = '\x1f'  # unit separator: terminates every command line argument

# File names of the compiler wrapper executables (looked up in 'bin_dir').
COMPILER_WRAPPER_CC = 'intercept-cc'
COMPILER_WRAPPER_CXX = 'intercept-c++'
47
48
@command_entry_point
def intercept_build_main(bin_dir):
    """ Run the 'intercept-build' command line tool.

    Parses the command line, sets up logging and passes the parsed
    arguments together with 'bin_dir' to 'capture'. When no build command
    was given, only the usage text is printed.

    Returns the value of 'capture', or zero when there was nothing to run.
    """

    cli = create_parser()
    options = cli.parse_args()

    initialize_logging(options.verbose)
    logging.debug('Parsed arguments: %s', options)

    if options.build:
        return capture(options, bin_dir)

    # no build command on the command line: show the usage instead
    cli.print_help()
    return 0
64
65
def capture(args, bin_dir):
    """ Run the build command under interception and write 'args.cdb'.

    The build is executed with the environment prepared by
    'setup_environment'. The exec reports left in a temporary directory
    are parsed afterwards and dumped as JSON into 'args.cdb', either
    unfiltered (when 'args.raw_entries' is set) or post-processed into
    compilation database entries.

    Returns the exit code of the build command. """

    def read_previous():
        """ Load the entries of an existing compilation database.

        Only done when appending was requested and the file exists;
        otherwise an empty iterator is returned. """

        if 'append' in args and args.append and os.path.exists(args.cdb):
            with open(args.cdb) as handle:
                return iter(json.load(handle))
        return iter([])

    def post_processing(commands):
        """ Turn intercepted exec calls into compilation database entries.

        Commands which are not compiler calls are dropped, the remaining
        ones are formatted by 'format_entry'. To support incremental
        builds, the elements of an existing compilation database from a
        previous run are merged with the new elements. Entries whose
        source file does not exist, and duplicates, are left out. """

        # entries of the current run (evaluated lazily)
        compiler_calls = (call for call in commands if is_compiler_call(call))
        current = itertools.chain.from_iterable(
            format_entry(call) for call in compiler_calls)
        # entries of the previous run: these must be read right here,
        # because the caller opens the very same file for writing before
        # the returned generator gets consumed.
        previous = read_previous()
        # filter out duplicate entries from both
        duplicate = duplicate_check(entry_hash)
        return (entry for entry in itertools.chain(previous, current)
                if os.path.exists(entry['file']) and not duplicate(entry))

    with TemporaryDirectory(prefix='intercept-', dir=tempdir()) as tmp_dir:
        # run the build command
        environment = setup_environment(args, tmp_dir, bin_dir)
        logging.debug('run build in environment: %s', environment)
        exit_code = subprocess.call(args.build, env=environment)
        logging.info('build finished with exit code: %d', exit_code)
        # read the intercepted exec calls
        trace_files = sorted(glob.iglob(os.path.join(tmp_dir, '*.cmd')))
        commands = itertools.chain.from_iterable(
            parse_exec_trace(os.path.join(tmp_dir, trace_file))
            for trace_file in trace_files)
        # do post processing only if that was requested
        wants_raw = 'raw_entries' in args and args.raw_entries
        entries = commands if wants_raw else post_processing(commands)
        # dump the compilation database
        with open(args.cdb, 'w+') as handle:
            json.dump(list(entries), handle, sort_keys=True, indent=4)
        return exit_code
113
114
def setup_environment(args, destination, bin_dir):
    """ Create the environment for the build command.

    The result is a copy of the current process environment, extended
    with the variables needed to log the exec calls into 'destination':
    either by preloading the 'libear' library, or (when preloading is
    overridden, disabled, or the library is not available) by routing CC
    and CXX through the compiler wrappers found in 'bin_dir'.

    Returns the environment as a dictionary. """

    c_compiler = 'cc'
    if 'cc' in args:
        c_compiler = args.cc
    cxx_compiler = 'c++'
    if 'cxx' in args:
        cxx_compiler = args.cxx

    # the library is built only when preloading is going to be used
    if args.override_compiler or is_preload_disabled(sys.platform):
        libear_path = None
    else:
        libear_path = build_libear(c_compiler, destination)

    environment = dict(os.environ)
    environment['INTERCEPT_BUILD_TARGET_DIR'] = destination

    if not libear_path:
        logging.debug('intercept gonna use compiler wrappers')
        wrapper_verbosity = 'DEBUG' if args.verbose > 2 else 'INFO'
        environment['CC'] = os.path.join(bin_dir, COMPILER_WRAPPER_CC)
        environment['CXX'] = os.path.join(bin_dir, COMPILER_WRAPPER_CXX)
        environment['INTERCEPT_BUILD_CC'] = c_compiler
        environment['INTERCEPT_BUILD_CXX'] = cxx_compiler
        environment['INTERCEPT_BUILD_VERBOSE'] = wrapper_verbosity
    elif sys.platform == 'darwin':
        logging.debug('intercept gonna preload libear on OSX')
        environment['DYLD_INSERT_LIBRARIES'] = libear_path
        environment['DYLD_FORCE_FLAT_NAMESPACE'] = '1'
    else:
        logging.debug('intercept gonna preload libear on UNIX')
        environment['LD_PRELOAD'] = libear_path

    return environment
151
152
def intercept_build_wrapper(cplusplus):
    """ Entry point for `intercept-cc` and `intercept-c++` compiler wrappers.

    Appends an execution report about this very process to a file in the
    directory named by INTERCEPT_BUILD_TARGET_DIR, then runs the real
    compiler (taken from INTERCEPT_BUILD_CXX or INTERCEPT_BUILD_CC) with
    the arguments this wrapper received. A failure to write the report is
    logged only, the compilation is executed regardless.

    The report fields which are meaningful only for the 'libear' library
    are faked: the parent pid is the own pid, the function name is
    'wrapper'.

    Returns the exit code of the real compiler. """

    # initialize wrapper logging
    logging.basicConfig(format='intercept: %(levelname)s: %(message)s',
                        level=os.getenv('INTERCEPT_BUILD_VERBOSE', 'INFO'))
    # write report
    try:
        report_dir = os.getenv('INTERCEPT_BUILD_TARGET_DIR')
        if not report_dir:
            raise UserWarning('exec report target directory not found')
        pid = str(os.getpid())
        report_file = os.path.join(report_dir, pid + '.cmd')
        logging.debug('writing exec report to: %s', report_file)
        with open(report_file, 'ab') as handler:
            # every argument is terminated by US, fields are separated by
            # RS and the whole report is closed by GS
            fields = [pid, pid, 'wrapper', os.getcwd(),
                      US.join(sys.argv) + US]
            report = RS.join(fields) + GS
            handler.write(report.encode('utf-8'))
    except UserWarning as warning:
        logging.warning(warning)
    except IOError:
        logging.exception('writing exec report failed')
    # execute with real compiler
    if cplusplus:
        compiler = os.getenv('INTERCEPT_BUILD_CXX', 'c++')
    else:
        compiler = os.getenv('INTERCEPT_BUILD_CC', 'cc')
    compilation = [compiler] + sys.argv[1:]
    logging.debug('execute compiler: %s', compilation)
    return subprocess.call(compilation)
189
190
def parse_exec_trace(filename):
    """ Parse an exec report file and generate one dictionary per report.

    The file is expected to be written by the interception library or by
    the compiler wrapper. A single file _might_ contain multiple reports:
    each of them is closed by GS, the fields within are separated by RS
    and every element of the command is terminated by US.

    Yields dictionaries with the 'pid', 'ppid', 'function', 'directory'
    and 'command' keys, where 'command' is a list of arguments. """

    logging.debug('parse exec trace file: %s', filename)
    with open(filename, 'r') as handler:
        content = handler.read()
        for report in content.split(GS):
            # the closing GS leaves an empty chunk behind
            if not report:
                continue
            fields = report.split(RS)
            yield {
                'pid': fields[0],
                'ppid': fields[1],
                'function': fields[2],
                'directory': fields[3],
                # drop the empty string after the last US terminator
                'command': fields[4].split(US)[:-1]
            }
210
211
def format_entry(entry):
    """ Generate the desired fields for compilation database entries.

    The 'command' of the given exec report is classified by
    'classify_parameters'. When the action is a compilation (or something
    which ranks before it), one entry is generated for every source file
    of the command.

    Yields dictionaries with the 'directory', 'command' and 'file' keys,
    where 'command' is the shell escaped form of a normalized compiler
    call for that single source file and 'file' is an absolute path. """

    def abspath(cwd, name):
        """ Create normalized absolute path from input filename. """
        fullname = name if os.path.isabs(name) else os.path.join(cwd, name)
        return os.path.normpath(fullname)

    logging.debug('format this command: %s', entry['command'])
    atoms = classify_parameters(entry['command'])
    if atoms['action'] <= Action.Compile:
        for source in atoms['files']:
            compiler = 'c++' if atoms['c++'] else 'cc'
            # Work on a copy: extending the classified option list in place
            # would make every further source file of the same command
            # receive the '-o', '-x' and '-arch' flags once more.
            flags = list(atoms['compile_options'])
            flags += ['-o', atoms['output']] if atoms['output'] else []
            flags += ['-x', atoms['language']] if 'language' in atoms else []
            flags += [elem
                      for arch in atoms.get('archs_seen', [])
                      for elem in ['-arch', arch]]
            command = [compiler, '-c'] + flags + [source]
            logging.debug('formated as: %s', command)
            yield {
                'directory': entry['directory'],
                'command': encode(command),
                'file': abspath(entry['directory'], source)
            }
238
239
# Executable name patterns of the recognized compilers, compiled only once.
# Every pattern accepts an optional directory prefix.
_COMPILER_PATTERNS = tuple(re.compile(pattern) for pattern in (
    # the compiler wrappers of this module
    r'^([^/]*/)*intercept-c(c|\+\+)$',
    # cc, c++
    r'^([^/]*/)*c(c|\+\+)$',
    # gcc, g++ (and the 'm' variants), with optional dash separated
    # prefixes and version suffix
    r'^([^/]*/)*([^-]*-)*[mg](cc|\+\+)(-\d+(\.\d+){0,2})?$',
    # clang, clang++, with optional dash separated prefixes and version
    # suffix
    r'^([^/]*/)*([^-]*-)*clang(\+\+)?(-\d+(\.\d+){0,2})?$',
    # llvm-gcc, llvm-g++
    r'^([^/]*/)*llvm-g(cc|\+\+)$',
))


def is_compiler_call(entry):
    """ A predicate to decide the entry is a compiler call or not.

    The decision is based on the first element of the 'command' list of
    the entry, which is matched against the known compiler names.

    Returns True when the executable looks like a compiler. An entry with
    an empty command is not a compiler call. """

    command = entry['command']
    if not command:
        # nothing was executed, there is no executable name to look at
        return False
    executable = command[0]
    return any(pattern.match(executable) for pattern in _COMPILER_PATTERNS)
252
253
def is_preload_disabled(platform):
    """ Library-based interposition will fail silently if SIP is enabled,
    so this should be detected. You can detect whether SIP is enabled on
    Darwin by checking whether (1) there is a binary called 'csrutil' in
    the path and, if so, (2) whether a line of the output of executing
    'csrutil status' starts with 'System Integrity Protection status:
    enabled'.

    Same problem on linux when SELinux is enabled. The status query program
    is 'sestatus' and the output line when it's enabled starts with
    'SELinux status: enabled'.

    The 'platform' parameter is a platform identifier as in 'sys.platform'.

    Returns True when the status query reports the protection enabled.
    Returns False on any other platform, and also when the status query
    can not be executed or its output can not be decoded. """

    if platform == 'darwin':
        pattern = re.compile(r'System Integrity Protection status:\s+enabled')
        command = ['csrutil', 'status']
    elif platform in {'linux', 'linux2'}:
        pattern = re.compile(r'SELinux status:\s+enabled')
        command = ['sestatus']
    else:
        return False

    try:
        output = subprocess.check_output(command).decode('utf-8')
    except (OSError, subprocess.CalledProcessError, ValueError):
        # The query program is missing, exited with failure or printed
        # something undecodable: treat the protection as not enabled.
        # Interrupts and interpreter exit requests are not swallowed here.
        return False
    return any(pattern.match(line) for line in output.splitlines())
278
279
def entry_hash(entry):
    """ Compute the key which identifies a compilation database entry.

    Two entries with the same key are considered to be duplicates. The key
    is made of the 'file', the 'directory' and the arguments of the
    'command' of the entry. """

    # The file and directory names are reversed for faster lookup in set.
    reversed_file = entry['file'][::-1]
    reversed_directory = entry['directory'][::-1]
    # On OS X the 'cc' and 'c++' compilers are wrappers for 'clang',
    # therefore both calls would be logged. To avoid this, the first word
    # of the command is not part of the key.
    arguments = decode(entry['command'])[1:]

    return '<>'.join([reversed_file, reversed_directory, ' '.join(arguments)])
293 return '<>'.join([filename, directory, command])
294
295
def create_parser():
    """ Build the command line parser of the 'intercept-build' command.

    Besides the options, the parser collects every remaining argument
    into 'build', which is the command to run.

    Returns the configured 'argparse.ArgumentParser' instance. """

    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # general options
    cli.add_argument(
        '--verbose', '-v',
        action='count',
        default=0,
        help="""Enable verbose output from '%(prog)s'. A second and third
                flag increases verbosity.""")
    cli.add_argument(
        '--cdb',
        metavar='<file>',
        default="compile_commands.json",
        help="""The JSON compilation database.""")

    # appending to a database and dumping raw entries exclude each other
    output_mode = cli.add_mutually_exclusive_group()
    output_mode.add_argument(
        '--append',
        action='store_true',
        help="""Append new entries to existing compilation database.""")
    output_mode.add_argument(
        '--disable-filter', '-n',
        dest='raw_entries',
        action='store_true',
        help="""Intercepted child process creation calls (exec calls) are all
                logged to the output. The output is not a compilation database.
                This flag is for debug purposes.""")

    # options which control the compiler wrappers
    wrapper_options = cli.add_argument_group('advanced options')
    wrapper_options.add_argument(
        '--override-compiler',
        action='store_true',
        help="""Always resort to the compiler wrapper even when better
                intercept methods are available.""")
    wrapper_options.add_argument(
        '--use-cc',
        metavar='<path>',
        dest='cc',
        default='cc',
        help="""When '%(prog)s' analyzes a project by interposing a compiler
                wrapper, which executes a real compiler for compilation and
                do other tasks (record the compiler invocation). Because of
                this interposing, '%(prog)s' does not know what compiler your
                project normally uses. Instead, it simply overrides the CC
                environment variable, and guesses your default compiler.

                If you need '%(prog)s' to use a specific compiler for
                *compilation* then you can use this option to specify a path
                to that compiler.""")
    wrapper_options.add_argument(
        '--use-c++',
        metavar='<path>',
        dest='cxx',
        default='c++',
        help="""This is the same as "--use-cc" but for C++ code.""")

    # everything after the options is the build command itself
    cli.add_argument(
        dest='build',
        nargs=argparse.REMAINDER,
        help="""Command to run.""")

    return cli
359 return parser