D17487: [analyzer][scan-build-py] flag filter modification for compilation database creation

llvm-svn: 266726
diff --git a/clang/tools/scan-build-py/libscanbuild/compilation.py b/clang/tools/scan-build-py/libscanbuild/compilation.py
new file mode 100644
index 0000000..ef906fa
--- /dev/null
+++ b/clang/tools/scan-build-py/libscanbuild/compilation.py
@@ -0,0 +1,141 @@
+# -*- coding: utf-8 -*-
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+""" This module is responsible for parsing a compiler invocation. """
+
+import re
+import os
+import collections
+
+__all__ = ['split_command', 'classify_source', 'compiler_language']
+
+# Map of compiler options dropped during compilation database creation.
+# It is consulted by `split_command`, which filters and classifies the
+# arguments. Please note that these are not the only arguments which
+# might be ignored.
+#
+# Key: the option name. Value: number of arguments after it to skip too.
+IGNORED_FLAGS = {
+    # compile-only flag, ignored because the creator of the compilation
+    # database will set it explicitly.
+    '-c': 0,
+    # dependency file generation options of the preprocessor, ignored
+    # because they would cause duplicate entries in the output (the only
+    # difference would be these flags). this is an actual finding from
+    # users, who suffered longer execution time caused by the duplicates.
+    '-MD': 0,
+    '-MMD': 0,
+    '-MG': 0,
+    '-MP': 0,
+    '-MF': 1,
+    '-MT': 1,
+    '-MQ': 1,
+    # linker options, ignored because the compilation database will contain
+    # compilation commands only, so the compiler would ignore these flags
+    # anyway. the benefit of getting rid of them is to make the output more
+    # readable.
+    '-static': 0,
+    '-shared': 0,
+    '-s': 0,
+    '-rdynamic': 0,
+    '-l': 1,
+    '-L': 1,
+    '-u': 1,
+    '-z': 1,
+    '-T': 1,
+    '-Xlinker': 1
+}
+
+# Regular expressions which recognise C/C++ compiler executable names
+COMPILER_PATTERNS = frozenset(re.compile(regex) for regex in (
+    r'^(intercept-|analyze-|)c(c|\+\+)$',
+    r'^([^-]*-)*[mg](cc|\+\+)(-\d+(\.\d+){0,2})?$',
+    r'^([^-]*-)*clang(\+\+)?(-\d+(\.\d+){0,2})?$',
+    r'^llvm-g(cc|\+\+)$',
+))
+
+
+def split_command(command):
+    """ Returns a value when the command is a compilation, None otherwise.
+
+    The command is a sequence of strings: the program name, then arguments.
+    The value on success is a named tuple with the following attributes:
+
+        compiler: string value of 'c' or 'c++'
+        flags:    list of compile options
+        files:    list of source files """
+
+    # quit right now, if the program was not a C/C++ compiler
+    compiler = compiler_language(command)
+    if not compiler:
+        return None
+    # build a real instance, instead of using the class as attribute holder
+    Compilation = collections.namedtuple('Compilation',
+                                         ['compiler', 'flags', 'files'])
+    result = Compilation(compiler=compiler, flags=[], files=[])
+    # iterate on the compile options
+    args = iter(command[1:])
+    for arg in args:
+        # quit when compilation pass is not involved
+        if arg in {'-E', '-S', '-cc1', '-M', '-MM', '-###'}:
+            return None
+        # ignore some flags; a missing trailing argument must not raise
+        elif arg in IGNORED_FLAGS:
+            for _ in range(IGNORED_FLAGS[arg]):
+                next(args, None)
+        elif re.match(r'^-(l|L|Wl,).+', arg):
+            pass
+        # some parameters could look like filename, take as compile option
+        elif arg in {'-D', '-I'}:
+            value = next(args, None)
+            result.flags.extend([arg] if value is None else [arg, value])
+        # parameter which looks source file is taken...
+        elif re.match(r'^[^-].+', arg) and classify_source(arg):
+            result.files.append(arg)
+        # and consider everything else as compile option.
+        else:
+            result.flags.append(arg)
+    # do extra check on number of source files
+    return result if result.files else None
+
+
+def classify_source(filename, c_compiler=True):
+    """ Tell the language of a source file, judged by its extension.
+
+    filename:   name (or path) of the file to classify
+    c_compiler: when false, '.c' and '.i' files are taken as C++ ones
+
+    Returns the language name, or None for an unknown extension. """
+
+    cplusplus = ('.C', '.cc', '.CC', '.cp', '.cpp', '.cxx', '.c++', '.C++',
+                 '.txx')
+    languages = dict.fromkeys(cplusplus, 'c++')
+    languages.update({
+        '.ii': 'c++-cpp-output',
+        '.m': 'objective-c',
+        '.mi': 'objective-c-cpp-output',
+        '.mm': 'objective-c++',
+        '.mii': 'objective-c++-cpp-output'
+    })
+    if c_compiler:
+        languages.update({'.c': 'c', '.i': 'c-cpp-output'})
+    else:
+        languages.update({'.c': 'c++', '.i': 'c++-cpp-output'})
+    suffix = os.path.splitext(os.path.basename(filename))[1]
+    return languages.get(suffix)
+
+
+def compiler_language(command):
+    """ Decide whether the command is a call of a known C/C++ compiler.
+
+    Returns 'c' or 'c++' when it is. None otherwise. """
+
+    if not command:
+        return None
+    program = os.path.basename(command[0])
+    if not any(regex.match(program) for regex in COMPILER_PATTERNS):
+        return None
+    is_cxx = re.match(r'^(.+)(\+\+)(-.+|)$', program)
+    return 'c++' if is_cxx else 'c'