bpo-36876: Re-organize the c-analyzer tool code. (gh-16841)
This is partly a cleanup of the code. It also is preparation for getting the variables from the source (cross-platform) rather than from the symbols.
The change only touches the tool (and its tests).
diff --git a/Tools/c-analyzer/cpython/README b/Tools/c-analyzer/cpython/README
new file mode 100644
index 0000000..772b8be
--- /dev/null
+++ b/Tools/c-analyzer/cpython/README
@@ -0,0 +1,72 @@
+#######################################
+# C Globals and CPython Runtime State.
+
+CPython's C code makes extensive use of global variables (whether static
+globals or static locals). Each such variable falls into one of several
+categories:
+
+* strictly const data
+* used exclusively in main or in the REPL
+* process-global state (e.g. managing process-level resources
+ like signals and file descriptors)
+* Python "global" runtime state
+* per-interpreter runtime state
+
+The last one can be a problem as soon as anyone creates a second
+interpreter (AKA "subinterpreter") in a process. It is definitely a
+problem under subinterpreters if they are no longer sharing the GIL,
+since the GIL protects us from a lot of race conditions. Keep in mind
+that ultimately *all* objects (PyObject) should be treated as
+per-interpreter state. This includes "static types", freelists,
+_PyIdentifier, and singletons. Take that in for a second. It has
+significant implications on where we use static variables!
+
+Be aware that module-global state (stored in C statics) is a kind of
+per-interpreter state. There have been efforts across many years, and
+still going, to provide extension module authors mechanisms to store
+that state safely (see PEPs 3121, 489, etc.).
+
+(Note that there has been discussion around support for running multiple
+Python runtimes in the same process. That would end up with the same
+problems, relative to static variables, that subinterpreters have.)
+
+Historically we have been bad at keeping per-interpreter state out of
+static variables, mostly because until recently subinterpreters were
+not widely used nor even factored in to solutions. However, the
+feature is growing in popularity and use in the community.
+
+Mandate: "Eliminate use of static variables for per-interpreter state."
+
+The "c-statics.py" script in this directory, along with its accompanying
+data files, is part of the effort to resolve existing problems with
+our use of static variables and to prevent future problems.
+
+#-------------------------
+## statics for actually-global state (and runtime state consolidation)
+
+In general, holding any kind of state in static variables
+increases maintenance burden and increases the complexity of code (e.g.
+we use TSS to identify the active thread state). So it is a good idea
+to avoid using statics for state even if for the "global" runtime or
+for process-global state.
+
+Relative to maintenance burden, one problem is where the runtime
+state is spread throughout the codebase in dozens of individual
+globals. Unlike the other globals, the runtime state represents a set
+of values that are constantly shifting in a complex way. When they are
+spread out it's harder to get a clear picture of what the runtime
+involves. Furthermore, when they are spread out it complicates efforts
+that change the runtime.
+
+Consequently, the globals for Python's runtime state have been
+consolidated under a single top-level _PyRuntime global. No new globals
+should be added for runtime state. Instead, they should be added to
+_PyRuntimeState or one of its sub-structs. The tools in this directory
+are run as part of the test suite to ensure that no new globals have
+been added. The script can be run manually as well:
+
+ ./python Lib/test/test_c_statics/c-statics.py check
+
+If it reports any globals then they should be resolved. If the globals
+are runtime state then they should be folded into _PyRuntimeState.
+Otherwise they should be marked as ignored.
diff --git a/Tools/c-analyzer/cpython/__init__.py b/Tools/c-analyzer/cpython/__init__.py
new file mode 100644
index 0000000..ae45b42
--- /dev/null
+++ b/Tools/c-analyzer/cpython/__init__.py
@@ -0,0 +1,29 @@
+import os.path
+import sys
+
+
+TOOL_ROOT = os.path.abspath(
+ os.path.dirname( # c-analyzer/
+ os.path.dirname(__file__))) # cpython/
+DATA_DIR = TOOL_ROOT
+REPO_ROOT = (
+ os.path.dirname( # ..
+ os.path.dirname(TOOL_ROOT))) # Tools/
+
+INCLUDE_DIRS = [os.path.join(REPO_ROOT, name) for name in [
+ 'Include',
+ ]]
+SOURCE_DIRS = [os.path.join(REPO_ROOT, name) for name in [
+ 'Python',
+ 'Parser',
+ 'Objects',
+ 'Modules',
+ ]]
+
+#PYTHON = os.path.join(REPO_ROOT, 'python')
+PYTHON = sys.executable
+
+
+# Clean up the namespace.
+del sys
+del os
diff --git a/Tools/c-analyzer/cpython/__main__.py b/Tools/c-analyzer/cpython/__main__.py
new file mode 100644
index 0000000..6b0f9bc
--- /dev/null
+++ b/Tools/c-analyzer/cpython/__main__.py
@@ -0,0 +1,212 @@
+import argparse
+import re
+import sys
+
+from c_analyzer.common import show
+from c_analyzer.common.info import UNKNOWN
+
+from . import SOURCE_DIRS
+from .find import supported_vars
+from .known import (
+ from_file as known_from_file,
+ DATA_FILE as KNOWN_FILE,
+ )
+from .supported import IGNORED_FILE
+
+
+def _check_results(unknown, knownvars, used):
+ def _match_unused_global(variable):
+ found = []
+ for varid in knownvars:
+ if varid in used:
+ continue
+ if varid.funcname is not None:
+ continue
+ if varid.name != variable.name:
+ continue
+ if variable.filename and variable.filename != UNKNOWN:
+ if variable.filename == varid.filename:
+ found.append(varid)
+ else:
+ found.append(varid)
+ return found
+
+ badknown = set()
+ for variable in sorted(unknown):
+ msg = None
+ if variable.funcname != UNKNOWN:
+ msg = f'could not find global symbol {variable.id}'
+ elif m := _match_unused_global(variable):
+ assert isinstance(m, list)
+ badknown.update(m)
+ elif variable.name in ('completed', 'id'): # XXX Figure out where these variables are.
+ unknown.remove(variable)
+ else:
+ msg = f'could not find local symbol {variable.id}'
+ if msg:
+ #raise Exception(msg)
+ print(msg)
+ if badknown:
+ print('---')
+ print(f'{len(badknown)} globals in known.tsv, but may actually be local:')
+ for varid in sorted(badknown):
+ print(f'{varid.filename:30} {varid.name}')
+ unused = sorted(varid
+ for varid in set(knownvars) - used
+ if varid.name != 'id') # XXX Figure out where these variables are.
+ if unused:
+ print('---')
+ print(f'did not use {len(unused)} known vars:')
+ for varid in unused:
+ print(f'{varid.filename:30} {varid.funcname or "-":20} {varid.name}')
+ raise Exception('not all known symbols used')
+ if unknown:
+ print('---')
+ raise Exception('could not find all symbols')
+
+
+# XXX Move this check to its own command.
+def cmd_check_cache(cmd, *,
+ known=KNOWN_FILE,
+ ignored=IGNORED_FILE,
+ _known_from_file=known_from_file,
+ _find=supported_vars,
+ ):
+ known = _known_from_file(known)
+
+ used = set()
+ unknown = set()
+ for var, supported in _find(known=known, ignored=ignored):
+ if supported is None:
+ unknown.add(var)
+ continue
+ used.add(var.id)
+ _check_results(unknown, known['variables'], used)
+
+
+def cmd_check(cmd, *,
+              known=KNOWN_FILE,
+              ignored=IGNORED_FILE,
+              _find=supported_vars,
+              _show=show.basic,
+              _print=print,
+              ):
+    """
+    Fail if there are unsupported global variables.
+
+    Every variable yielded by _find() whose "supported" flag is falsy
+    (False or None) counts as unsupported.  If there are any then they
+    are printed out (sorted) with a total and the process exits with
+    status 1.  Otherwise nothing is printed.
+    """
+    unsupported = [
+        var
+        for var, supported in _find(known=known, ignored=ignored)
+        if not supported
+    ]
+    if unsupported:
+        _print('ERROR: found unsupported global variables')
+        _print()
+        _show(sorted(unsupported))
+        _print(f' ({len(unsupported)} total)')
+        sys.exit(1)
+    # Nothing to report: return quietly.
+
+
+def cmd_show(cmd, *,
+ known=KNOWN_FILE,
+ ignored=IGNORED_FILE,
+ skip_objects=False,
+ _find=supported_vars,
+ _show=show.basic,
+ _print=print,
+ ):
+ """
+ Print out the list of found global variables.
+
+ The variables will be distinguished as "supported" or "unsupported".
+ """
+ allsupported = []
+ allunsupported = []
+ for found, supported in _find(known=known,
+ ignored=ignored,
+ skip_objects=skip_objects,
+ ):
+ if supported is None:
+ continue
+ (allsupported if supported else allunsupported
+ ).append(found)
+
+ _print('supported:')
+ _print('----------')
+ _show(sorted(allsupported))
+ _print(f' ({len(allsupported)} total)')
+ _print()
+ _print('unsupported:')
+ _print('------------')
+ _show(sorted(allunsupported))
+ _print(f' ({len(allunsupported)} total)')
+
+
+#############################
+# the script
+
+COMMANDS = {
+ 'check': cmd_check,
+ 'show': cmd_show,
+ }
+
+PROG = sys.argv[0]
+PROG = 'c-globals.py'
+
+
+def parse_args(prog=PROG, argv=sys.argv[1:], *, _fail=None):
+ common = argparse.ArgumentParser(add_help=False)
+ common.add_argument('--ignored', metavar='FILE',
+ default=IGNORED_FILE,
+ help='path to file that lists ignored vars')
+ common.add_argument('--known', metavar='FILE',
+ default=KNOWN_FILE,
+ help='path to file that lists known types')
+ #common.add_argument('dirs', metavar='DIR', nargs='*',
+ # default=SOURCE_DIRS,
+ # help='a directory to check')
+
+ parser = argparse.ArgumentParser(
+ prog=prog,
+ )
+ subs = parser.add_subparsers(dest='cmd')
+
+ check = subs.add_parser('check', parents=[common])
+
+ show = subs.add_parser('show', parents=[common])
+ show.add_argument('--skip-objects', action='store_true')
+
+ if _fail is None:
+ def _fail(msg):
+ parser.error(msg)
+
+ # Now parse the args.
+ args = parser.parse_args(argv)
+ ns = vars(args)
+
+ cmd = ns.pop('cmd')
+ if not cmd:
+ _fail('missing command')
+
+ return cmd, ns
+
+
+def main(cmd, cmdkwargs=None, *, _COMMANDS=COMMANDS):
+    """Look up the handler for "cmd" and call it with "cmdkwargs"."""
+    try:
+        handler = _COMMANDS[cmd]
+    except KeyError:
+        problem = f'unsupported cmd {cmd!r}' if cmd else 'missing cmd'
+        raise ValueError(problem)
+    handler(cmd, **(cmdkwargs or {}))
+
+
+if __name__ == '__main__':
+ cmd, cmdkwargs = parse_args()
+ main(cmd, cmdkwargs)
diff --git a/Tools/c-analyzer/cpython/_generate.py b/Tools/c-analyzer/cpython/_generate.py
new file mode 100644
index 0000000..4c340ac
--- /dev/null
+++ b/Tools/c-analyzer/cpython/_generate.py
@@ -0,0 +1,329 @@
+# The code here consists of hacks for pre-populating the known.tsv file.
+
+from c_analyzer.parser.preprocessor import _iter_clean_lines
+from c_analyzer.parser.naive import (
+ iter_variables, parse_variable_declaration, find_variables,
+ )
+from c_analyzer.common.known import HEADER as KNOWN_HEADER
+from c_analyzer.common.info import UNKNOWN, ID
+from c_analyzer.variables import Variable
+from c_analyzer.util import write_tsv
+
+from . import SOURCE_DIRS, REPO_ROOT
+from .known import DATA_FILE as KNOWN_FILE
+from .files import iter_cpython_files
+
+
+POTS = ('char ', 'wchar_t ', 'int ', 'Py_ssize_t ')
+POTS += tuple('const ' + v for v in POTS)
+STRUCTS = ('PyTypeObject', 'PyObject', 'PyMethodDef', 'PyModuleDef', 'grammar')
+
+
+def _parse_global(line, funcname=None):
+    """Return (name, decl) for the variable declared on the line, if any.
+
+    (None, None) means no declaration was recognized.  The file-specific
+    macro lines instead return a list of (name, funcname, decl) tuples.
+    """
+    line = line.strip()
+    if line.startswith('static '):
+        if '(' in line and '[' not in line and ' = ' not in line:
+            return None, None
+        name, decl = parse_variable_declaration(line)
+    elif line.startswith(('Py_LOCAL(', 'Py_LOCAL_INLINE(')):
+        name, decl = parse_variable_declaration(line)
+    elif line.startswith('_Py_static_string('):
+        decl = line.strip(';').strip()
+        name = line.split('(')[1].split(',')[0].strip()
+    elif line.startswith('_Py_IDENTIFIER('):
+        decl = line.strip(';').strip()
+        name = 'PyId_' + line.split('(')[1].split(')')[0].strip()
+    elif funcname:
+        return None, None
+    # global-only
+    elif line.startswith('PyAPI_DATA('):  # only in .h files
+        name, decl = parse_variable_declaration(line)
+    elif line.startswith('extern '):  # only in .h files
+        name, decl = parse_variable_declaration(line)
+    elif line.startswith('PyDoc_VAR('):
+        decl = line.strip(';').strip()
+        name = line.split('(')[1].split(')')[0].strip()
+    elif line.startswith(POTS):  # implied static
+        if '(' in line and '[' not in line and ' = ' not in line:
+            return None, None
+        name, decl = parse_variable_declaration(line)
+    elif line.startswith(STRUCTS) and line.endswith((' = {', ' = NULL;')):  # implied static
+        name, decl = parse_variable_declaration(line)
+    elif line.startswith('struct '):
+        if not line.endswith(' = {'):
+            return None, None
+        if not line.partition(' ')[2].startswith(STRUCTS):
+            return None, None
+        # implied static
+        name, decl = parse_variable_declaration(line)
+    # file-specific
+    elif line.startswith(('SLOT1BINFULL(', 'SLOT1BIN(')):
+        # Objects/typeobject.c
+        funcname = line.split('(')[1].split(',')[0]
+        return [
+            ('op_id', funcname, '_Py_static_string(op_id, OPSTR)'),
+            ('rop_id', funcname, '_Py_static_string(rop_id, ROPSTR)'),
+            ]
+    elif line.startswith('WRAP_METHOD('):
+        # Objects/weakrefobject.c
+        funcname, name = (v.strip() for v in line.split('(')[1].split(')')[0].split(','))
+        return [
+            ('PyId_' + name, funcname, f'_Py_IDENTIFIER({name})'),
+            ]
+    else:
+        return None, None
+    return name, decl
+
+
+def _pop_cached(varcache, filename, funcname, name, *,
+ _iter_variables=iter_variables,
+ ):
+ # Look for the file.
+ try:
+ cached = varcache[filename]
+ except KeyError:
+ cached = varcache[filename] = {}
+ for variable in _iter_variables(filename,
+ parse_variable=_parse_global,
+ ):
+ variable._isglobal = True
+ cached[variable.id] = variable
+ for var in cached:
+ print(' ', var)
+
+ # Look for the variable.
+ if funcname == UNKNOWN:
+ for varid in cached:
+ if varid.name == name:
+ break
+ else:
+ return None
+ return cached.pop(varid)
+ else:
+ return cached.pop((filename, funcname, name), None)
+
+
+def find_matching_variable(varid, varcache, allfilenames, *,
+ _pop_cached=_pop_cached,
+ ):
+ if varid.filename and varid.filename != UNKNOWN:
+ filenames = [varid.filename]
+ else:
+ filenames = allfilenames
+ for filename in filenames:
+ variable = _pop_cached(varcache, filename, varid.funcname, varid.name)
+ if variable is not None:
+ return variable
+ else:
+ if varid.filename and varid.filename != UNKNOWN and varid.funcname is None:
+ for filename in allfilenames:
+ if not filename.endswith('.h'):
+ continue
+ variable = _pop_cached(varcache, filename, None, varid.name)
+ if variable is not None:
+ return variable
+ return None
+
+
+MULTILINE = {
+ # Python/Python-ast.c
+ 'Load_singleton': 'PyObject *',
+ 'Store_singleton': 'PyObject *',
+ 'Del_singleton': 'PyObject *',
+ 'AugLoad_singleton': 'PyObject *',
+ 'AugStore_singleton': 'PyObject *',
+ 'Param_singleton': 'PyObject *',
+ 'And_singleton': 'PyObject *',
+ 'Or_singleton': 'PyObject *',
+ 'Add_singleton': 'static PyObject *',
+ 'Sub_singleton': 'static PyObject *',
+ 'Mult_singleton': 'static PyObject *',
+ 'MatMult_singleton': 'static PyObject *',
+ 'Div_singleton': 'static PyObject *',
+ 'Mod_singleton': 'static PyObject *',
+ 'Pow_singleton': 'static PyObject *',
+ 'LShift_singleton': 'static PyObject *',
+ 'RShift_singleton': 'static PyObject *',
+ 'BitOr_singleton': 'static PyObject *',
+ 'BitXor_singleton': 'static PyObject *',
+ 'BitAnd_singleton': 'static PyObject *',
+ 'FloorDiv_singleton': 'static PyObject *',
+ 'Invert_singleton': 'static PyObject *',
+ 'Not_singleton': 'static PyObject *',
+ 'UAdd_singleton': 'static PyObject *',
+ 'USub_singleton': 'static PyObject *',
+ 'Eq_singleton': 'static PyObject *',
+ 'NotEq_singleton': 'static PyObject *',
+ 'Lt_singleton': 'static PyObject *',
+ 'LtE_singleton': 'static PyObject *',
+ 'Gt_singleton': 'static PyObject *',
+ 'GtE_singleton': 'static PyObject *',
+ 'Is_singleton': 'static PyObject *',
+ 'IsNot_singleton': 'static PyObject *',
+ 'In_singleton': 'static PyObject *',
+ 'NotIn_singleton': 'static PyObject *',
+ # Python/symtable.c
+ 'top': 'static identifier ',
+ 'lambda': 'static identifier ',
+ 'genexpr': 'static identifier ',
+ 'listcomp': 'static identifier ',
+ 'setcomp': 'static identifier ',
+ 'dictcomp': 'static identifier ',
+ '__class__': 'static identifier ',
+ # Python/compile.c
+ '__doc__': 'static PyObject *',
+ '__annotations__': 'static PyObject *',
+ # Objects/floatobject.c
+ 'double_format': 'static float_format_type ',
+ 'float_format': 'static float_format_type ',
+ 'detected_double_format': 'static float_format_type ',
+ 'detected_float_format': 'static float_format_type ',
+ # Parser/listnode.c
+ 'level': 'static int ',
+ 'atbol': 'static int ',
+ # Python/dtoa.c
+ 'private_mem': 'static double private_mem[PRIVATE_mem]',
+ 'pmem_next': 'static double *',
+ # Modules/_weakref.c
+ 'weakref_functions': 'static PyMethodDef ',
+}
+INLINE = {
+ # Modules/_tracemalloc.c
+ 'allocators': 'static struct { PyMemAllocatorEx mem; PyMemAllocatorEx raw; PyMemAllocatorEx obj; } ',
+ # Modules/faulthandler.c
+ 'fatal_error': 'static struct { int enabled; PyObject *file; int fd; int all_threads; PyInterpreterState *interp; void *exc_handler; } ',
+ 'thread': 'static struct { PyObject *file; int fd; PY_TIMEOUT_T timeout_us; int repeat; PyInterpreterState *interp; int exit; char *header; size_t header_len; PyThread_type_lock cancel_event; PyThread_type_lock running; } ',
+ # Modules/signalmodule.c
+ 'Handlers': 'static volatile struct { _Py_atomic_int tripped; PyObject *func; } Handlers[NSIG]',
+ 'wakeup': 'static volatile struct { SOCKET_T fd; int warn_on_full_buffer; int use_send; } ',
+ # Python/dynload_shlib.c
+ 'handles': 'static struct { dev_t dev; ino_t ino; void *handle; } handles[128]',
+ # Objects/obmalloc.c
+ '_PyMem_Debug': 'static struct { debug_alloc_api_t raw; debug_alloc_api_t mem; debug_alloc_api_t obj; } ',
+ # Python/bootstrap_hash.c
+ 'urandom_cache': 'static struct { int fd; dev_t st_dev; ino_t st_ino; } ',
+ }
+FUNC = {
+ # Objects/object.c
+ '_Py_abstract_hack': 'Py_ssize_t (*_Py_abstract_hack)(PyObject *)',
+ # Parser/myreadline.c
+ 'PyOS_InputHook': 'int (*PyOS_InputHook)(void)',
+ # Python/pylifecycle.c
+ '_PyOS_mystrnicmp_hack': 'int (*_PyOS_mystrnicmp_hack)(const char *, const char *, Py_ssize_t)',
+ # Parser/myreadline.c
+ 'PyOS_ReadlineFunctionPointer': 'char *(*PyOS_ReadlineFunctionPointer)(FILE *, FILE *, const char *)',
+ }
+IMPLIED = {
+ # Objects/boolobject.c
+ '_Py_FalseStruct': 'static struct _longobject ',
+ '_Py_TrueStruct': 'static struct _longobject ',
+ # Modules/config.c
+ '_PyImport_Inittab': 'struct _inittab _PyImport_Inittab[]',
+ }
+GLOBALS = {}
+GLOBALS.update(MULTILINE)
+GLOBALS.update(INLINE)
+GLOBALS.update(FUNC)
+GLOBALS.update(IMPLIED)
+
+LOCALS = {
+ 'buildinfo': ('Modules/getbuildinfo.c',
+ 'Py_GetBuildInfo',
+ 'static char buildinfo[50 + sizeof(GITVERSION) + ((sizeof(GITTAG) > sizeof(GITBRANCH)) ? sizeof(GITTAG) : sizeof(GITBRANCH))]'),
+ 'methods': ('Python/codecs.c',
+ '_PyCodecRegistry_Init',
+ 'static struct { char *name; PyMethodDef def; } methods[]'),
+ }
+
+
+def _known(symbol):
+ if symbol.funcname:
+ if symbol.funcname != UNKNOWN or symbol.filename != UNKNOWN:
+ raise KeyError(symbol.name)
+ filename, funcname, decl = LOCALS[symbol.name]
+ varid = ID(filename, funcname, symbol.name)
+ elif not symbol.filename or symbol.filename == UNKNOWN:
+ raise KeyError(symbol.name)
+ else:
+ varid = symbol.id
+ try:
+ decl = GLOBALS[symbol.name]
+ except KeyError:
+
+ if symbol.name.endswith('_methods'):
+ decl = 'static PyMethodDef '
+ elif symbol.filename == 'Objects/exceptions.c' and symbol.name.startswith(('PyExc_', '_PyExc_')):
+ decl = 'static PyTypeObject '
+ else:
+ raise
+ if symbol.name not in decl:
+ decl = decl + symbol.name
+ return Variable(varid, 'static', decl)
+
+
+def known_row(varid, decl):
+ return (
+ varid.filename,
+ varid.funcname or '-',
+ varid.name,
+ 'variable',
+ decl,
+ )
+
+
+def known_rows(symbols, *,
+ cached=True,
+ _get_filenames=iter_cpython_files,
+ _find_match=find_matching_variable,
+ _find_symbols=find_variables,
+ _as_known=known_row,
+ ):
+ filenames = list(_get_filenames())
+ cache = {}
+ if cached:
+ for symbol in symbols:
+ try:
+ found = _known(symbol)
+ except KeyError:
+ found = _find_match(symbol, cache, filenames)
+ if found is None:
+ found = Variable(symbol.id, UNKNOWN, UNKNOWN)
+ yield _as_known(found.id, found.vartype)
+ else:
+ raise NotImplementedError # XXX incorporate KNOWN
+ for variable in _find_symbols(symbols, filenames,
+ srccache=cache,
+ parse_variable=_parse_global,
+ ):
+ #variable = variable._replace(
+ # filename=os.path.relpath(variable.filename, REPO_ROOT))
+ if variable.funcname == UNKNOWN:
+ print(variable)
+ if variable.vartype== UNKNOWN:
+ print(variable)
+ yield _as_known(variable.id, variable.vartype)
+
+
+def generate(symbols, filename=None, *,
+ _generate_rows=known_rows,
+ _write_tsv=write_tsv,
+ ):
+ if not filename:
+ filename = KNOWN_FILE + '.new'
+
+ rows = _generate_rows(symbols)
+ _write_tsv(filename, KNOWN_HEADER, rows)
+
+
+if __name__ == '__main__':
+ from c_symbols import binary
+ symbols = binary.iter_symbols(
+ binary.PYTHON,
+ find_local_symbol=None,
+ )
+ generate(symbols)
diff --git a/Tools/c-analyzer/cpython/files.py b/Tools/c-analyzer/cpython/files.py
new file mode 100644
index 0000000..543097a
--- /dev/null
+++ b/Tools/c-analyzer/cpython/files.py
@@ -0,0 +1,29 @@
+from c_analyzer.common.files import (
+ C_SOURCE_SUFFIXES, walk_tree, iter_files_by_suffix,
+ )
+
+from . import SOURCE_DIRS, REPO_ROOT
+
+# XXX need tests:
+# * iter_files()
+
+
+def iter_files(*,
+               walk=walk_tree,
+               _files=iter_files_by_suffix,
+               ):
+    """Yield each C source file found under the CPython source dirs.
+
+    The files come from _files(SOURCE_DIRS, C_SOURCE_SUFFIXES, REPO_ROOT),
+    minus any whose name starts with an excluded tree prefix.
+    """
+    # This module never imports "os", so bring it into scope here.
+    import os.path
+    excludedtrees = (
+        os.path.join('Include', 'cpython', ''),
+        )
+    for filename in _files(SOURCE_DIRS, C_SOURCE_SUFFIXES, REPO_ROOT,
+                           walk=walk,
+                           ):
+        if not filename.startswith(excludedtrees):
+            yield filename
diff --git a/Tools/c-analyzer/cpython/find.py b/Tools/c-analyzer/cpython/find.py
new file mode 100644
index 0000000..a7bc0b4
--- /dev/null
+++ b/Tools/c-analyzer/cpython/find.py
@@ -0,0 +1,101 @@
+import os.path
+
+from c_analyzer.common import files
+from c_analyzer.common.info import UNKNOWN, ID
+from c_analyzer.variables import find as _common
+
+from . import SOURCE_DIRS, PYTHON, REPO_ROOT
+from .known import (
+ from_file as known_from_file,
+ DATA_FILE as KNOWN_FILE,
+ )
+from .supported import (
+ ignored_from_file, IGNORED_FILE, is_supported, _is_object,
+ )
+
+# XXX need tests:
+# * vars_from_binary()
+# * vars_from_source()
+# * supported_vars()
+
+
+def _handle_id(filename, funcname, name, *,
+ _relpath=os.path.relpath,
+ ):
+ filename = _relpath(filename, REPO_ROOT)
+ return ID(filename, funcname, name)
+
+
+def vars_from_binary(*,
+ known=KNOWN_FILE,
+ _known_from_file=known_from_file,
+ _iter_files=files.iter_files_by_suffix,
+ _iter_vars=_common.vars_from_binary,
+ ):
+ """Yield a Variable for each found Symbol.
+
+ Details are filled in from the given "known" variables and types.
+ """
+ if isinstance(known, str):
+ known = _known_from_file(known)
+ dirnames = SOURCE_DIRS
+ suffixes = ('.c',)
+ filenames = _iter_files(dirnames, suffixes)
+ # XXX For now we only use known variables (no source lookup).
+ filenames = None
+ yield from _iter_vars(PYTHON,
+ known=known,
+ filenames=filenames,
+ handle_id=_handle_id,
+ check_filename=(lambda n: True),
+ )
+
+
+def vars_from_source(*,
+ preprocessed=None,
+ known=KNOWN_FILE,
+ _known_from_file=known_from_file,
+ _iter_files=files.iter_files_by_suffix,
+ _iter_vars=_common.vars_from_source,
+ ):
+ """Yield a Variable for each declaration in the raw source code.
+
+ Details are filled in from the given "known" variables and types.
+ """
+ if isinstance(known, str):
+ known = _known_from_file(known)
+ dirnames = SOURCE_DIRS
+ suffixes = ('.c',)
+ filenames = _iter_files(dirnames, suffixes)
+ yield from _iter_vars(filenames,
+ preprocessed=preprocessed,
+ known=known,
+ handle_id=_handle_id,
+ )
+
+
+def supported_vars(*,
+                   known=KNOWN_FILE,
+                   ignored=IGNORED_FILE,
+                   skip_objects=False,
+                   _known_from_file=known_from_file,
+                   _ignored_from_file=ignored_from_file,
+                   _iter_vars=vars_from_binary,
+                   _is_supported=is_supported,
+                   ):
+    """Yield (var, is supported) for each found global variable.
+
+    The flag is None when the variable's type is UNKNOWN.  If
+    "skip_objects" is true, vars matched by _is_object() are omitted."""
+    if isinstance(known, str):
+        known = _known_from_file(known)
+    if isinstance(ignored, str):
+        ignored = _ignored_from_file(ignored)
+    for var in _iter_vars(known=known):
+        if not var.isglobal:
+            continue
+        if var.vartype == UNKNOWN:
+            yield var, None
+        # XXX Support proper filters instead.
+        elif not (skip_objects and _is_object(var.vartype)):
+            yield var, _is_supported(var, ignored, known)
diff --git a/Tools/c-analyzer/cpython/known.py b/Tools/c-analyzer/cpython/known.py
new file mode 100644
index 0000000..c3cc2c0
--- /dev/null
+++ b/Tools/c-analyzer/cpython/known.py
@@ -0,0 +1,66 @@
+import csv
+import os.path
+
+from c_analyzer.parser.declarations import extract_storage
+from c_analyzer.variables import known as _common
+from c_analyzer.variables.info import Variable
+
+from . import DATA_DIR
+
+
+# XXX need tests:
+# * from_file()
+# * look_up_variable()
+
+
+DATA_FILE = os.path.join(DATA_DIR, 'known.tsv')
+
+
+def _get_storage(decl, infunc):
+    """Return the storage class for the given declaration.
+
+    The macro forms checked here are classified directly as "static"
+    or "extern"; anything else is handed to extract_storage().
+    """
+    static_macros = (
+        'Py_LOCAL(', 'Py_LOCAL_INLINE(',
+        '_Py_IDENTIFIER(', '_Py_static_string(',
+        'PyDoc_VAR(', 'WRAP_METHOD(',
+        'SLOT1BINFULL(', 'SLOT1BIN(',
+        )
+    if decl.startswith(static_macros):
+        return 'static'
+    if decl.startswith('PyAPI_DATA('):  # public extern
+        return 'extern'
+    return extract_storage(decl, infunc=infunc)
+
+
+def _handle_var(varid, decl):
+# if varid.name == 'id' and decl == UNKNOWN:
+# # None of these are variables.
+# decl = 'int id';
+ storage = _get_storage(decl, varid.funcname)
+ return Variable(varid, storage, decl)
+
+
+def from_file(infile=DATA_FILE, *,
+ _from_file=_common.from_file,
+ _handle_var=_handle_var,
+ ):
+ """Return the info for known declarations in the given file."""
+ return _from_file(infile, handle_var=_handle_var)
+
+
+def look_up_variable(varid, knownvars, *,
+                     _lookup=_common.look_up_variable,
+                     ):
+    """Return the known variable matching the given ID.
+
+    "knownvars" is a mapping of ID to Variable.
+
+    The lookup itself is delegated to "_lookup", which is called
+    with only the ID and the mapping.
+
+    If no match is found then None is returned.
+    """
+    return _lookup(varid, knownvars)
diff --git a/Tools/c-analyzer/cpython/supported.py b/Tools/c-analyzer/cpython/supported.py
new file mode 100644
index 0000000..18786ee
--- /dev/null
+++ b/Tools/c-analyzer/cpython/supported.py
@@ -0,0 +1,398 @@
+import os.path
+import re
+
+from c_analyzer.common.info import ID
+from c_analyzer.common.util import read_tsv, write_tsv
+
+from . import DATA_DIR
+
+# XXX need tests:
+# * generate / script
+
+
+# Path of the TSV file listing the ignored items. By default
+# _generate_ignored_file() writes to this path with '.new' appended.
+IGNORED_FILE = os.path.join(DATA_DIR, 'ignored.tsv')
+
+# Column layout of the ignored file; the header row is the column names
+# joined by tabs.
+IGNORED_COLUMNS = ('filename', 'funcname', 'name', 'kind', 'reason')
+IGNORED_HEADER = '\t'.join(IGNORED_COLUMNS)
+
+# XXX Move these to ignored.tsv.
+# Maps a variable name to the reason it is ignored. _is_ignored() only
+# consults this table for variables whose funcname is None, and only by
+# name (the filename is not part of the key).
+IGNORED = {
+ # global
+ 'PyImport_FrozenModules': 'process-global',
+ 'M___hello__': 'process-global',
+ 'inittab_copy': 'process-global',
+ 'PyHash_Func': 'process-global',
+ '_Py_HashSecret_Initialized': 'process-global',
+ '_TARGET_LOCALES': 'process-global',
+
+ # startup (only changed before/during)
+ '_PyRuntime': 'runtime startup',
+ 'runtime_initialized': 'runtime startup',
+ 'static_arg_parsers': 'runtime startup',
+ 'orig_argv': 'runtime startup',
+ 'opt_ptr': 'runtime startup',
+ '_preinit_warnoptions': 'runtime startup',
+ '_Py_StandardStreamEncoding': 'runtime startup',
+ 'Py_FileSystemDefaultEncoding': 'runtime startup',
+ '_Py_StandardStreamErrors': 'runtime startup',
+ 'Py_FileSystemDefaultEncodeErrors': 'runtime startup',
+ 'Py_BytesWarningFlag': 'runtime startup',
+ 'Py_DebugFlag': 'runtime startup',
+ 'Py_DontWriteBytecodeFlag': 'runtime startup',
+ 'Py_FrozenFlag': 'runtime startup',
+ 'Py_HashRandomizationFlag': 'runtime startup',
+ 'Py_IgnoreEnvironmentFlag': 'runtime startup',
+ 'Py_InspectFlag': 'runtime startup',
+ 'Py_InteractiveFlag': 'runtime startup',
+ 'Py_IsolatedFlag': 'runtime startup',
+ 'Py_NoSiteFlag': 'runtime startup',
+ 'Py_NoUserSiteDirectory': 'runtime startup',
+ 'Py_OptimizeFlag': 'runtime startup',
+ 'Py_QuietFlag': 'runtime startup',
+ 'Py_UTF8Mode': 'runtime startup',
+ 'Py_UnbufferedStdioFlag': 'runtime startup',
+ 'Py_VerboseFlag': 'runtime startup',
+ '_Py_path_config': 'runtime startup',
+ '_PyOS_optarg': 'runtime startup',
+ '_PyOS_opterr': 'runtime startup',
+ '_PyOS_optind': 'runtime startup',
+ '_Py_HashSecret': 'runtime startup',
+
+ # REPL
+ '_PyOS_ReadlineLock': 'repl',
+ '_PyOS_ReadlineTState': 'repl',
+
+ # effectively const
+ 'tracemalloc_empty_traceback': 'const',
+ '_empty_bitmap_node': 'const',
+ 'posix_constants_pathconf': 'const',
+ 'posix_constants_confstr': 'const',
+ 'posix_constants_sysconf': 'const',
+ '_PySys_ImplCacheTag': 'const',
+ '_PySys_ImplName': 'const',
+ 'PyImport_Inittab': 'const',
+ '_PyImport_DynLoadFiletab': 'const',
+ '_PyParser_Grammar': 'const',
+ 'Py_hexdigits': 'const',
+ '_PyImport_Inittab': 'const',
+ '_PyByteArray_empty_string': 'const',
+ '_PyLong_DigitValue': 'const',
+ '_Py_SwappedOp': 'const',
+ 'PyStructSequence_UnnamedField': 'const',
+
+ # signals are main-thread only
+ 'faulthandler_handlers': 'signals are main-thread only',
+ 'user_signals': 'signals are main-thread only',
+ 'wakeup': 'signals are main-thread only',
+
+ # hacks
+ '_PySet_Dummy': 'only used as a placeholder',
+ }
+
+# Shared reason string returned by _is_ignored() for variables where a
+# race is tolerated.
+BENIGN = 'races here are benign and unlikely'
+
+
+def is_supported(variable, ignored=None, known=None, *,
+                 _ignored=(lambda *a, **k: _is_ignored(*a, **k)),
+                 _vartype_okay=(lambda *a, **k: _is_vartype_okay(*a, **k)),
+                 ):
+    """Return True if the given global variable is okay in CPython.
+
+    "ignored" is an optional mapping such as the one returned by
+    ignored_from_file().  Its 'variables' and 'types' entries (if any)
+    are passed to the respective checks.  It may be None.
+
+    "known" is currently unused.
+
+    The variable is supported if it is explicitly ignored or if its
+    vartype is one that is considered okay; otherwise False is
+    returned.
+    """
+    if _ignored(variable,
+                ignored and ignored.get('variables')):
+        return True
+    # Guard against ignored=None (the default) here as well.  Calling
+    # ignored.get() unconditionally raised AttributeError whenever the
+    # first check did not already succeed.
+    return bool(_vartype_okay(variable.vartype,
+                              ignored and ignored.get('types')))
+
+
+def _is_ignored(variable, ignoredvars=None, *,
+                _IGNORED=IGNORED,
+                ):
+    """Return the reason if the variable is a supported global.
+
+    Return None if the variable is not a supported global.
+
+    "ignoredvars" is an optional mapping of ID to reason.  A truthy
+    entry there is returned before any of the hard-coded rules below
+    are consulted.
+    """
+    # Explicitly ignored by ID.
+    if ignoredvars:
+        reason = ignoredvars.get(variable.id)
+        if reason:
+            return reason
+
+    # The name-only table applies just to variables with no funcname.
+    if variable.funcname is None:
+        reason = _IGNORED.get(variable.name)
+        if reason:
+            return reason
+
+    filename = variable.filename
+
+    # compiler
+    if filename == 'Python/graminit.c':
+        if variable.vartype.startswith('static state '):
+            return 'compiler'
+    elif filename == 'Python/symtable.c':
+        if variable.vartype.startswith('static identifier '):
+            return 'compiler'
+    elif filename == 'Python/Python-ast.c':
+        # These should be const.
+        if variable.name.endswith(('_field', '_attribute')):
+            return 'compiler'
+
+    # other: (filename, variable names, reason)
+    per_file = (
+        # guarded by lock?
+        ('Python/dtoa.c',
+         ('p5s', 'freelist', 'private_mem', 'pmem_next'),
+         'dtoa is thread-safe?'),
+        # Threads do not become an issue until after these have been set
+        # and these never get changed after that.
+        ('Python/thread.c',
+         ('initialized', 'thread_debug'),
+         'thread-safe'),
+        # Races are benign here, as well as unlikely.
+        ('Python/getversion.c',
+         ('version',),
+         BENIGN),
+        ('Python/fileutils.c',
+         ('force_ascii', 'ioctl_works', '_Py_open_cloexec_works'),
+         BENIGN),
+        ('Python/codecs.c',
+         ('ucnhash_CAPI',),
+         BENIGN),
+        ('Python/bootstrap_hash.c',
+         ('getrandom_works',),
+         BENIGN),
+        # bloom_linebreak is *mostly* benign.
+        ('Objects/unicodeobject.c',
+         ('ucnhash_CAPI', 'bloom_linebreak'),
+         BENIGN),
+        # The static is used for pre-allocation.
+        ('Modules/getbuildinfo.c',
+         ('buildinfo',),
+         BENIGN),
+        ('Modules/posixmodule.c',
+         ('ticks_per_second', 'dup3_works'),
+         BENIGN),
+        ('Modules/timemodule.c',
+         ('ticks_per_second',),
+         BENIGN),
+        ('Objects/longobject.c',
+         ('log_base_BASE', 'convwidth_base', 'convmultmax_base'),
+         BENIGN),
+    )
+    for fname, names, reason in per_file:
+        if filename == fname and variable.name in names:
+            return reason
+
+    return None
+
+
+def _is_vartype_okay(vartype, ignoredtypes=None):
+    """Return the reason the given vartype is okay, or None if it is not.
+
+    Anything recognized by _is_object() is never okay.  Otherwise the
+    vartype text is checked, in order, for const-ness, known
+    definition types, thread-safe types, startup-only types, and
+    function pointers.
+
+    "ignoredtypes" is currently unused.
+    """
+    if _is_object(vartype):
+        return None
+
+    if vartype.startswith(('static const ', 'const ')):
+        return 'const'
+
+    const_markers = (
+        # components for TypeObject definitions
+        'PyMethodDef', 'PyGetSetDef', 'PyMemberDef',
+        'PyNumberMethods', 'PySequenceMethods', 'PyMappingMethods',
+        'PyBufferProcs', 'PyAsyncMethods',
+        'slotdef', 'newfunc',
+        # structseq
+        'PyStructSequence_Desc', 'PyStructSequence_Field',
+        # other definitions
+        'PyModuleDef',
+    )
+    if any(marker in vartype for marker in const_markers):
+        return 'const'
+
+    # thread-safe
+    threadsafe_markers = ('_Py_atomic_int', 'pthread_condattr_t')
+    if any(marker in vartype for marker in threadsafe_markers):
+        return 'thread-safe'
+
+    # startup
+    if '_Py_PreInitEntry' in vartype:
+        return 'startup'
+
+    # global
+#    if 'PyMemAllocatorEx' in vartype:
+#        return True
+
+    # others
+#    if 'PyThread_type_lock' in vartype:
+#        return True
+
+    # XXX ???
+    #  _Py_tss_t
+    #  _Py_hashtable_t
+    #  stack_t
+    #  _PyUnicode_Name_CAPI
+
+    # functions
+    if '(' in vartype and '[' not in vartype:
+        return 'function pointer'
+
+    # XXX finish!
+    #  * allow const values?
+    #raise NotImplementedError
+    return None
+
+
+# Recognizes vartypes that are treated as Python objects by _is_object().
+# There are three alternatives:
+#  * "static identifier"
+#  * an optional "static " followed by any text that contains one of the
+#    listed object type names (ending at a word boundary)
+#  * a use of the _Py_IDENTIFIER() or _Py_static_string() macro
+# The pattern is compiled with re.VERBOSE, so whitespace and "#" comments
+# inside it are not part of the match. It is applied with .match(), i.e.
+# every alternative is tried at the start of the vartype.
+PYOBJECT_RE = re.compile(r'''
+ ^
+ (
+ # must start with "static "
+ static \s+
+ (
+ identifier
+ )
+ \b
+ ) |
+ (
+ # may start with "static "
+ ( static \s+ )?
+ (
+ .*
+ (
+ PyObject |
+ PyTypeObject |
+ _? Py \w+ Object |
+ _PyArg_Parser |
+ _Py_Identifier |
+ traceback_t |
+ PyAsyncGenASend |
+ _PyAsyncGenWrappedValue |
+ PyContext |
+ method_cache_entry
+ )
+ \b
+ ) |
+ (
+ (
+ _Py_IDENTIFIER |
+ _Py_static_string
+ )
+ [(]
+ )
+ )
+ ''', re.VERBOSE)
+
+
+def _is_object(vartype):
+    """Return True if the given vartype looks like a Python object.
+
+    A vartype counts as an object if it matches PYOBJECT_RE or ends
+    with the name of the True/False singleton structs.  Anything that
+    mentions PyDictKeysObject is explicitly excluded.
+    """
+    # Checked first: the name would otherwise match the generic
+    # "Py...Object" alternative of PYOBJECT_RE.
+    if 'PyDictKeysObject' in vartype:
+        return False
+
+    singletons = (' _Py_FalseStruct', ' _Py_TrueStruct')
+    if PYOBJECT_RE.match(vartype) or vartype.endswith(singletons):
+        return True
+
+    # XXX Add more?
+
+    #for part in vartype.split():
+    #    # XXX const is automatic True?
+    #    if part == 'PyObject' or part.startswith('PyObject['):
+    #        return True
+    return False
+
+
+def ignored_from_file(infile, *,
+                      _read_tsv=read_tsv,
+                      ):
+    """Return the ignored items listed in the given TSV file.
+
+    The result is a dict with a single 'variables' entry, which maps
+    each row's ID(filename, funcname, name) to its reason.  A funcname
+    that is empty or '-' is normalized to None.
+
+    The rows are read with "_read_tsv", which is given IGNORED_HEADER
+    as the expected header.
+
+    ValueError is raised for any row whose kind is not 'variable'.
+    """
+    ignored = {
+        'variables': {},
+        #'types': {},
+        #'constants': {},
+        #'macros': {},
+    }
+    for row in _read_tsv(infile, IGNORED_HEADER):
+        filename, funcname, name, kind, reason = row
+        if not funcname or funcname == '-':
+            funcname = None
+        # Named "varid" so that the builtin id() is not shadowed.
+        varid = ID(filename, funcname, name)
+        if kind == 'variable':
+            values = ignored['variables']
+        else:
+            raise ValueError(f'unsupported kind in row {row}')
+        values[varid] = reason
+    return ignored
+
+
+##################################
+# generate
+
+def _get_row(varid, reason):
+    """Return the ignored-file row for the given ID and reason.
+
+    The row is a tuple in IGNORED_COLUMNS order.  A false funcname is
+    written as '-', the kind is always 'variable', and the reason is
+    converted with str().
+    """
+    filename = varid.filename
+    funcname = varid.funcname
+    if not funcname:
+        funcname = '-'
+    return (filename, funcname, varid.name, 'variable', str(reason))
+
+
+def _get_rows(variables, ignored=None, *,
+              _as_row=_get_row,
+              _is_ignored=_is_ignored,
+              _vartype_okay=_is_vartype_okay,
+              ):
+    """Yield an ignored-file row for each variable that has a reason.
+
+    The reason comes from "_is_ignored" or, failing that, from
+    "_vartype_okay".  Variables without a reason are skipped.  Each
+    emitted variable is also printed, and the number of rows is
+    printed once the variables are exhausted.
+    """
+    total = 0
+    for variable in variables:
+        reason = (
+            _is_ignored(variable,
+                        ignored and ignored.get('variables'),
+                        )
+            or _vartype_okay(variable.vartype,
+                             ignored and ignored.get('types'))
+        )
+        if reason:
+            print(' ', variable, repr(reason))
+            yield _as_row(variable.id, reason)
+            total += 1
+    print(f'total: {total}')
+
+
+def _generate_ignored_file(variables, filename=None, *,
+                           _generate_rows=_get_rows,
+                           _write_tsv=write_tsv,
+                           ):
+    """Write the rows generated for the given variables to a TSV file.
+
+    If "filename" is not provided then IGNORED_FILE with '.new'
+    appended is used.  The file is written with IGNORED_HEADER as its
+    header.
+    """
+    outfile = filename or (IGNORED_FILE + '.new')
+    _write_tsv(outfile, IGNORED_HEADER, _generate_rows(variables))
+
+
+if __name__ == '__main__':
+ # Script entry point: load the known variables, collect the globals,
+ # and write the ones that have a reason to IGNORED_FILE + '.new'.
+ from cpython import SOURCE_DIRS
+ from cpython.known import (
+ from_file as known_from_file,
+ DATA_FILE as KNOWN_FILE,
+ )
+ # XXX This is wrong!
+ # NOTE(review): a relative import only works when this module is run
+ # as part of its package (e.g. with "-m"), not as a plain script.
+ # Whether find.globals_from_binary() exists cannot be confirmed from
+ # here -- verify against the find module.
+ from . import find
+ known = known_from_file(KNOWN_FILE)
+ # Tolerate a false result from known_from_file().
+ knownvars = (known or {}).get('variables')
+ variables = find.globals_from_binary(knownvars=knownvars,
+ dirnames=SOURCE_DIRS)
+
+ _generate_ignored_file(variables)