closes bpo-31650: PEP 552 (Deterministic pycs) implementation (#4575)

Python now supports checking bytecode cache up-to-dateness with a hash of the
source contents rather than volatile source metadata. See the PEP for details.

While a fairly straightforward idea, quite a lot of code had to be modified due
to the pervasiveness of pyc implementation details in the codebase. Changes in
this commit include:

- The core changes to importlib to understand how to read, validate, and
  regenerate hash-based pycs.

- Support for generating hash-based pycs in py_compile and compileall.

- Modifications to our siphash implementation to support passing a custom
  key. We then expose it to importlib through _imp.

- Updates to all places in the interpreter, standard library, and tests that
  manually generate or parse pyc files to grok the new format.

- Support in the interpreter command line code for long options like
  --check-hash-based-pycs.

- Tests and documentation for all of the above.
diff --git a/Lib/importlib/_bootstrap_external.py b/Lib/importlib/_bootstrap_external.py
index 41de8a7..e808507 100644
--- a/Lib/importlib/_bootstrap_external.py
+++ b/Lib/importlib/_bootstrap_external.py
@@ -242,6 +242,7 @@
 #     Python 3.6rc1 3379 (more thorough __class__ validation #23722)
 #     Python 3.7a0  3390 (add LOAD_METHOD and CALL_METHOD opcodes)
 #     Python 3.7a0  3391 (update GET_AITER #31709)
+#     Python 3.7a0  3392 (PEP 552: Deterministic pycs)
 #
 # MAGIC must change whenever the bytecode emitted by the compiler may no
 # longer be understood by older implementations of the eval loop (usually
@@ -250,7 +251,7 @@
 # Whenever MAGIC_NUMBER is changed, the ranges in the magic_values array
 # in PC/launcher.c must also be updated.
 
-MAGIC_NUMBER = (3391).to_bytes(2, 'little') + b'\r\n'
+MAGIC_NUMBER = (3392).to_bytes(2, 'little') + b'\r\n'
 _RAW_MAGIC_NUMBER = int.from_bytes(MAGIC_NUMBER, 'little')  # For import.c
 
 _PYCACHE = '__pycache__'
@@ -429,63 +430,93 @@
     return loader
 
 
-def _validate_bytecode_header(data, source_stats=None, name=None, path=None):
-    """Validate the header of the passed-in bytecode against source_stats (if
-    given) and returning the bytecode that can be compiled by compile().
+def _classify_pyc(data, name, exc_details):
+    """Perform basic validity checking of a pyc header and return the flags field,
+    which determines how the pyc should be further validated against the source.
 
-    All other arguments are used to enhance error reporting.
+    *data* is the contents of the pyc file. (Only the first 16 bytes are
+    required, though.)
 
-    ImportError is raised when the magic number is incorrect or the bytecode is
-    found to be stale. EOFError is raised when the data is found to be
-    truncated.
+    *name* is the name of the module being imported. It is used for logging.
+
+    *exc_details* is a dictionary passed to ImportError if it is raised, for
+    improved debugging.
+
+    ImportError is raised when the magic number is incorrect or when the flags
+    field is invalid. EOFError is raised when the data is found to be truncated.
 
     """
-    exc_details = {}
-    if name is not None:
-        exc_details['name'] = name
-    else:
-        # To prevent having to make all messages have a conditional name.
-        name = '<bytecode>'
-    if path is not None:
-        exc_details['path'] = path
     magic = data[:4]
-    raw_timestamp = data[4:8]
-    raw_size = data[8:12]
     if magic != MAGIC_NUMBER:
-        message = 'bad magic number in {!r}: {!r}'.format(name, magic)
+        message = f'bad magic number in {name!r}: {magic!r}'
         _bootstrap._verbose_message('{}', message)
         raise ImportError(message, **exc_details)
-    elif len(raw_timestamp) != 4:
-        message = 'reached EOF while reading timestamp in {!r}'.format(name)
+    if len(data) < 16:
+        message = f'reached EOF while reading pyc header of {name!r}'
         _bootstrap._verbose_message('{}', message)
         raise EOFError(message)
-    elif len(raw_size) != 4:
-        message = 'reached EOF while reading size of source in {!r}'.format(name)
+    flags = _r_long(data[4:8])
+    # Only the first two flags are defined.
+    if flags & ~0b11:
+        message = f'invalid flags {flags!r} in {name!r}'
+        raise ImportError(message, **exc_details)
+    return flags
+
+
+def _validate_timestamp_pyc(data, source_mtime, source_size, name,
+                            exc_details):
+    """Validate a pyc against the source last-modified time.
+
+    *data* is the contents of the pyc file. (Only the first 16 bytes are
+    required.)
+
+    *source_mtime* is the last modified timestamp of the source file.
+
+    *source_size* is None or the size of the source file in bytes.
+
+    *name* is the name of the module being imported. It is used for logging.
+
+    *exc_details* is a dictionary passed to ImportError if it is raised, for
+    improved debugging.
+
+    An ImportError is raised if the bytecode is stale.
+
+    """
+    if _r_long(data[8:12]) != (source_mtime & 0xFFFFFFFF):
+        message = f'bytecode is stale for {name!r}'
         _bootstrap._verbose_message('{}', message)
-        raise EOFError(message)
-    if source_stats is not None:
-        try:
-            source_mtime = int(source_stats['mtime'])
-        except KeyError:
-            pass
-        else:
-            if _r_long(raw_timestamp) != source_mtime:
-                message = 'bytecode is stale for {!r}'.format(name)
-                _bootstrap._verbose_message('{}', message)
-                raise ImportError(message, **exc_details)
-        try:
-            source_size = source_stats['size'] & 0xFFFFFFFF
-        except KeyError:
-            pass
-        else:
-            if _r_long(raw_size) != source_size:
-                raise ImportError('bytecode is stale for {!r}'.format(name),
-                                  **exc_details)
-    return data[12:]
+        raise ImportError(message, **exc_details)
+    if (source_size is not None and
+        _r_long(data[12:16]) != (source_size & 0xFFFFFFFF)):
+        raise ImportError(f'bytecode is stale for {name!r}', **exc_details)
+
+
+def _validate_hash_pyc(data, source_hash, name, exc_details):
+    """Validate a hash-based pyc by checking the real source hash against the one in
+    the pyc header.
+
+    *data* is the contents of the pyc file. (Only the first 16 bytes are
+    required.)
+
+    *source_hash* is the importlib.util.source_hash() of the source file.
+
+    *name* is the name of the module being imported. It is used for logging.
+
+    *exc_details* is a dictionary passed to ImportError if it is raised, for
+    improved debugging.
+
+    An ImportError is raised if the bytecode is stale.
+
+    """
+    if data[8:16] != source_hash:
+        raise ImportError(
+            f'hash in bytecode doesn\'t match hash of source {name!r}',
+            **exc_details,
+        )
 
 
 def _compile_bytecode(data, name=None, bytecode_path=None, source_path=None):
-    """Compile bytecode as returned by _validate_bytecode_header()."""
+    """Compile bytecode as found in a pyc."""
     code = marshal.loads(data)
     if isinstance(code, _code_type):
         _bootstrap._verbose_message('code object from {!r}', bytecode_path)
@@ -496,16 +527,28 @@
         raise ImportError('Non-code object in {!r}'.format(bytecode_path),
                           name=name, path=bytecode_path)
 
-def _code_to_bytecode(code, mtime=0, source_size=0):
-    """Compile a code object into bytecode for writing out to a byte-compiled
-    file."""
+
+def _code_to_timestamp_pyc(code, mtime=0, source_size=0):
+    "Produce the data for a timestamp-based pyc."
     data = bytearray(MAGIC_NUMBER)
+    data.extend(_w_long(0))
     data.extend(_w_long(mtime))
     data.extend(_w_long(source_size))
     data.extend(marshal.dumps(code))
     return data
 
 
+def _code_to_hash_pyc(code, source_hash, checked=True):
+    "Produce the data for a hash-based pyc."
+    data = bytearray(MAGIC_NUMBER)
+    flags = 0b1 | checked << 1
+    data.extend(_w_long(flags))
+    assert len(source_hash) == 8
+    data.extend(source_hash)
+    data.extend(marshal.dumps(code))
+    return data
+
+
 def decode_source(source_bytes):
     """Decode bytes representing source code and return the string.
 
@@ -751,6 +794,10 @@
         """
         source_path = self.get_filename(fullname)
         source_mtime = None
+        source_bytes = None
+        source_hash = None
+        hash_based = False
+        check_source = True
         try:
             bytecode_path = cache_from_source(source_path)
         except NotImplementedError:
@@ -767,10 +814,34 @@
                 except OSError:
                     pass
                 else:
+                    exc_details = {
+                        'name': fullname,
+                        'path': bytecode_path,
+                    }
                     try:
-                        bytes_data = _validate_bytecode_header(data,
-                                source_stats=st, name=fullname,
-                                path=bytecode_path)
+                        flags = _classify_pyc(data, fullname, exc_details)
+                        bytes_data = memoryview(data)[16:]
+                        hash_based = flags & 0b1 != 0
+                        if hash_based:
+                            check_source = flags & 0b10 != 0
+                            if (_imp.check_hash_based_pycs != 'never' and
+                                (check_source or
+                                 _imp.check_hash_based_pycs == 'always')):
+                                source_bytes = self.get_data(source_path)
+                                source_hash = _imp.source_hash(
+                                    _RAW_MAGIC_NUMBER,
+                                    source_bytes,
+                                )
+                                _validate_hash_pyc(data, source_hash, fullname,
+                                                   exc_details)
+                        else:
+                            _validate_timestamp_pyc(
+                                data,
+                                source_mtime,
+                                st['size'],
+                                fullname,
+                                exc_details,
+                            )
                     except (ImportError, EOFError):
                         pass
                     else:
@@ -779,13 +850,19 @@
                         return _compile_bytecode(bytes_data, name=fullname,
                                                  bytecode_path=bytecode_path,
                                                  source_path=source_path)
-        source_bytes = self.get_data(source_path)
+        if source_bytes is None:
+            source_bytes = self.get_data(source_path)
         code_object = self.source_to_code(source_bytes, source_path)
         _bootstrap._verbose_message('code object from {}', source_path)
         if (not sys.dont_write_bytecode and bytecode_path is not None and
                 source_mtime is not None):
-            data = _code_to_bytecode(code_object, source_mtime,
-                    len(source_bytes))
+            if hash_based:
+                if source_hash is None:
+                    source_hash = _imp.source_hash(_RAW_MAGIC_NUMBER, source_bytes)
+                data = _code_to_hash_pyc(code_object, source_hash, check_source)
+            else:
+                data = _code_to_timestamp_pyc(code_object, source_mtime,
+                                              len(source_bytes))
             try:
                 self._cache_bytecode(source_path, bytecode_path, data)
                 _bootstrap._verbose_message('wrote {!r}', bytecode_path)
@@ -887,8 +964,18 @@
     def get_code(self, fullname):
         path = self.get_filename(fullname)
         data = self.get_data(path)
-        bytes_data = _validate_bytecode_header(data, name=fullname, path=path)
-        return _compile_bytecode(bytes_data, name=fullname, bytecode_path=path)
+        # Call _classify_pyc to do basic validation of the pyc but ignore the
+        # result. There's no source to check against.
+        exc_details = {
+            'name': fullname,
+            'path': path,
+        }
+        _classify_pyc(data, fullname, exc_details)
+        return _compile_bytecode(
+            memoryview(data)[16:],
+            name=fullname,
+            bytecode_path=path,
+        )
 
     def get_source(self, fullname):
         """Return None as there is no source code."""
diff --git a/Lib/importlib/util.py b/Lib/importlib/util.py
index 41c74d4..9d0a90d 100644
--- a/Lib/importlib/util.py
+++ b/Lib/importlib/util.py
@@ -5,18 +5,25 @@
 from ._bootstrap import spec_from_loader
 from ._bootstrap import _find_spec
 from ._bootstrap_external import MAGIC_NUMBER
+from ._bootstrap_external import _RAW_MAGIC_NUMBER
 from ._bootstrap_external import cache_from_source
 from ._bootstrap_external import decode_source
 from ._bootstrap_external import source_from_cache
 from ._bootstrap_external import spec_from_file_location
 
 from contextlib import contextmanager
+import _imp
 import functools
 import sys
 import types
 import warnings
 
 
+def source_hash(source_bytes):
+    "Return the hash of *source_bytes* as used in hash-based pyc files."
+    return _imp.source_hash(_RAW_MAGIC_NUMBER, source_bytes)
+
+
 def resolve_name(name, package):
     """Resolve a relative module name to an absolute one."""
     if not name.startswith('.'):