bpo-30455: Generate all token related code and docs from Grammar/Tokens. (GH-10370)

"Include/token.h", "Lib/token.py" (containing now some data moved from
"Lib/tokenize.py") and new files "Parser/token.c" (containing the code
moved from "Parser/tokenizer.c") and "Doc/library/token-list.inc" (included
in "Doc/library/token.rst") are now generated from "Grammar/Tokens" by
"Tools/scripts/generate_token.py". The script overwrites files only if
needed and can be used on the read-only sources tree.
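
The "overwrite only if needed" behaviour amounts to comparing the freshly
generated text with what is already on disk and skipping the write when
they match. A minimal sketch of that idea (a hypothetical helper shown for
illustration, not the actual code in "Tools/scripts/generate_token.py"):

    def update_file(path, new_contents):
        # Rewrite the file only when its content would actually change,
        # so timestamps stay untouched and a read-only tree still works
        # as long as everything is already up to date.
        try:
            with open(path, 'r') as f:
                if f.read() == new_contents:
                    return False        # already up to date
        except OSError:
            pass                        # missing or unreadable: regenerate
        with open(path, 'w') as f:
            f.write(new_contents)
        return True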

"Lib/symbol.py" is now generated by "Tools/scripts/generate_symbol_py.py"
instead of been executable itself.
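
Regenerating "Lib/symbol.py" by hand from a built source tree now looks
like this (the same command is recorded in the header comment of the
generated file, see the diff below):

    python3 Tools/scripts/generate_symbol_py.py Include/graminit.h Lib/symbol.py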

Added new make targets "regen-token" and "regen-symbol", which are now
dependencies of "regen-all".
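
So after editing "Grammar/Tokens" (or "Include/graminit.h") the generated
files can be refreshed either per target or via the umbrella target:

    make regen-token
    make regen-symbol
    make regen-all      # runs both of the above, among the other regen targets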

The documentation now also lists the strings for operator and punctuation
tokens.
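
Each operator or punctuation token in the generated
"Doc/library/token-list.inc" is documented together with its string; the
entries are roughly of the following form (an illustrative sketch, the
exact markup is whatever generate_token.py emits):

    .. data:: LPAR

       Token value for "(".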
diff --git a/Lib/symbol.py b/Lib/symbol.py
old mode 100755
new mode 100644
index dc7dcba..40d0ed1
--- a/Lib/symbol.py
+++ b/Lib/symbol.py
@@ -1,5 +1,3 @@
-#! /usr/bin/env python3
-
 """Non-terminal symbols of Python grammar (from "graminit.h")."""
 
 #  This file is automatically generated; please don't muck it up!
@@ -7,7 +5,11 @@
 #  To update the symbols in this file, 'cd' to the top directory of
 #  the python source tree after building the interpreter and run:
 #
-#    ./python Lib/symbol.py
+#    python3 Tools/scripts/generate_symbol_py.py Include/graminit.h Lib/symbol.py
+#
+# or just
+#
+#    make regen-symbol
 
 #--start constants--
 single_input = 256
@@ -103,14 +105,4 @@
 for _name, _value in list(globals().items()):
     if type(_value) is type(0):
         sym_name[_value] = _name
-
-
-def _main():
-    import sys
-    import token
-    if len(sys.argv) == 1:
-        sys.argv = sys.argv + ["Include/graminit.h", "Lib/symbol.py"]
-    token._main()
-
-if __name__ == "__main__":
-    _main()
+del _name, _value
diff --git a/Lib/test/test_symbol.py b/Lib/test/test_symbol.py
index c1306f5..ed86aec 100644
--- a/Lib/test/test_symbol.py
+++ b/Lib/test/test_symbol.py
@@ -6,6 +6,9 @@
 
 
 SYMBOL_FILE              = support.findfile('symbol.py')
+GEN_SYMBOL_FILE          = os.path.join(os.path.dirname(__file__),
+                                        '..', '..', 'Tools', 'scripts',
+                                        'generate_symbol_py.py')
 GRAMMAR_FILE             = os.path.join(os.path.dirname(__file__),
                                         '..', '..', 'Include', 'graminit.h')
 TEST_PY_FILE             = 'symbol_test.py'
@@ -22,7 +25,7 @@
 
     def _generate_symbols(self, grammar_file, target_symbol_py_file):
         proc = subprocess.Popen([sys.executable,
-                                 SYMBOL_FILE,
+                                 GEN_SYMBOL_FILE,
                                  grammar_file,
                                  target_symbol_py_file], stderr=subprocess.PIPE)
         stderr = proc.communicate()[1]
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index ff14479..04a1254 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1619,6 +1619,8 @@
             testfiles = random.sample(testfiles, 10)
 
         for testfile in testfiles:
+            if support.verbose >= 2:
+                print('tokenize', testfile)
             with open(testfile, 'rb') as f:
                 with self.subTest(file=testfile):
                     self.check_roundtrip(f)
diff --git a/Lib/token.py b/Lib/token.py
index ba13205..5af7e6b 100644
--- a/Lib/token.py
+++ b/Lib/token.py
@@ -1,15 +1,8 @@
-"""Token constants (from "token.h")."""
+"""Token constants."""
+# Auto-generated by Tools/scripts/generate_token.py
 
 __all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF']
 
-#  This file is automatically generated; please don't muck it up!
-#
-#  To update the symbols in this file, 'cd' to the top directory of
-#  the python source tree after building the interpreter and run:
-#
-#    ./python Lib/token.py
-
-#--start constants--
 ENDMARKER = 0
 NAME = 1
 NUMBER = 2
@@ -63,23 +56,70 @@
 ATEQUAL = 50
 RARROW = 51
 ELLIPSIS = 52
-# Don't forget to update the table _PyParser_TokenNames in tokenizer.c!
 OP = 53
-ERRORTOKEN = 54
 # These aren't used by the C tokenizer but are needed for tokenize.py
+ERRORTOKEN = 54
 COMMENT = 55
 NL = 56
 ENCODING = 57
 N_TOKENS = 58
 # Special definitions for cooperation with parser
 NT_OFFSET = 256
-#--end constants--
 
 tok_name = {value: name
             for name, value in globals().items()
             if isinstance(value, int) and not name.startswith('_')}
 __all__.extend(tok_name.values())
 
+EXACT_TOKEN_TYPES = {
+    '!=': NOTEQUAL,
+    '%': PERCENT,
+    '%=': PERCENTEQUAL,
+    '&': AMPER,
+    '&=': AMPEREQUAL,
+    '(': LPAR,
+    ')': RPAR,
+    '*': STAR,
+    '**': DOUBLESTAR,
+    '**=': DOUBLESTAREQUAL,
+    '*=': STAREQUAL,
+    '+': PLUS,
+    '+=': PLUSEQUAL,
+    ',': COMMA,
+    '-': MINUS,
+    '-=': MINEQUAL,
+    '->': RARROW,
+    '.': DOT,
+    '...': ELLIPSIS,
+    '/': SLASH,
+    '//': DOUBLESLASH,
+    '//=': DOUBLESLASHEQUAL,
+    '/=': SLASHEQUAL,
+    ':': COLON,
+    ';': SEMI,
+    '<': LESS,
+    '<<': LEFTSHIFT,
+    '<<=': LEFTSHIFTEQUAL,
+    '<=': LESSEQUAL,
+    '=': EQUAL,
+    '==': EQEQUAL,
+    '>': GREATER,
+    '>=': GREATEREQUAL,
+    '>>': RIGHTSHIFT,
+    '>>=': RIGHTSHIFTEQUAL,
+    '@': AT,
+    '@=': ATEQUAL,
+    '[': LSQB,
+    ']': RSQB,
+    '^': CIRCUMFLEX,
+    '^=': CIRCUMFLEXEQUAL,
+    '{': LBRACE,
+    '|': VBAR,
+    '|=': VBAREQUAL,
+    '}': RBRACE,
+    '~': TILDE,
+}
+
 def ISTERMINAL(x):
     return x < NT_OFFSET
 
@@ -88,73 +128,3 @@
 
 def ISEOF(x):
     return x == ENDMARKER
-
-
-def _main():
-    import re
-    import sys
-    args = sys.argv[1:]
-    inFileName = args and args[0] or "Include/token.h"
-    outFileName = "Lib/token.py"
-    if len(args) > 1:
-        outFileName = args[1]
-    try:
-        fp = open(inFileName)
-    except OSError as err:
-        sys.stdout.write("I/O error: %s\n" % str(err))
-        sys.exit(1)
-    with fp:
-        lines = fp.read().split("\n")
-    prog = re.compile(
-        r"#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
-        re.IGNORECASE)
-    comment_regex = re.compile(
-        r"^\s*/\*\s*(.+?)\s*\*/\s*$",
-        re.IGNORECASE)
-
-    tokens = {}
-    prev_val = None
-    for line in lines:
-        match = prog.match(line)
-        if match:
-            name, val = match.group(1, 2)
-            val = int(val)
-            tokens[val] = {'token': name}          # reverse so we can sort them...
-            prev_val = val
-        else:
-            comment_match = comment_regex.match(line)
-            if comment_match and prev_val is not None:
-                comment = comment_match.group(1)
-                tokens[prev_val]['comment'] = comment
-    keys = sorted(tokens.keys())
-    # load the output skeleton from the target:
-    try:
-        fp = open(outFileName)
-    except OSError as err:
-        sys.stderr.write("I/O error: %s\n" % str(err))
-        sys.exit(2)
-    with fp:
-        format = fp.read().split("\n")
-    try:
-        start = format.index("#--start constants--") + 1
-        end = format.index("#--end constants--")
-    except ValueError:
-        sys.stderr.write("target does not contain format markers")
-        sys.exit(3)
-    lines = []
-    for key in keys:
-        lines.append("%s = %d" % (tokens[key]["token"], key))
-        if "comment" in tokens[key]:
-            lines.append("# %s" % tokens[key]["comment"])
-    format[start:end] = lines
-    try:
-        fp = open(outFileName, 'w')
-    except OSError as err:
-        sys.stderr.write("I/O error: %s\n" % str(err))
-        sys.exit(4)
-    with fp:
-        fp.write("\n".join(format))
-
-
-if __name__ == "__main__":
-    _main()
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index fce010b..cf1ecc9 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -32,6 +32,7 @@
 import re
 import sys
 from token import *
+from token import EXACT_TOKEN_TYPES
 
 cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
@@ -41,55 +42,6 @@
                            "untokenize", "TokenInfo"]
 del token
 
-EXACT_TOKEN_TYPES = {
-    '(':   LPAR,
-    ')':   RPAR,
-    '[':   LSQB,
-    ']':   RSQB,
-    ':':   COLON,
-    ',':   COMMA,
-    ';':   SEMI,
-    '+':   PLUS,
-    '-':   MINUS,
-    '*':   STAR,
-    '/':   SLASH,
-    '|':   VBAR,
-    '&':   AMPER,
-    '<':   LESS,
-    '>':   GREATER,
-    '=':   EQUAL,
-    '.':   DOT,
-    '%':   PERCENT,
-    '{':   LBRACE,
-    '}':   RBRACE,
-    '==':  EQEQUAL,
-    '!=':  NOTEQUAL,
-    '<=':  LESSEQUAL,
-    '>=':  GREATEREQUAL,
-    '~':   TILDE,
-    '^':   CIRCUMFLEX,
-    '<<':  LEFTSHIFT,
-    '>>':  RIGHTSHIFT,
-    '**':  DOUBLESTAR,
-    '+=':  PLUSEQUAL,
-    '-=':  MINEQUAL,
-    '*=':  STAREQUAL,
-    '/=':  SLASHEQUAL,
-    '%=':  PERCENTEQUAL,
-    '&=':  AMPEREQUAL,
-    '|=':  VBAREQUAL,
-    '^=':  CIRCUMFLEXEQUAL,
-    '<<=': LEFTSHIFTEQUAL,
-    '>>=': RIGHTSHIFTEQUAL,
-    '**=': DOUBLESTAREQUAL,
-    '//':  DOUBLESLASH,
-    '//=': DOUBLESLASHEQUAL,
-    '...': ELLIPSIS,
-    '->':  RARROW,
-    '@':   AT,
-    '@=':  ATEQUAL,
-}
-
 class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
     def __repr__(self):
         annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
@@ -163,17 +115,11 @@
 String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
 
-# Because of leftmost-then-longest match semantics, be sure to put the
-# longest operators first (e.g., if = came before ==, == would get
-# recognized as two instances of =).
-Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
-                 r"//=?", r"->",
-                 r"[+\-*/%&@|^=<>]=?",
-                 r"~")
-
-Bracket = '[][(){}]'
-Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
-Funny = group(Operator, Bracket, Special)
+# Sorting in reverse order puts the long operators before their prefixes.
+# Otherwise if = came before ==, == would get recognized as two instances
+# of =.
+Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
+Funny = group(r'\r?\n', Special)
 
 PlainToken = group(Number, Funny, String, Name)
 Token = Ignore + PlainToken