bpo-35975: Support parsing earlier minor versions of Python 3 (GH-12086)



This adds a `feature_version` flag to `ast.parse()` (documented) and `compile()` (hidden) that allows tweaking the parser to support older versions of the grammar. In particular, if `feature_version` is 5 or 6, the hacks for the `async` and `await` keywords from PEP 492 are reinstated. (For 7 or higher, these are unconditionally treated as keywords, but they are still special tokens rather than `NAME` tokens that the parser driver recognizes as keywords.)



https://bugs.python.org/issue35975
diff --git a/Parser/asdl_c.py b/Parser/asdl_c.py
index 1526995..5224755 100644
--- a/Parser/asdl_c.py
+++ b/Parser/asdl_c.py
@@ -1189,6 +1189,11 @@
 /* mode is 0 for "exec", 1 for "eval" and 2 for "single" input */
 mod_ty PyAST_obj2mod(PyObject* ast, PyArena* arena, int mode)
 {
+    return PyAST_obj2mod_ex(ast, arena, mode, PY_MINOR_VERSION);
+}
+
+mod_ty PyAST_obj2mod_ex(PyObject* ast, PyArena* arena, int mode, int feature_version)
+{
     mod_ty res;
     PyObject *req_type[3];
     char *req_name[] = {"Module", "Expression", "Interactive"};
@@ -1269,6 +1274,7 @@
             f.write("\n")
             f.write("PyObject* PyAST_mod2obj(mod_ty t);\n")
             f.write("mod_ty PyAST_obj2mod(PyObject* ast, PyArena* arena, int mode);\n")
+            f.write("mod_ty PyAST_obj2mod_ex(PyObject* ast, PyArena* arena, int mode, int feature_version);\n")
             f.write("int PyAST_Check(PyObject* obj);\n")
             f.write('\n')
             f.write('#ifdef __cplusplus\n')
diff --git a/Parser/parsetok.c b/Parser/parsetok.c
index 7a6c886..ba33a9a 100644
--- a/Parser/parsetok.c
+++ b/Parser/parsetok.c
@@ -101,6 +101,8 @@
 
     Py_INCREF(err_ret->filename);
     tok->filename = err_ret->filename;
+    if (*flags & PyPARSE_ASYNC_HACKS)
+        tok->async_hacks = 1;
     return parsetok(tok, g, start, err_ret, flags);
 }
 
diff --git a/Parser/token.c b/Parser/token.c
index 228ecff..a489668 100644
--- a/Parser/token.c
+++ b/Parser/token.c
@@ -61,6 +61,8 @@
     "ELLIPSIS",
     "COLONEQUAL",
     "OP",
+    "AWAIT",
+    "ASYNC",
     "TYPE_IGNORE",
     "TYPE_COMMENT",
     "<ERRORTOKEN>",
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 44ec415..8f0a9c8 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -84,6 +84,11 @@
     tok->decoding_buffer = NULL;
     tok->type_comments = 0;
 
+    tok->async_hacks = 0;
+    tok->async_def = 0;
+    tok->async_def_indent = 0;
+    tok->async_def_nl = 0;
+
     return tok;
 }
 
@@ -1196,6 +1201,31 @@
         }
     }
 
+    /* Peek ahead at the next character */
+    c = tok_nextc(tok);
+    tok_backup(tok, c);
+    /* Check if we are closing an async function */
+    if (tok->async_def
+        && !blankline
+        /* Due to some implementation artifacts of type comments,
+         * a TYPE_COMMENT at the start of a function won't set an
+         * indentation level and it will produce a NEWLINE after it.
+         * To avoid spuriously ending an async function due to this,
+         * wait until we have some non-newline char in front of us. */
+        && c != '\n'
+        && tok->level == 0
+        /* There was a NEWLINE after ASYNC DEF,
+           so we're past the signature. */
+        && tok->async_def_nl
+        /* Current indentation level is no deeper than where
+           the async function was defined */
+        && tok->async_def_indent >= tok->indent)
+    {
+        tok->async_def = 0;
+        tok->async_def_indent = 0;
+        tok->async_def_nl = 0;
+    }
+
  again:
     tok->start = NULL;
     /* Skip spaces */
@@ -1310,6 +1340,50 @@
         *p_start = tok->start;
         *p_end = tok->cur;
 
+        /* async/await parsing block. */
+        if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
+            /* May be an 'async' or 'await' token.  For Python 3.7 or
+               later we recognize them unconditionally.  For Python
+               3.5 or 3.6 we recognize 'async' in front of 'def', and
+               either one inside of 'async def'.  (Technically we
+               shouldn't recognize these at all for 3.4 or earlier,
+               but there's no *valid* Python 3.4 code that would be
+               rejected, and async functions will be rejected in a
+               later phase.) */
+            if (!tok->async_hacks || tok->async_def) {
+                /* Always recognize the keywords. */
+                if (memcmp(tok->start, "async", 5) == 0) {
+                    return ASYNC;
+                }
+                if (memcmp(tok->start, "await", 5) == 0) {
+                    return AWAIT;
+                }
+            }
+            else if (memcmp(tok->start, "async", 5) == 0) {
+                /* The current token is 'async'.
+                   Look ahead one token to see if that is 'def'. */
+
+                struct tok_state ahead_tok;
+                char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
+                int ahead_tok_kind;
+
+                memcpy(&ahead_tok, tok, sizeof(ahead_tok));
+                ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
+                                         &ahead_tok_end);
+
+                if (ahead_tok_kind == NAME
+                    && ahead_tok.cur - ahead_tok.start == 3
+                    && memcmp(ahead_tok.start, "def", 3) == 0)
+                {
+                    /* The next token is going to be 'def', so instead of
+                       returning a plain NAME token, return ASYNC. */
+                    tok->async_def_indent = tok->indent;
+                    tok->async_def = 1;
+                    return ASYNC;
+                }
+            }
+        }
+
         return NAME;
     }
 
@@ -1322,6 +1396,11 @@
         *p_start = tok->start;
         *p_end = tok->cur - 1; /* Leave '\n' out of the string */
         tok->cont_line = 0;
+        if (tok->async_def) {
+            /* We're somewhere inside an 'async def' function, and
+               we've encountered a NEWLINE after its signature. */
+            tok->async_def_nl = 1;
+        }
         return NEWLINE;
     }
 
diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h
index 22e91f7..06c7a14 100644
--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -64,6 +64,13 @@
     const char* input; /* Tokenizer's newline translated copy of the string. */
 
     int type_comments;      /* Whether to look for type comments */
+
+    /* async/await related fields (still needed depending on feature_version) */
+    int async_hacks;     /* =1 if async/await aren't always keywords */
+    int async_def;        /* =1 if tokens are inside an 'async def' body. */
+    int async_def_indent; /* Indentation level of the outermost 'async def'. */
+    int async_def_nl;     /* =1 if the outermost 'async def' had at least one
+                             NEWLINE token after it. */
 };
 
 extern struct tok_state *PyTokenizer_FromString(const char *, int);