Issue #25388: Fixed tokenizer hang when processing undecodable source code with a null byte.

commit: 5d7d26c403d86e9525820d872eb3e331dbc31750 [log] [tgz]
author: Serhiy Storchaka <storchaka@gmail.com> Sat Nov 14 15:14:29 2015 +0200
committer: Serhiy Storchaka <storchaka@gmail.com> Sat Nov 14 15:14:29 2015 +0200
tree: 1d300e51de7abe9289fc58059a02f00aa7e42d69
parent: b1c1e673cbbd67f70714d7c65d4a9f0f85c53d0f [diff]
diff --git a/Lib/test/test_compile.py b/Lib/test/test_compile.py
index cfc6389..c166ff1 100644
--- a/Lib/test/test_compile.py
+++ b/Lib/test/test_compile.py

@@ -3,6 +3,9 @@
 import sys
 import _ast
 from test import test_support
+from test import script_helper
+import os
+import tempfile
 import textwrap
 
 class TestSpecifics(unittest.TestCase):
@@ -555,6 +558,19 @@
         ast.body = [_ast.BoolOp()]
         self.assertRaises(TypeError, compile, ast, '<ast>', 'exec')
 
+    def test_yet_more_evil_still_undecodable(self):
+        # Issue #25388
+        src = b"#\x00\n#\xfd\n"
+        tmpd = tempfile.mkdtemp()
+        try:
+            fn = os.path.join(tmpd, "bad.py")
+            with open(fn, "wb") as fp:
+                fp.write(src)
+            rc, out, err = script_helper.assert_python_failure(fn)
+        finally:
+            test_support.rmtree(tmpd)
+        self.assertIn(b"Non-ASCII", err)
+
 
 class TestStackSize(unittest.TestCase):
     # These tests check that the computed stack size for a code object

diff --git a/Misc/NEWS b/Misc/NEWS
index 43e0418..5d30b1a 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS

@@ -10,6 +10,9 @@
 Core and Builtins
 -----------------
 
+- Issue #25388: Fixed tokenizer hang when processing undecodable source code
+  with a null byte.
+
 - Issue #22995: Default implementation of __reduce__ and __reduce_ex__ now
   rejects builtin types with not defined __new__.
 

diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 109c0ee..7e4a300 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c

@@ -169,7 +169,8 @@
     tok->decoding_erred = 1;
     if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
         PyMem_FREE(tok->buf);
-    tok->buf = NULL;
+    tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
+    tok->done = E_DECODE;
     return NULL;                /* as if it were EOF */
 }
 
@@ -921,7 +922,6 @@
                 if (tok->buf != NULL)
                     PyMem_FREE(tok->buf);
                 tok->buf = newtok;
-                tok->line_start = tok->buf;
                 tok->cur = tok->buf;
                 tok->line_start = tok->buf;
                 tok->inp = strchr(tok->buf, '\0');
@@ -944,7 +944,8 @@
                 }
                 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
                           tok) == NULL) {
-                    tok->done = E_EOF;
+                    if (!tok->decoding_erred)
+                        tok->done = E_EOF;
                     done = 1;
                 }
                 else {
@@ -978,6 +979,8 @@
                     return EOF;
                 }
                 tok->buf = newbuf;
+                tok->cur = tok->buf + cur;
+                tok->line_start = tok->cur;
                 tok->inp = tok->buf + curvalid;
                 tok->end = tok->buf + newsize;
                 tok->start = curstart < 0 ? NULL :
commit	5d7d26c403d86e9525820d872eb3e331dbc31750	[log] [tgz]
author	Serhiy Storchaka <storchaka@gmail.com>	Sat Nov 14 15:14:29 2015 +0200
committer	Serhiy Storchaka <storchaka@gmail.com>	Sat Nov 14 15:14:29 2015 +0200
tree	1d300e51de7abe9289fc58059a02f00aa7e42d69
parent	b1c1e673cbbd67f70714d7c65d4a9f0f85c53d0f [diff]