Bug #2301: Don't try decoding the source code into the original
encoding for syntax errors.
diff --git a/Lib/test/test_pep263.py b/Lib/test/test_pep263.py
index cc126ba..92065c9 100644
--- a/Lib/test/test_pep263.py
+++ b/Lib/test/test_pep263.py
@@ -23,6 +23,13 @@
         exec(c, d)
         self.assertEqual(d['u'], '\xf3')
 
+    def test_issue2301(self):
+        try:
+            compile(b"# coding: cp932\nprint '\x94\x4e'", "dummy", "exec")
+        except SyntaxError as v:
+            self.assertEquals(v.text, "print '\u5e74'")
+        else:
+            self.fail()
 
 def test_main():
     test_support.run_unittest(PEP263Test)
diff --git a/Misc/NEWS b/Misc/NEWS
index 1665256..6c38150 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -9,6 +9,12 @@
 
 *Release date: XX-XXX-2008*
 
+Core and Builtins
+-----------------
+
+- Bug #2301: Don't try decoding the source code into the original
+  encoding for syntax errors.
+
 Extension Modules
 -----------------
 
diff --git a/Parser/parsetok.c b/Parser/parsetok.c
index 0b3314e..708c26d 100644
--- a/Parser/parsetok.c
+++ b/Parser/parsetok.c
@@ -213,21 +213,16 @@
 			err_ret->error = E_EOF;
 		err_ret->lineno = tok->lineno;
 		if (tok->buf != NULL) {
-			char *text = NULL;
 			size_t len;
 			assert(tok->cur - tok->buf < INT_MAX);
 			err_ret->offset = (int)(tok->cur - tok->buf);
 			len = tok->inp - tok->buf;
-			text = PyTokenizer_RestoreEncoding(tok, len, &err_ret->offset);
-			if (text == NULL) {
-				text = (char *) PyObject_MALLOC(len + 1);
-				if (text != NULL) {
-					if (len > 0)
-						strncpy(text, tok->buf, len);
-					text[len] = '\0';
-				}
+			err_ret->text = (char *) PyObject_MALLOC(len + 1);
+			if (err_ret->text != NULL) {
+				if (len > 0)
+					strncpy(err_ret->text, tok->buf, len);
+				err_ret->text[len] = '\0';
 			}
-			err_ret->text = text;
 		}
 	} else if (tok->encoding != NULL) {
 		node* r = PyNode_New(encoding_decl);
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 2833e53..0b8341a 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1579,70 +1579,6 @@
 	return result;
 }
 
-/* This function is only called from parsetok. However, it cannot live
-   there, as it must be empty for PGEN, and we can check for PGEN only
-   in this file. */
-
-#ifdef PGEN
-char*
-PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
-{
-	return NULL;
-}
-#else
-static PyObject *
-dec_utf8(const char *enc, const char *text, size_t len) {
-	PyObject *ret = NULL;
-	PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
-	if (unicode_text) {
-		ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
-		Py_DECREF(unicode_text);
-	}
-	if (!ret) {
-		PyErr_Clear();
-	}
-        else {
-		assert(PyString_Check(ret));
-	}
-	return ret;
-}
-
-char *
-PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
-{
-	char *text = NULL;
-	if (tok->encoding) {
-		/* convert source to original encondig */
-		PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
-		if (lineobj != NULL) {
-			int linelen = PyString_GET_SIZE(lineobj);
-			const char *line = PyString_AS_STRING(lineobj);
-			text = PyObject_MALLOC(linelen + 1);
-			if (text != NULL && line != NULL) {
-				if (linelen)
-					strncpy(text, line, linelen);
-				text[linelen] = '\0';
-			}
-			Py_DECREF(lineobj);
-
-			/* adjust error offset */
-			if (*offset > 1) {
-				PyObject *offsetobj = dec_utf8(tok->encoding,
-							       tok->buf,
-							       *offset-1);
-				if (offsetobj) {
-					*offset = 1 + Py_SIZE(offsetobj);
-					Py_DECREF(offsetobj);
-				}
-			}
-
-		}
-	}
-	return text;
-
-}
-#endif
-
 /* Get -*- encoding -*- from a Python file.
 
    PyTokenizer_FindEncoding returns NULL when it can't find the encoding in