SF #941229: Decode source code with sys.stdin.encoding in interactive mode, like in non-interactive mode. This allows non-latin-1 users to write unicode strings directly and frees Japanese users from weird manual escaping <wink> in shift_jis environments. (Reviewed by Martin v. Loewis)

commit: 7df44b384a4391cfed0a4d26b7e314a06ae4d595 [log] [tgz]
author: Hye-Shik Chang <hyeshik@gmail.com> Wed Aug 04 17:36:41 2004 +0000
committer: Hye-Shik Chang <hyeshik@gmail.com> Wed Aug 04 17:36:41 2004 +0000
tree: ca296981c3244abf8c42ac8f813e540fe9833e24
parent: 5910d81c979b79a98f3d5ac8dea81e84ab721c37 [diff]
diff --git a/Misc/NEWS b/Misc/NEWS
index 630c85e..aea6867 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS

@@ -70,6 +70,10 @@
 - unicode.iswide() and unicode.width() is dropped and the East Asian
   Width support is moved to unicodedata extension module.
 
+- Patch #941229: The source code encoding in interactive mode now
+  refers to sys.stdin.encoding instead of always being ISO-8859-1.
+  This allows non-latin-1 users to write unicode strings directly.
+
 Extension modules
 -----------------
 

diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 4fdc2e6..8fc2c26 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c

@@ -651,6 +651,63 @@
 	PyMem_DEL(tok);
 }
 
+#if !defined(PGEN) && defined(Py_USING_UNICODE)
+static int	/* 0: *inp was converted or deliberately left as-is; -1: out of memory, tok->done = E_NOMEM */
+tok_stdin_decode(struct tok_state *tok, char **inp)	/* Convert the line *inp from sys.stdin's encoding to UTF-8, replacing the buffer in place. */
+{
+	PyObject *enc, *sysstdin, *decoded, *utf8;
+	const char *encoding;	/* points into enc's string data; valid only while enc is referenced */
+	char *converted;
+
+	if (PySys_GetFile((char *)"stdin", NULL) != stdin)	/* sys.stdin does not map to the C stdin stream */
+		return 0;	/* leave the line untouched */
+	sysstdin = PySys_GetObject("stdin");
+	if (sysstdin == NULL || !PyFile_Check(sysstdin))	/* sys.stdin missing or not a file object */
+		return 0;
+
+	enc = ((PyFileObject *)sysstdin)->f_encoding;
+	if (enc == NULL || !PyString_Check(enc))	/* no string-valued encoding recorded on sys.stdin */
+		return 0;
+	Py_INCREF(enc);	/* hold enc until every use of `encoding` below is done; each later exit DECREFs it */
+
+	encoding = PyString_AsString(enc);
+	decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);	/* decodes the whole NUL-terminated line */
+	if (decoded == NULL)
+		goto error_clear;
+
+	utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
+	Py_DECREF(decoded);	/* no longer needed whether or not the encode succeeded */
+	if (utf8 == NULL)
+		goto error_clear;
+
+	converted = new_string(PyString_AsString(utf8), PyString_Size(utf8));	/* presumably an allocated copy (new_string is defined elsewhere); NULL is treated as out of memory */
+	Py_DECREF(utf8);
+	if (converted == NULL)
+		goto error_nomem;	/* *inp still holds the original buffer here */
+
+	PyMem_FREE(*inp);	/* release the original line ... */
+	*inp = converted;	/* ... and hand the UTF-8 copy back to the caller */
+	if (tok->encoding != NULL)
+		PyMem_DEL(tok->encoding);
+	tok->encoding = new_string(encoding, strlen(encoding));	/* record the name of the encoding the line was decoded from */
+	if (tok->encoding == NULL)
+		goto error_nomem;	/* *inp already holds the converted buffer here */
+
+	Py_DECREF(enc);
+	return 0;
+
+error_nomem:	/* allocation failed: report it through tok->done and a non-zero return */
+	Py_DECREF(enc);
+	tok->done = E_NOMEM;
+	return -1;
+
+error_clear:	/* decode/encode failed: discard the exception and keep *inp unchanged */
+	/* Fallback to iso-8859-1: for backward compatibility */
+	Py_DECREF(enc);
+	PyErr_Clear();
+	return 0;
+}
+#endif
 
 /* Get next char, updating state; error code goes into tok->done */
 
@@ -690,6 +747,10 @@
 				PyMem_FREE(new);
 				tok->done = E_EOF;
 			}
+#if !defined(PGEN) && defined(Py_USING_UNICODE)
+			else if (tok_stdin_decode(tok, &new) != 0)
+				PyMem_FREE(new);
+#endif
 			else if (tok->start != NULL) {
 				size_t start = tok->start - tok->buf;
 				size_t oldlen = tok->cur - tok->buf;
commit	7df44b384a4391cfed0a4d26b7e314a06ae4d595	[log] [tgz]
author	Hye-Shik Chang <hyeshik@gmail.com>	Wed Aug 04 17:36:41 2004 +0000
committer	Hye-Shik Chang <hyeshik@gmail.com>	Wed Aug 04 17:36:41 2004 +0000
tree	ca296981c3244abf8c42ac8f813e540fe9833e24
parent	5910d81c979b79a98f3d5ac8dea81e84ab721c37 [diff]