Issue #6097: Escape UTF-8 surrogates resulting from mbstowcs conversion
of the command line.
diff --git a/Modules/python.c b/Modules/python.c
index 13c6d5b..edd33f4 100644
--- a/Modules/python.c
+++ b/Modules/python.c
@@ -38,8 +38,16 @@
if (!res)
goto oom;
count = mbstowcs(res, arg, argsize+1);
- if (count != (size_t)-1)
- return res;
+ if (count != (size_t)-1) {
+ wchar_t *tmp;
+ /* Only use the result if it contains no
+ surrogate characters. */
+ for (tmp = res; *tmp != 0 &&
+ (*tmp < 0xd800 || *tmp > 0xdfff); tmp++)
+ ;
+ if (*tmp == 0)
+ return res;
+ }
PyMem_Free(res);
}
/* Conversion failed. Fall back to escaping with surrogateescape. */
@@ -75,6 +83,14 @@
memset(&mbs, 0, sizeof mbs);
continue;
}
+ if (*out >= 0xd800 && *out <= 0xdfff) {
+ /* Surrogate character. Escape the original
+ byte sequence with surrogateescape. */
+ argsize -= converted;
+ while (converted--)
+ *out++ = 0xdc00 + *in++;
+ continue;
+ }
/* successfully converted some bytes */
in += converted;
argsize -= converted;