[2.7] bpo-30363: Backport warnings in the re module. (#1577)
Running Python with the -3 option now warns about regular expression
syntax that is invalid or has different semantics in Python 3,
or whose behavior will change in future Python versions.
diff --git a/Lib/_strptime.py b/Lib/_strptime.py
index feac05a..8eb2718 100644
--- a/Lib/_strptime.py
+++ b/Lib/_strptime.py
@@ -254,8 +254,8 @@
# format directives (%m, etc.).
regex_chars = re_compile(r"([\\.^$*+?\(\){}\[\]|])")
format = regex_chars.sub(r"\\\1", format)
- whitespace_replacement = re_compile('\s+')
- format = whitespace_replacement.sub('\s+', format)
+ whitespace_replacement = re_compile(r'\s+')
+ format = whitespace_replacement.sub(r'\\s+', format)
while '%' in format:
directive_index = format.index('%')+1
processed_format = "%s%s%s" % (processed_format,
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index c5a7e89..b6689fa 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -435,7 +435,7 @@
# this contains min/max pattern width, and an optional literal
# prefix or a character map
lo, hi = pattern.getwidth()
- if lo == 0:
+ if not lo and hi:
return # not worth it
# look for a literal prefix
prefix = []
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index 75f488b..e0d003e 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -23,6 +23,7 @@
OCTDIGITS = set("01234567")
HEXDIGITS = set("0123456789abcdefABCDEF")
+ASCIILETTERS = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
WHITESPACE = set(" \t\n\r\v\f")
@@ -260,6 +261,15 @@
elif c in DIGITS:
raise error, "bogus escape: %s" % repr(escape)
if len(escape) == 2:
+ if sys.py3kwarning and c in ASCIILETTERS:
+ import warnings
+ if c in 'Uu':
+ warnings.warn('bad escape %s; Unicode escapes are '
+ 'supported only since Python 3.3' % escape,
+ FutureWarning, stacklevel=8)
+ else:
+ warnings.warnpy3k('bad escape %s' % escape,
+ DeprecationWarning, stacklevel=8)
return LITERAL, ord(escape[1])
except ValueError:
pass
@@ -309,6 +319,15 @@
return GROUPREF, group
raise ValueError
if len(escape) == 2:
+ if sys.py3kwarning and c in ASCIILETTERS:
+ import warnings
+ if c in 'Uu':
+ warnings.warn('bad escape %s; Unicode escapes are '
+ 'supported only since Python 3.3' % escape,
+ FutureWarning, stacklevel=8)
+ else:
+ warnings.warnpy3k('bad escape %s' % escape,
+ DeprecationWarning, stacklevel=8)
return LITERAL, ord(escape[1])
except ValueError:
pass
@@ -714,6 +733,12 @@
pattern.str = str
p = _parse_sub(source, pattern, 0)
+ if (sys.py3kwarning and
+ (p.pattern.flags & SRE_FLAG_LOCALE) and
+ (p.pattern.flags & SRE_FLAG_UNICODE)):
+ import warnings
+ warnings.warnpy3k("LOCALE and UNICODE flags are incompatible",
+ DeprecationWarning, stacklevel=5)
tail = source.get()
if tail == ")":
@@ -801,7 +826,10 @@
try:
this = makechar(ESCAPES[this][1])
except KeyError:
- pass
+ if sys.py3kwarning and c in ASCIILETTERS:
+ import warnings
+ warnings.warnpy3k('bad escape %s' % this,
+ DeprecationWarning, stacklevel=4)
literal(this)
else:
literal(this)
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 5725a99..174c5ca 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -3,7 +3,7 @@
verbose, run_unittest, import_module,
precisionbigmemtest, _2G, cpython_only,
captured_stdout, have_unicode, requires_unicode, u,
- check_warnings)
+ check_warnings, check_py3k_warnings)
import locale
import re
from re import Scanner
@@ -66,11 +66,13 @@
self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx')
self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx')
- self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'),
- '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D')
- self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a')
- self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'),
- (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)))
+ self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
+ self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
+ self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'),
+ (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)+chr(8)))
+ for c in 'cdehijklmopqsuwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
+ with check_py3k_warnings():
+ self.assertEqual(re.sub('a', '\\' + c, 'a'), '\\' + c)
self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest')
@@ -223,11 +225,11 @@
def test_re_split(self):
self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c'])
- self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c'])
- self.assertEqual(re.split("(:*)", ":a:b::c"),
+ self.assertEqual(re.split(":+", ":a:b::c"), ['', 'a', 'b', 'c'])
+ self.assertEqual(re.split("(:+)", ":a:b::c"),
['', ':', 'a', ':', 'b', '::', 'c'])
- self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
- self.assertEqual(re.split("(:)*", ":a:b::c"),
+ self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c'])
+ self.assertEqual(re.split("(:)+", ":a:b::c"),
['', ':', 'a', ':', 'b', ':', 'c'])
self.assertEqual(re.split("([b:]+)", ":a:b::c"),
['', ':', 'a', ':b::', 'c'])
@@ -237,13 +239,34 @@
self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
['', 'a', '', '', 'c'])
+ for sep, expected in [
+ (':*', ['', 'a', 'b', 'c']),
+ ('(?::*)', ['', 'a', 'b', 'c']),
+ ('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']),
+ ('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']),
+ ]:
+ with check_py3k_warnings(('', FutureWarning)):
+ self.assertEqual(re.split(sep, ':a:b::c'), expected)
+
+ for sep, expected in [
+ ('', [':a:b::c']),
+ (r'\b', [':a:b::c']),
+ (r'(?=:)', [':a:b::c']),
+ (r'(?<=:)', [':a:b::c']),
+ ]:
+ with check_py3k_warnings():
+ self.assertEqual(re.split(sep, ':a:b::c'), expected)
+
def test_qualified_re_split(self):
self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c'])
self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d'])
self.assertEqual(re.split("(:)", ":a:b::c", 2),
['', ':', 'a', ':', 'b::c'])
- self.assertEqual(re.split("(:*)", ":a:b::c", 2),
+ self.assertEqual(re.split("(:+)", ":a:b::c", 2),
['', ':', 'a', ':', 'b::c'])
+ with check_py3k_warnings(('', FutureWarning)):
+ self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
+ ['', ':', 'a', ':', 'b::c'])
def test_re_findall(self):
self.assertEqual(re.findall(":+", "abc"), [])
@@ -404,6 +427,29 @@
self.assertEqual(re.search(r"\d\D\w\W\s\S",
"1aa! a", re.UNICODE).group(0), "1aa! a")
+ def test_other_escapes(self):
+ self.assertRaises(re.error, re.compile, "\\")
+ self.assertEqual(re.match(r"\(", '(').group(), '(')
+ self.assertIsNone(re.match(r"\(", ')'))
+ self.assertEqual(re.match(r"\\", '\\').group(), '\\')
+ self.assertEqual(re.match(r"[\]]", ']').group(), ']')
+ self.assertIsNone(re.match(r"[\]]", '['))
+ self.assertEqual(re.match(r"[a\-c]", '-').group(), '-')
+ self.assertIsNone(re.match(r"[a\-c]", 'b'))
+ self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^')
+ self.assertIsNone(re.match(r"[\^a]+", 'b'))
+ re.purge() # for warnings
+ for c in 'ceghijklmopquyzCEFGHIJKLMNOPQRTUVXY':
+ warn = FutureWarning if c in 'Uu' else DeprecationWarning
+ with check_py3k_warnings(('', warn)):
+ self.assertEqual(re.match('\\%c$' % c, c).group(), c)
+ self.assertIsNone(re.match('\\%c' % c, 'a'))
+ for c in 'ceghijklmopquyzABCEFGHIJKLMNOPQRTUVXYZ':
+ warn = FutureWarning if c in 'Uu' else DeprecationWarning
+ with check_py3k_warnings(('', warn)):
+ self.assertEqual(re.match('[\\%c]$' % c, c).group(), c)
+ self.assertIsNone(re.match('[\\%c]' % c, 'a'))
+
def test_string_boundaries(self):
# See http://bugs.python.org/issue10713
self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
@@ -931,6 +977,19 @@
self.assertTrue(re.match('(?ixu) ' + upper_char, lower_char))
self.assertTrue(re.match('(?ixu) ' + lower_char, upper_char))
+ # Incompatibilities
+ re.purge()
+ with check_py3k_warnings():
+ re.compile('', re.LOCALE|re.UNICODE)
+ with check_py3k_warnings():
+ re.compile('(?L)', re.UNICODE)
+ with check_py3k_warnings():
+ re.compile('(?u)', re.LOCALE)
+ with check_py3k_warnings():
+ re.compile('(?Lu)')
+ with check_py3k_warnings():
+ re.compile('(?uL)')
+
def test_dollar_matches_twice(self):
"$ matches the end of string, and just before the terminating \n"
pattern = re.compile('$')
@@ -967,8 +1026,9 @@
def test_bug_13899(self):
# Issue #13899: re pattern r"[\A]" should work like "A" but matches
# nothing. Ditto B and Z.
- self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
- ['A', 'B', '\b', 'C', 'Z'])
+ with check_py3k_warnings():
+ self.assertEqual(re.findall(r'[\A\B\b\C\Z]', 'AB\bCZ'),
+ ['A', 'B', '\b', 'C', 'Z'])
@precisionbigmemtest(size=_2G, memuse=1)
def test_large_search(self, size):
@@ -1261,7 +1321,11 @@
def test_main():
run_unittest(ReTests)
- run_re_tests()
+ deprecations = [
+ ('bad escape', DeprecationWarning),
+ ]
+ with check_py3k_warnings(*deprecations):
+ run_re_tests()
if __name__ == "__main__":
test_main()
diff --git a/Misc/NEWS b/Misc/NEWS
index dd6ec1b..3f34c6c 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -42,6 +42,10 @@
Library
-------
+- bpo-30363: Running Python with the -3 option now warns about regular
+ expression syntax that is invalid or has different semantics in Python 3,
+ or whose behavior will change in future Python versions.
+
- bpo-30365: Running Python with the -3 option now emits deprecation warnings
for getchildren() and getiterator() methods of the Element class in the
xml.etree.cElementTree module and when pass the html argument to
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 8e16c1d..6fd3aff 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -2267,6 +2267,20 @@
if (!string)
return NULL;
+ if (Py_Py3kWarningFlag &&
+ (self->code[0] != SRE_OP_INFO || self->code[3] == 0))
+ {
+ if (self->code[0] == SRE_OP_INFO && self->code[4] == 0) {
+ if (PyErr_WarnPy3k("split() requires a non-empty pattern match.",
+ 1) < 0)
+ return NULL;
+ }
+ else if (PyErr_WarnEx(PyExc_FutureWarning,
+ "split() requires a non-empty pattern match.",
+ 1) < 0)
+ return NULL;
+ }
+
string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
if (!string)
return NULL;