bpo-30215: Make re.compile() locale agnostic. (#1361)
Compiled regular expression objects with the re.LOCALE flag no longer
depend on the locale at compile time. Only the locale in effect at
matching time affects the result of matching.
diff --git a/Lib/re.py b/Lib/re.py
index 7053edd..d0ee5db 100644
--- a/Lib/re.py
+++ b/Lib/re.py
@@ -268,9 +268,7 @@
def _compile(pattern, flags):
# internal: compile pattern
try:
- p, loc = _cache[type(pattern), pattern, flags]
- if loc is None or loc == _locale.setlocale(_locale.LC_CTYPE):
- return p
+ return _cache[type(pattern), pattern, flags]
except KeyError:
pass
if isinstance(pattern, _pattern_type):
@@ -284,13 +282,7 @@
if not (flags & DEBUG):
if len(_cache) >= _MAXCACHE:
_cache.clear()
- if p.flags & LOCALE:
- if not _locale:
- return p
- loc = _locale.setlocale(_locale.LC_CTYPE)
- else:
- loc = None
- _cache[type(pattern), pattern, flags] = p, loc
+ _cache[type(pattern), pattern, flags] = p
return p
@functools.lru_cache(_MAXCACHE)
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 2cc3900..d7ee4e8 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -78,7 +78,13 @@
fixes = None
for op, av in pattern:
if op in LITERAL_CODES:
- if flags & SRE_FLAG_IGNORECASE:
+ if not flags & SRE_FLAG_IGNORECASE:
+ emit(op)
+ emit(av)
+ elif flags & SRE_FLAG_LOCALE:
+ emit(OP_LOC_IGNORE[op])
+ emit(av)
+ else:
lo = _sre.getlower(av, flags)
if fixes and lo in fixes:
emit(IN_IGNORE)
@@ -93,17 +99,17 @@
else:
emit(OP_IGNORE[op])
emit(lo)
- else:
- emit(op)
- emit(av)
elif op is IN:
- if flags & SRE_FLAG_IGNORECASE:
- emit(OP_IGNORE[op])
- def fixup(literal, flags=flags):
- return _sre.getlower(literal, flags)
- else:
+ if not flags & SRE_FLAG_IGNORECASE:
emit(op)
fixup = None
+ elif flags & SRE_FLAG_LOCALE:
+ emit(IN_LOC_IGNORE)
+ fixup = None
+ else:
+ emit(IN_IGNORE)
+ def fixup(literal, flags=flags):
+ return _sre.getlower(literal, flags)
skip = _len(code); emit(0)
_compile_charset(av, flags, code, fixup, fixes)
code[skip] = _len(code) - skip
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
index fc684ae..b016431 100644
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -13,7 +13,7 @@
# update when constants are added or removed
-MAGIC = 20140917
+MAGIC = 20170530
from _sre import MAXREPEAT, MAXGROUPS
@@ -87,6 +87,9 @@
SUBPATTERN
MIN_REPEAT_ONE
RANGE_IGNORE
+ LITERAL_LOC_IGNORE
+ NOT_LITERAL_LOC_IGNORE
+ IN_LOC_IGNORE
MIN_REPEAT MAX_REPEAT
""")
@@ -124,6 +127,11 @@
RANGE: RANGE_IGNORE,
}
+OP_LOC_IGNORE = {
+ LITERAL: LITERAL_LOC_IGNORE,
+ NOT_LITERAL: NOT_LITERAL_LOC_IGNORE,
+}
+
AT_MULTILINE = {
AT_BEGINNING: AT_BEGINNING_LINE,
AT_END: AT_END_LINE
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index da5c953..7601dc8 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -1730,6 +1730,38 @@
self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
+ def test_locale_compiled(self):
+ oldlocale = locale.setlocale(locale.LC_CTYPE)
+ self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
+ for loc in 'en_US.iso88591', 'en_US.utf8':
+ try:
+ locale.setlocale(locale.LC_CTYPE, loc)
+ except locale.Error:
+ # Unsupported locale on this system
+ self.skipTest('test needs %s locale' % loc)
+
+ locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
+ p1 = re.compile(b'\xc5\xe5', re.L|re.I)
+ p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I)
+ p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I)
+ p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I)
+ for p in p1, p2, p3:
+ self.assertTrue(p.match(b'\xc5\xe5'))
+ self.assertTrue(p.match(b'\xe5\xe5'))
+ self.assertTrue(p.match(b'\xc5\xc5'))
+ self.assertIsNone(p4.match(b'\xe5\xc5'))
+ self.assertIsNone(p4.match(b'\xe5\xe5'))
+ self.assertIsNone(p4.match(b'\xc5\xc5'))
+
+ locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
+ for p in p1, p2, p3:
+ self.assertTrue(p.match(b'\xc5\xe5'))
+ self.assertIsNone(p.match(b'\xe5\xe5'))
+ self.assertIsNone(p.match(b'\xc5\xc5'))
+ self.assertTrue(p4.match(b'\xe5\xc5'))
+ self.assertIsNone(p4.match(b'\xe5\xe5'))
+ self.assertIsNone(p4.match(b'\xc5\xc5'))
+
def test_error(self):
with self.assertRaises(re.error) as cm:
re.compile('(\u20ac))')