bpo-30215: Make re.compile() locale agnostic. (#1361)

Compiled regular expression objects with the re.LOCALE flag no longer
depend on the locale at compile time.  Only the locale at matching
time affects the result of matching.
diff --git a/Doc/library/re.rst b/Doc/library/re.rst
index 0fa7196..131f372 100644
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -559,6 +559,11 @@
       :const:`re.LOCALE` can be used only with bytes patterns and is
       not compatible with :const:`re.ASCII`.
 
+   .. versionchanged:: 3.7
+      Compiled regular expression objects with the :const:`re.LOCALE` flag no
+      longer depend on the locale at compile time.  Only the locale at
+      matching time affects the result of matching.
+
 
 .. data:: M
           MULTILINE
diff --git a/Lib/re.py b/Lib/re.py
index 7053edd..d0ee5db 100644
--- a/Lib/re.py
+++ b/Lib/re.py
@@ -268,9 +268,7 @@
 def _compile(pattern, flags):
     # internal: compile pattern
     try:
-        p, loc = _cache[type(pattern), pattern, flags]
-        if loc is None or loc == _locale.setlocale(_locale.LC_CTYPE):
-            return p
+        return _cache[type(pattern), pattern, flags]
     except KeyError:
         pass
     if isinstance(pattern, _pattern_type):
@@ -284,13 +282,7 @@
     if not (flags & DEBUG):
         if len(_cache) >= _MAXCACHE:
             _cache.clear()
-        if p.flags & LOCALE:
-            if not _locale:
-                return p
-            loc = _locale.setlocale(_locale.LC_CTYPE)
-        else:
-            loc = None
-        _cache[type(pattern), pattern, flags] = p, loc
+        _cache[type(pattern), pattern, flags] = p
     return p
 
 @functools.lru_cache(_MAXCACHE)
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 2cc3900..d7ee4e8 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -78,7 +78,13 @@
         fixes = None
     for op, av in pattern:
         if op in LITERAL_CODES:
-            if flags & SRE_FLAG_IGNORECASE:
+            if not flags & SRE_FLAG_IGNORECASE:
+                emit(op)
+                emit(av)
+            elif flags & SRE_FLAG_LOCALE:
+                emit(OP_LOC_IGNORE[op])
+                emit(av)
+            else:
                 lo = _sre.getlower(av, flags)
                 if fixes and lo in fixes:
                     emit(IN_IGNORE)
@@ -93,17 +99,17 @@
                 else:
                     emit(OP_IGNORE[op])
                     emit(lo)
-            else:
-                emit(op)
-                emit(av)
         elif op is IN:
-            if flags & SRE_FLAG_IGNORECASE:
-                emit(OP_IGNORE[op])
-                def fixup(literal, flags=flags):
-                    return _sre.getlower(literal, flags)
-            else:
+            if not flags & SRE_FLAG_IGNORECASE:
                 emit(op)
                 fixup = None
+            elif flags & SRE_FLAG_LOCALE:
+                emit(IN_LOC_IGNORE)
+                fixup = None
+            else:
+                emit(IN_IGNORE)
+                def fixup(literal, flags=flags):
+                    return _sre.getlower(literal, flags)
             skip = _len(code); emit(0)
             _compile_charset(av, flags, code, fixup, fixes)
             code[skip] = _len(code) - skip
diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
index fc684ae..b016431 100644
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py
@@ -13,7 +13,7 @@
 
 # update when constants are added or removed
 
-MAGIC = 20140917
+MAGIC = 20170530
 
 from _sre import MAXREPEAT, MAXGROUPS
 
@@ -87,6 +87,9 @@
     SUBPATTERN
     MIN_REPEAT_ONE
     RANGE_IGNORE
+    LITERAL_LOC_IGNORE
+    NOT_LITERAL_LOC_IGNORE
+    IN_LOC_IGNORE
 
     MIN_REPEAT MAX_REPEAT
 """)
@@ -124,6 +127,11 @@
     RANGE: RANGE_IGNORE,
 }
 
+OP_LOC_IGNORE = {
+    LITERAL: LITERAL_LOC_IGNORE,
+    NOT_LITERAL: NOT_LITERAL_LOC_IGNORE,
+}
+
 AT_MULTILINE = {
     AT_BEGINNING: AT_BEGINNING_LINE,
     AT_END: AT_END_LINE
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index da5c953..7601dc8 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -1730,6 +1730,38 @@
         self.assertIsNone(re.match(b'(?Li)\xc5', b'\xe5'))
         self.assertIsNone(re.match(b'(?Li)\xe5', b'\xc5'))
 
+    def test_locale_compiled(self):
+        oldlocale = locale.setlocale(locale.LC_CTYPE)
+        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
+        for loc in 'en_US.iso88591', 'en_US.utf8':
+            try:
+                locale.setlocale(locale.LC_CTYPE, loc)
+            except locale.Error:
+                # Unsupported locale on this system
+                self.skipTest('test needs %s locale' % loc)
+
+        locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')
+        p1 = re.compile(b'\xc5\xe5', re.L|re.I)
+        p2 = re.compile(b'[a\xc5][a\xe5]', re.L|re.I)
+        p3 = re.compile(b'[az\xc5][az\xe5]', re.L|re.I)
+        p4 = re.compile(b'[^\xc5][^\xe5]', re.L|re.I)
+        for p in p1, p2, p3:
+            self.assertTrue(p.match(b'\xc5\xe5'))
+            self.assertTrue(p.match(b'\xe5\xe5'))
+            self.assertTrue(p.match(b'\xc5\xc5'))
+        self.assertIsNone(p4.match(b'\xe5\xc5'))
+        self.assertIsNone(p4.match(b'\xe5\xe5'))
+        self.assertIsNone(p4.match(b'\xc5\xc5'))
+
+        locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')
+        for p in p1, p2, p3:
+            self.assertTrue(p.match(b'\xc5\xe5'))
+            self.assertIsNone(p.match(b'\xe5\xe5'))
+            self.assertIsNone(p.match(b'\xc5\xc5'))
+        self.assertTrue(p4.match(b'\xe5\xc5'))
+        self.assertIsNone(p4.match(b'\xe5\xe5'))
+        self.assertIsNone(p4.match(b'\xc5\xc5'))
+
     def test_error(self):
         with self.assertRaises(re.error) as cm:
             re.compile('(\u20ac))')
diff --git a/Misc/NEWS b/Misc/NEWS
index d76c76b..f2c1994 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -317,6 +317,10 @@
 Library
 -------
 
+- bpo-30215: Compiled regular expression objects with the re.LOCALE flag no
+  longer depend on the locale at compile time.  Only the locale at matching
+  time affects the result of matching.
+
 - bpo-30185: Avoid KeyboardInterrupt tracebacks in forkserver helper process
   when Ctrl-C is received.
 
diff --git a/Modules/_sre.c b/Modules/_sre.c
index 03a138e..afb2bce 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c
@@ -1588,6 +1588,8 @@
         case SRE_OP_NOT_LITERAL:
         case SRE_OP_LITERAL_IGNORE:
         case SRE_OP_NOT_LITERAL_IGNORE:
+        case SRE_OP_LITERAL_LOC_IGNORE:
+        case SRE_OP_NOT_LITERAL_LOC_IGNORE:
             GET_ARG;
             /* The arg is just a character, nothing to check */
             break;
@@ -1625,6 +1627,7 @@
 
         case SRE_OP_IN:
         case SRE_OP_IN_IGNORE:
+        case SRE_OP_IN_LOC_IGNORE:
             GET_SKIP;
             /* Stop 1 before the end; we check the FAILURE below */
             if (!_validate_charset(code, code+skip-2))
diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h
index 6632442..6d6d21e 100644
--- a/Modules/sre_constants.h
+++ b/Modules/sre_constants.h
@@ -11,7 +11,7 @@
  * See the _sre.c file for information on usage and redistribution.
  */
 
-#define SRE_MAGIC 20140917
+#define SRE_MAGIC 20170530
 #define SRE_OP_FAILURE 0
 #define SRE_OP_SUCCESS 1
 #define SRE_OP_ANY 2
@@ -45,6 +45,9 @@
 #define SRE_OP_SUBPATTERN 30
 #define SRE_OP_MIN_REPEAT_ONE 31
 #define SRE_OP_RANGE_IGNORE 32
+#define SRE_OP_LITERAL_LOC_IGNORE 33
+#define SRE_OP_NOT_LITERAL_LOC_IGNORE 34
+#define SRE_OP_IN_LOC_IGNORE 35
 #define SRE_AT_BEGINNING 0
 #define SRE_AT_BEGINNING_LINE 1
 #define SRE_AT_BEGINNING_STRING 2
diff --git a/Modules/sre_lib.h b/Modules/sre_lib.h
index 0865fc6..b540d21 100644
--- a/Modules/sre_lib.h
+++ b/Modules/sre_lib.h
@@ -101,6 +101,14 @@
 }
 
 LOCAL(int)
+SRE(char_loc_ignore)(SRE_STATE* state, SRE_CODE pattern, SRE_CODE ch)
+{
+    return ch == pattern
+        || (SRE_CODE) state->lower(ch) == pattern
+        || (SRE_CODE) state->upper(ch) == pattern;
+}
+
+LOCAL(int)
 SRE(charset)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
 {
     /* check if character is a member of the given set */
@@ -187,6 +195,18 @@
     }
 }
 
+LOCAL(int)
+SRE(charset_loc_ignore)(SRE_STATE* state, SRE_CODE* set, SRE_CODE ch)
+{
+    SRE_CODE lo, up;
+    lo = state->lower(ch);
+    if (SRE(charset)(state, set, lo))
+       return 1;
+
+    up = state->upper(ch);
+    return up != lo && SRE(charset)(state, set, up);
+}
+
 LOCAL(Py_ssize_t) SRE(match)(SRE_STATE* state, SRE_CODE* pattern, int match_all);
 
 LOCAL(Py_ssize_t)
@@ -247,6 +267,14 @@
             ptr++;
         break;
 
+    case SRE_OP_LITERAL_LOC_IGNORE:
+        /* repeated literal (IGNORECASE combined with LOCALE) */
+        chr = pattern[1];
+        TRACE(("|%p|%p|COUNT LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
+        while (ptr < end && SRE(char_loc_ignore)(state, chr, *ptr))
+            ptr++;
+        break;
+
     case SRE_OP_NOT_LITERAL:
         /* repeated non-literal */
         chr = pattern[1];
@@ -269,6 +297,14 @@
             ptr++;
         break;
 
+    case SRE_OP_NOT_LITERAL_LOC_IGNORE:
+        /* repeated non-literal (IGNORECASE combined with LOCALE) */
+        chr = pattern[1];
+        TRACE(("|%p|%p|COUNT NOT_LITERAL_LOC_IGNORE %d\n", pattern, ptr, chr));
+        while (ptr < end && !SRE(char_loc_ignore)(state, chr, *ptr))
+            ptr++;
+        break;
+
     default:
         /* repeated single character pattern */
         TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
@@ -651,7 +687,17 @@
             TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
                    ctx->pattern, ctx->ptr, ctx->pattern[0]));
             if (ctx->ptr >= end ||
-                state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
+                state->lower(*ctx->ptr) != *ctx->pattern)
+                RETURN_FAILURE;
+            ctx->pattern++;
+            ctx->ptr++;
+            break;
+
+        case SRE_OP_LITERAL_LOC_IGNORE:
+            TRACE(("|%p|%p|LITERAL_LOC_IGNORE %d\n",
+                   ctx->pattern, ctx->ptr, ctx->pattern[0]));
+            if (ctx->ptr >= end
+                || !SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr))
                 RETURN_FAILURE;
             ctx->pattern++;
             ctx->ptr++;
@@ -661,7 +707,17 @@
             TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
                    ctx->pattern, ctx->ptr, *ctx->pattern));
             if (ctx->ptr >= end ||
-                state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
+                state->lower(*ctx->ptr) == *ctx->pattern)
+                RETURN_FAILURE;
+            ctx->pattern++;
+            ctx->ptr++;
+            break;
+
+        case SRE_OP_NOT_LITERAL_LOC_IGNORE:
+            TRACE(("|%p|%p|NOT_LITERAL_LOC_IGNORE %d\n",
+                   ctx->pattern, ctx->ptr, *ctx->pattern));
+            if (ctx->ptr >= end
+                || SRE(char_loc_ignore)(state, *ctx->pattern, *ctx->ptr))
                 RETURN_FAILURE;
             ctx->pattern++;
             ctx->ptr++;
@@ -677,6 +733,15 @@
             ctx->ptr++;
             break;
 
+        case SRE_OP_IN_LOC_IGNORE:
+            TRACE(("|%p|%p|IN_LOC_IGNORE\n", ctx->pattern, ctx->ptr));
+            if (ctx->ptr >= end
+                || !SRE(charset_loc_ignore)(state, ctx->pattern+1, *ctx->ptr))
+                RETURN_FAILURE;
+            ctx->pattern += ctx->pattern[0];
+            ctx->ptr++;
+            break;
+
         case SRE_OP_JUMP:
         case SRE_OP_INFO:
             /* jump forward */