#2834: Change re module semantics, so that str and bytes mixing is forbidden, and str (unicode) patterns get full unicode matching by default. The re.ASCII flag is also introduced to ask for ASCII matching instead.

commit: fd036451bf0e0ade8783e21df801abf7be96d020 [log] [tgz]
author: Antoine Pitrou <solipsis@pitrou.net> Tue Aug 19 17:56:33 2008 +0000
committer: Antoine Pitrou <solipsis@pitrou.net> Tue Aug 19 17:56:33 2008 +0000
tree: e70ff65a9e641d8e790bc091f0dc2507baf344ca
parent: 3ad7ba10a20827b24d4b1aa9dd49474db8affbdd [diff]
diff --git a/Lib/_strptime.py b/Lib/_strptime.py
index c3568b0..896c798 100644
--- a/Lib/_strptime.py
+++ b/Lib/_strptime.py

@@ -14,7 +14,7 @@
 import locale
 import calendar
 from re import compile as re_compile
-from re import IGNORECASE
+from re import IGNORECASE, ASCII
 from re import escape as re_escape
 from datetime import date as datetime_date
 try:
@@ -262,7 +262,7 @@
 
     def compile(self, format):
         """Return a compiled re object for the format string."""
-        return re_compile(self.pattern(format), IGNORECASE)
+        return re_compile(self.pattern(format), IGNORECASE | ASCII)
 
 _cache_lock = _thread_allocate_lock()
 # DO NOT modify _TimeRE_cache or _regex_cache without acquiring the cache lock

diff --git a/Lib/base64.py b/Lib/base64.py
index fc80835..4308fb4 100755
--- a/Lib/base64.py
+++ b/Lib/base64.py

@@ -39,7 +39,7 @@
     return s.translate(translation)
 
 
-
+
 # Base64 encoding/decoding uses binascii
 
 def b64encode(s, altchars=None):
@@ -126,7 +126,7 @@
     return b64decode(s, b'-_')
 
 
-
+
 # Base32 encoding/decoding must be done in Python
 _b32alphabet = {
     0: b'A',  9: b'J', 18: b'S', 27: b'3',
@@ -225,7 +225,7 @@
     # characters because this will tell us how many null bytes to remove from
     # the end of the decoded string.
     padchars = 0
-    mo = re.search('(?P<pad>[=]*)$', s)
+    mo = re.search(b'(?P<pad>[=]*)$', s)
     if mo:
         padchars = len(mo.group('pad'))
         if padchars > 0:
@@ -262,7 +262,7 @@
     return b''.join(parts)
 
 
-
+
 # RFC 3548, Base 16 Alphabet specifies uppercase, but hexlify() returns
 # lowercase.  The RFC also recommends against accepting input case
 # insensitively.
@@ -291,12 +291,12 @@
         raise TypeError("expected bytes, not %s" % s.__class__.__name__)
     if casefold:
         s = s.upper()
-    if re.search('[^0-9A-F]', s):
+    if re.search(b'[^0-9A-F]', s):
         raise binascii.Error('Non-base16 digit found')
     return binascii.unhexlify(s)
 
 
-
+
 # Legacy interface.  This code could be cleaned up since I don't believe
 # binascii has any line length limitations.  It just doesn't seem worth it
 # though.  The files should be opened in binary mode.
@@ -353,7 +353,7 @@
     return binascii.a2b_base64(s)
 
 
-
+
 # Usable as a script...
 def main():
     """Small main program"""

diff --git a/Lib/decimal.py b/Lib/decimal.py
index 2c02f11..88cc5cd 100644
--- a/Lib/decimal.py
+++ b/Lib/decimal.py

@@ -5415,7 +5415,7 @@
 # 2. For finite numbers (not infinities and NaNs) the body of the
 # number between the optional sign and the optional exponent must have
 # at least one decimal digit, possibly after the decimal point.  The
-# lookahead expression '(?=\d|\.\d)' checks this.
+# lookahead expression '(?=[0-9]|\.[0-9])' checks this.
 #
 # As the flag UNICODE is not enabled here, we're explicitly avoiding any
 # other meaning for \d than the numbers [0-9].

diff --git a/Lib/distutils/cygwinccompiler.py b/Lib/distutils/cygwinccompiler.py
index 48875230..da2c74a 100644
--- a/Lib/distutils/cygwinccompiler.py
+++ b/Lib/distutils/cygwinccompiler.py

@@ -409,7 +409,7 @@
         out = os.popen(gcc_exe + ' -dumpversion','r')
         out_string = out.read()
         out.close()
-        result = re.search('(\d+\.\d+(\.\d+)*)',out_string)
+        result = re.search('(\d+\.\d+(\.\d+)*)', out_string, re.ASCII)
         if result:
             gcc_version = StrictVersion(result.group(1))
         else:
@@ -421,7 +421,7 @@
         out = os.popen(ld_exe + ' -v','r')
         out_string = out.read()
         out.close()
-        result = re.search('(\d+\.\d+(\.\d+)*)',out_string)
+        result = re.search('(\d+\.\d+(\.\d+)*)', out_string, re.ASCII)
         if result:
             ld_version = StrictVersion(result.group(1))
         else:
@@ -433,7 +433,7 @@
         out = os.popen(dllwrap_exe + ' --version','r')
         out_string = out.read()
         out.close()
-        result = re.search(' (\d+\.\d+(\.\d+)*)',out_string)
+        result = re.search(' (\d+\.\d+(\.\d+)*)', out_string, re.ASCII)
         if result:
             dllwrap_version = StrictVersion(result.group(1))
         else:

diff --git a/Lib/distutils/emxccompiler.py b/Lib/distutils/emxccompiler.py
index d9ee82d..62a4c5b 100644
--- a/Lib/distutils/emxccompiler.py
+++ b/Lib/distutils/emxccompiler.py

@@ -300,7 +300,7 @@
         out = os.popen(gcc_exe + ' -dumpversion','r')
         out_string = out.read()
         out.close()
-        result = re.search('(\d+\.\d+\.\d+)',out_string)
+        result = re.search('(\d+\.\d+\.\d+)', out_string, re.ASCII)
         if result:
             gcc_version = StrictVersion(result.group(1))
         else:

diff --git a/Lib/distutils/sysconfig.py b/Lib/distutils/sysconfig.py
index 3a120dd..b17743a 100644
--- a/Lib/distutils/sysconfig.py
+++ b/Lib/distutils/sysconfig.py

@@ -512,7 +512,7 @@
                         # patched up as well.
                         'CFLAGS', 'PY_CFLAGS', 'BLDSHARED'):
                     flags = _config_vars[key]
-                    flags = re.sub('-arch\s+\w+\s', ' ', flags)
+                    flags = re.sub('-arch\s+\w+\s', ' ', flags, re.ASCII)
                     flags = re.sub('-isysroot [^ \t]*', ' ', flags)
                     _config_vars[key] = flags
 

diff --git a/Lib/distutils/util.py b/Lib/distutils/util.py
index 76798b9..b87dfbe 100644
--- a/Lib/distutils/util.py
+++ b/Lib/distutils/util.py

@@ -81,7 +81,7 @@
         return "%s-%s.%s" % (osname, version, release)
     elif osname[:6] == "cygwin":
         osname = "cygwin"
-        rel_re = re.compile (r'[\d.]+')
+        rel_re = re.compile (r'[\d.]+', re.ASCII)
         m = rel_re.match(release)
         if m:
             release = m.group()

diff --git a/Lib/distutils/version.py b/Lib/distutils/version.py
index f71b2f6..907f71c 100644
--- a/Lib/distutils/version.py
+++ b/Lib/distutils/version.py

@@ -134,7 +134,7 @@
     """
 
     version_re = re.compile(r'^(\d+) \. (\d+) (\. (\d+))? ([ab](\d+))?$',
-                            re.VERBOSE)
+                            re.VERBOSE | re.ASCII)
 
 
     def parse (self, vstring):

diff --git a/Lib/distutils/versionpredicate.py b/Lib/distutils/versionpredicate.py
index 434b34f..b0dd9f4 100644
--- a/Lib/distutils/versionpredicate.py
+++ b/Lib/distutils/versionpredicate.py

@@ -5,7 +5,8 @@
 import operator
 
 
-re_validPackage = re.compile(r"(?i)^\s*([a-z_]\w*(?:\.[a-z_]\w*)*)(.*)")
+re_validPackage = re.compile(r"(?i)^\s*([a-z_]\w*(?:\.[a-z_]\w*)*)(.*)",
+    re.ASCII)
 # (package) (rest)
 
 re_paren = re.compile(r"^\s*\((.*)\)\s*$") # (list) inside of parentheses
@@ -153,7 +154,8 @@
     global _provision_rx
     if _provision_rx is None:
         _provision_rx = re.compile(
-            "([a-zA-Z_]\w*(?:\.[a-zA-Z_]\w*)*)(?:\s*\(\s*([^)\s]+)\s*\))?$")
+            "([a-zA-Z_]\w*(?:\.[a-zA-Z_]\w*)*)(?:\s*\(\s*([^)\s]+)\s*\))?$",
+            re.ASCII)
     value = value.strip()
     m = _provision_rx.match(value)
     if not m:

diff --git a/Lib/email/quoprimime.py b/Lib/email/quoprimime.py
index 68dc11c..ca91631 100644
--- a/Lib/email/quoprimime.py
+++ b/Lib/email/quoprimime.py

@@ -70,7 +70,7 @@
     _QUOPRI_BODY_MAP[c] = chr(c)
 
 
-
+
 # Helpers
 def header_check(octet):
     """Return True if the octet should be escaped with header quopri."""
@@ -125,7 +125,7 @@
     return '=%02X' % ord(c)
 
 
-
+
 def header_encode(header_bytes, charset='iso-8859-1'):
     """Encode a single header line with quoted-printable (like) encoding.
 
@@ -149,7 +149,7 @@
     return '=?%s?q?%s?=' % (charset, EMPTYSTRING.join(encoded))
 
 
-
+
 def body_encode(body, maxlinelen=76, eol=NL):
     """Encode with quoted-printable, wrapping at maxlinelen characters.
 
@@ -225,7 +225,7 @@
     return encoded_body
 
 
-
+
 # BAW: I'm not sure if the intent was for the signature of this function to be
 # the same as base64MIME.decode() or not...
 def decode(encoded, eol=NL):
@@ -280,7 +280,7 @@
 decodestring = decode
 
 
-
+
 def _unquote_match(match):
     """Turn a match in the form =AB to the ASCII character with value 0xab"""
     s = match.group(0)
@@ -296,4 +296,4 @@
     the high level email.Header class for that functionality.
     """
     s = s.replace('_', ' ')
-    return re.sub(r'=\w{2}', _unquote_match, s)
+    return re.sub(r'=\w{2}', _unquote_match, s, re.ASCII)

diff --git a/Lib/email/utils.py b/Lib/email/utils.py
index 35275f6..465903f 100644
--- a/Lib/email/utils.py
+++ b/Lib/email/utils.py

@@ -52,7 +52,7 @@
 escapesre = re.compile(r'[][\\()"]')
 
 
-
+
 # Helpers
 
 def formataddr(pair):
@@ -73,7 +73,7 @@
     return address
 
 
-
+
 def getaddresses(fieldvalues):
     """Return a list of (REALNAME, EMAIL) for each fieldvalue."""
     all = COMMASPACE.join(fieldvalues)
@@ -81,7 +81,7 @@
     return a.addresslist
 
 
-
+
 ecre = re.compile(r'''
   =\?                   # literal =?
   (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
@@ -93,7 +93,7 @@
   ''', re.VERBOSE | re.IGNORECASE)
 
 
-
+
 def formatdate(timeval=None, localtime=False, usegmt=False):
     """Returns a date string as specified by RFC 2822, e.g.:
 
@@ -146,7 +146,7 @@
         zone)
 
 
-
+
 def make_msgid(idstring=None):
     """Returns a string suitable for RFC 2822 compliant Message-ID, e.g:
 
@@ -168,7 +168,7 @@
     return msgid
 
 
-
+
 # These functions are in the standalone mimelib version only because they've
 # subsequently been fixed in the latest Python versions.  We use this to worm
 # around broken older Pythons.
@@ -202,7 +202,7 @@
     return str
 
 
-
+
 # RFC2231-related functions - parameter encoding and decoding
 def decode_rfc2231(s):
     """Decode string according to RFC 2231"""
@@ -227,7 +227,8 @@
     return "%s'%s'%s" % (charset, language, s)
 
 
-rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$')
+rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$',
+    re.ASCII)
 
 def decode_params(params):
     """Decode parameters list according to RFC 2231.

diff --git a/Lib/encodings/idna.py b/Lib/encodings/idna.py
index c2ba1da..583bdf1 100644
--- a/Lib/encodings/idna.py
+++ b/Lib/encodings/idna.py

@@ -176,12 +176,10 @@
             return "", 0
 
         # IDNA allows decoding to operate on Unicode strings, too.
-        if isinstance(input, bytes):
-            labels = dots.split(input)
-        else:
-            # Force to bytes
+        if not isinstance(input, bytes):
+            # XXX obviously wrong, see #3232
             input = bytes(input)
-            labels = input.split(b".")
+        labels = input.split(b".")
 
         if labels and len(labels[-1]) == 0:
             trailing_dot = '.'

diff --git a/Lib/ftplib.py b/Lib/ftplib.py
index 4955727..b64e45e 100644
--- a/Lib/ftplib.py
+++ b/Lib/ftplib.py

@@ -590,7 +590,8 @@
     global _150_re
     if _150_re is None:
         import re
-        _150_re = re.compile("150 .* \((\d+) bytes\)", re.IGNORECASE)
+        _150_re = re.compile(
+            "150 .* \((\d+) bytes\)", re.IGNORECASE | re.ASCII)
     m = _150_re.match(resp)
     if not m:
         return None
@@ -613,7 +614,7 @@
     global _227_re
     if _227_re is None:
         import re
-        _227_re = re.compile(r'(\d+),(\d+),(\d+),(\d+),(\d+),(\d+)')
+        _227_re = re.compile(r'(\d+),(\d+),(\d+),(\d+),(\d+),(\d+)', re.ASCII)
     m = _227_re.search(resp)
     if not m:
         raise error_proto(resp)

diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 828eece..83a5825 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py

@@ -385,4 +385,4 @@
                     return '&'+s+';'
 
         return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
-                      replaceEntities, s)
+                      replaceEntities, s, re.ASCII)

diff --git a/Lib/http/cookiejar.py b/Lib/http/cookiejar.py
index afd5f20..e9efab8 100644
--- a/Lib/http/cookiejar.py
+++ b/Lib/http/cookiejar.py

@@ -121,7 +121,7 @@
 
 UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}
 
-TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
+TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII)
 def offset_from_tz_string(tz):
     offset = None
     if tz in UTC_ZONES:
@@ -191,9 +191,9 @@
 
 STRICT_DATE_RE = re.compile(
     r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
-    "(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
+    "(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII)
 WEEKDAY_RE = re.compile(
-    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
+    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII)
 LOOSE_HTTP_DATE_RE = re.compile(
     r"""^
     (\d\d?)            # day
@@ -210,7 +210,7 @@
     ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
        \s*
     (?:\(\w+\))?       # ASCII representation of timezone in parens.
-       \s*$""", re.X)
+       \s*$""", re.X | re.ASCII)
 def http2time(text):
     """Returns time in seconds since epoch of time represented by a string.
 
@@ -282,7 +282,7 @@
       \s*
    ([-+]?\d\d?:?(:?\d\d)?
     |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
-      \s*$""", re.X)
+      \s*$""", re.X | re. ASCII)
 def iso2time(text):
     """
     As for http2time, but parses the ISO 8601 formats:
@@ -489,7 +489,7 @@
     return result
 
 
-IPV4_RE = re.compile(r"\.\d+$")
+IPV4_RE = re.compile(r"\.\d+$", re.ASCII)
 def is_HDN(text):
     """Return True if text is a host domain name."""
     # XXX
@@ -574,7 +574,7 @@
         return True
     return False
 
-cut_port_re = re.compile(r":\d+$")
+cut_port_re = re.compile(r":\d+$", re.ASCII)
 def request_host(request):
     """Return request-host, as defined by RFC 2965.
 
@@ -1207,7 +1207,7 @@
     domain_re = re.compile(r"[^.]*")
     dots_re = re.compile(r"^\.+")
 
-    magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"
+    magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)
 
     def __init__(self, policy=None):
         if policy is None:
@@ -1856,7 +1856,7 @@
 
     def _really_load(self, f, filename, ignore_discard, ignore_expires):
         magic = f.readline()
-        if not re.search(self.magic_re, magic):
+        if not self.magic_re.search(magic):
             msg = ("%r does not look like a Set-Cookie3 (LWP) format "
                    "file" % filename)
             raise LoadError(msg)
@@ -1965,7 +1965,7 @@
     header by default (Mozilla can cope with that).
 
     """
-    magic_re = "#( Netscape)? HTTP Cookie File"
+    magic_re = re.compile("#( Netscape)? HTTP Cookie File")
     header = """\
     # Netscape HTTP Cookie File
     # http://www.netscape.com/newsref/std/cookie_spec.html
@@ -1977,7 +1977,7 @@
         now = time.time()
 
         magic = f.readline()
-        if not re.search(self.magic_re, magic):
+        if not self.magic_re.search(magic):
             f.close()
             raise LoadError(
                 "%r does not look like a Netscape format cookies file" %

diff --git a/Lib/http/cookies.py b/Lib/http/cookies.py
index 344bede..3242d83 100644
--- a/Lib/http/cookies.py
+++ b/Lib/http/cookies.py

@@ -445,7 +445,7 @@
     ""+ _LegalCharsPatt +"*"        # Any word or empty string
     r")"                          # End of group 'val'
     r"\s*;?"                      # Probably ending in a semi-colon
-    )
+    , re.ASCII)                   # May be removed if safe.
 
 
 # At long last, here is the cookie class.

diff --git a/Lib/imaplib.py b/Lib/imaplib.py
index 0d9704e..156b15d 100644
--- a/Lib/imaplib.py
+++ b/Lib/imaplib.py

@@ -88,11 +88,12 @@
         r' (?P<hour>[0-9][0-9]):(?P<min>[0-9][0-9]):(?P<sec>[0-9][0-9])'
         r' (?P<zonen>[-+])(?P<zoneh>[0-9][0-9])(?P<zonem>[0-9][0-9])'
         r'"')
-Literal = re.compile(r'.*{(?P<size>\d+)}$')
+Literal = re.compile(r'.*{(?P<size>\d+)}$', re.ASCII)
 MapCRLF = re.compile(r'\r\n|\r|\n')
 Response_code = re.compile(r'\[(?P<type>[A-Z-]+)( (?P<data>[^\]]*))?\]')
 Untagged_response = re.compile(r'\* (?P<type>[A-Z-]+)( (?P<data>.*))?')
-Untagged_status = re.compile(r'\* (?P<data>\d+) (?P<type>[A-Z-]+)( (?P<data2>.*))?')
+Untagged_status = re.compile(
+    r'\* (?P<data>\d+) (?P<type>[A-Z-]+)( (?P<data2>.*))?', re.ASCII)
 
 
 
@@ -146,7 +147,7 @@
     class abort(error): pass        # Service errors - close and retry
     class readonly(abort): pass     # Mailbox status changed to READ-ONLY
 
-    mustquote = re.compile(r"[^\w!#$%&'*+,.:;<=>?^`|~-]")
+    mustquote = re.compile(r"[^\w!#$%&'*+,.:;<=>?^`|~-]", re.ASCII)
 
     def __init__(self, host = '', port = IMAP4_PORT):
         self.debug = Debug
@@ -168,7 +169,7 @@
         self.tagpre = Int2AP(random.randint(4096, 65535))
         self.tagre = re.compile(r'(?P<tag>'
                         + self.tagpre
-                        + r'\d+) (?P<type>[A-Z]+) (?P<data>.*)')
+                        + r'\d+) (?P<type>[A-Z]+) (?P<data>.*)', re.ASCII)
 
         # Get server welcome message,
         # request and store CAPABILITY response.

diff --git a/Lib/json/decoder.py b/Lib/json/decoder.py
index 5283eae..f0bc245 100644
--- a/Lib/json/decoder.py
+++ b/Lib/json/decoder.py

@@ -67,7 +67,7 @@
         fn = getattr(context, 'parse_int', None) or int
         res = fn(integer)
     return res, None
-pattern(r'(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?')(JSONNumber)
+pattern(r'(-?(?:0|[1-9][0-9]*))(\.[0-9]+)?([eE][-+]?[0-9]+)?')(JSONNumber)
 
 
 STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)

diff --git a/Lib/logging/handlers.py b/Lib/logging/handlers.py
index 1e69120..19fb959 100644
--- a/Lib/logging/handlers.py
+++ b/Lib/logging/handlers.py

@@ -199,7 +199,7 @@
         else:
             raise ValueError("Invalid rollover interval specified: %s" % self.when)
 
-        self.extMatch = re.compile(self.extMatch)
+        self.extMatch = re.compile(self.extMatch, re.ASCII)
         self.interval = self.interval * interval # multiply by units requested
         self.rolloverAt = currentTime + self.interval
 

diff --git a/Lib/platform.py b/Lib/platform.py
index ff81d28..0182180 100755
--- a/Lib/platform.py
+++ b/Lib/platform.py

@@ -118,7 +118,7 @@
                           '|'
                           '(GLIBC_([0-9.]+))'
                           '|'
-                          '(libc(_\w+)?\.so(?:\.(\d[0-9.]*))?)')
+                          '(libc(_\w+)?\.so(?:\.(\d[0-9.]*))?)', re.ASCII)
 
 def libc_ver(executable=sys.executable,lib='',version='',
 
@@ -223,15 +223,15 @@
 
     return distname,version,id
 
-_release_filename = re.compile(r'(\w+)[-_](release|version)')
+_release_filename = re.compile(r'(\w+)[-_](release|version)', re.ASCII)
 _lsb_release_version = re.compile(r'(.+)'
                                    ' release '
                                    '([\d.]+)'
-                                   '[^(]*(?:\((.+)\))?')
+                                   '[^(]*(?:\((.+)\))?', re.ASCII)
 _release_version = re.compile(r'([^0-9]+)'
                                '(?: release )?'
                                '([\d.]+)'
-                               '[^(]*(?:\((.+)\))?')
+                               '[^(]*(?:\((.+)\))?', re.ASCII)
 
 # See also http://www.novell.com/coolsolutions/feature/11251.html
 # and http://linuxmafia.com/faq/Admin/release-files.html
@@ -464,7 +464,7 @@
 
 _ver_output = re.compile(r'(?:([\w ]+) ([\w.]+) '
                          '.*'
-                         'Version ([\d.]+))')
+                         'Version ([\d.]+))', re.ASCII)
 
 def _syscmd_ver(system='', release='', version='',
 
@@ -1253,16 +1253,16 @@
 _sys_version_parser = re.compile(
     r'([\w.+]+)\s*'
     '\(#?([^,]+),\s*([\w ]+),\s*([\w :]+)\)\s*'
-    '\[([^\]]+)\]?')
+    '\[([^\]]+)\]?', re.ASCII)
 
 _jython_sys_version_parser = re.compile(
-    r'([\d\.]+)')
+    r'([\d\.]+)', re.ASCII)
 
 _ironpython_sys_version_parser = re.compile(
     r'IronPython\s*'
     '([\d\.]+)'
     '(?: \(([\d\.]+)\))?'
-    ' on (.NET [\d\.]+)')
+    ' on (.NET [\d\.]+)', re.ASCII)
 
 _sys_version_cache = {}
 

diff --git a/Lib/plistlib.py b/Lib/plistlib.py
index b775324..83e5d04 100644
--- a/Lib/plistlib.py
+++ b/Lib/plistlib.py

@@ -147,7 +147,7 @@
 # Contents should conform to a subset of ISO 8601
 # (in particular, YYYY '-' MM '-' DD 'T' HH ':' MM ':' SS 'Z'.  Smaller units may be omitted with
 #  a loss of precision)
-_dateParser = re.compile(r"(?P<year>\d\d\d\d)(?:-(?P<month>\d\d)(?:-(?P<day>\d\d)(?:T(?P<hour>\d\d)(?::(?P<minute>\d\d)(?::(?P<second>\d\d))?)?)?)?)?Z")
+_dateParser = re.compile(r"(?P<year>\d\d\d\d)(?:-(?P<month>\d\d)(?:-(?P<day>\d\d)(?:T(?P<hour>\d\d)(?::(?P<minute>\d\d)(?::(?P<second>\d\d))?)?)?)?)?Z", re.ASCII)
 
 def _dateFromString(s):
     order = ('year', 'month', 'day', 'hour', 'minute', 'second')

diff --git a/Lib/posixpath.py b/Lib/posixpath.py
index dc0aa10..575492f 100644
--- a/Lib/posixpath.py
+++ b/Lib/posixpath.py

@@ -241,7 +241,7 @@
         return path
     if not _varprog:
         import re
-        _varprog = re.compile(r'\$(\w+|\{[^}]*\})')
+        _varprog = re.compile(r'\$(\w+|\{[^}]*\})', re.ASCII)
     i = 0
     while True:
         m = _varprog.search(path, i)

diff --git a/Lib/py_compile.py b/Lib/py_compile.py
index 8ef3662..cce5ac1 100644
--- a/Lib/py_compile.py
+++ b/Lib/py_compile.py

@@ -86,7 +86,7 @@
             line = f.readline()
             if not line:
                 break
-            m = re.match(r".*\bcoding:\s*(\S+)\b", line)
+            m = re.match(br".*\bcoding:\s*(\S+)\b", line)
             if m:
                 return m.group(1).decode("ascii")
         return default

diff --git a/Lib/re.py b/Lib/re.py
index 951f239..63a95fd 100644
--- a/Lib/re.py
+++ b/Lib/re.py

@@ -44,7 +44,7 @@
     "|"      A|B, creates an RE that will match either A or B.
     (...)    Matches the RE inside the parentheses.
              The contents can be retrieved or matched later in the string.
-    (?iLmsux) Set the I, L, M, S, U, or X flag for the RE (see below).
+    (?aiLmsux) Set the A, I, L, M, S, U, or X flag for the RE (see below).
     (?:...)  Non-grouping version of regular parentheses.
     (?P<name>...) The substring matched by the group is accessible by name.
     (?P=name)     Matches the text matched earlier by the group named name.
@@ -64,11 +64,18 @@
     \Z       Matches only at the end of the string.
     \b       Matches the empty string, but only at the start or end of a word.
     \B       Matches the empty string, but not at the start or end of a word.
-    \d       Matches any decimal digit; equivalent to the set [0-9].
-    \D       Matches any non-digit character; equivalent to the set [^0-9].
+    \d       Matches any decimal digit; equivalent to the set [0-9] in
+             bytes patterns or string patterns with the ASCII flag.
+             In string patterns without the ASCII flag, it will match the whole
+             range of Unicode digits.
+    \D       Matches any non-digit character; equivalent to [^\d].
     \s       Matches any whitespace character; equivalent to [ \t\n\r\f\v].
     \S       Matches any non-whitespace character; equiv. to [^ \t\n\r\f\v].
-    \w       Matches any alphanumeric character; equivalent to [a-zA-Z0-9_].
+    \w       Matches any alphanumeric character; equivalent to [a-zA-Z0-9_]
+             in bytes patterns or string patterns with the ASCII flag.
+             In string patterns without the ASCII flag, it will match the
+             range of Unicode alphanumeric characters (letters plus digits
+             plus underscore).
              With LOCALE, it will match the set [0-9_] plus characters defined
              as letters for the current locale.
     \W       Matches the complement of \w.
@@ -87,6 +94,12 @@
     escape   Backslash all non-alphanumerics in a string.
 
 Some of the functions in this module takes flags as optional parameters:
+    A  ASCII       For string patterns, make \w, \W, \b, \B, \d, \D
+                   match the corresponding ASCII character categories
+                   (rather than the whole Unicode categories, which is the
+                   default).
+                   For bytes patterns, this flag is the only available
+                   behaviour and needn't be specified.
     I  IGNORECASE  Perform case-insensitive matching.
     L  LOCALE      Make \w, \W, \b, \B, dependent on the current locale.
     M  MULTILINE   "^" matches the beginning of lines (after a newline)
@@ -95,7 +108,8 @@
                    as the end of the string.
     S  DOTALL      "." matches any character at all, including the newline.
     X  VERBOSE     Ignore whitespace and comments for nicer looking RE's.
-    U  UNICODE     Make \w, \W, \b, \B, dependent on the Unicode locale.
+    U  UNICODE     For compatibility only. Ignored for string patterns (it
+                   is the default), and forbidden for bytes patterns.
 
 This module also defines an exception 'error'.
 
@@ -107,16 +121,17 @@
 
 # public symbols
 __all__ = [ "match", "search", "sub", "subn", "split", "findall",
-    "compile", "purge", "template", "escape", "I", "L", "M", "S", "X",
-    "U", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE",
+    "compile", "purge", "template", "escape", "A", "I", "L", "M", "S", "X",
+    "U", "ASCII", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE",
     "UNICODE", "error" ]
 
 __version__ = "2.2.1"
 
 # flags
+A = ASCII = sre_compile.SRE_FLAG_ASCII # assume ascii "locale"
 I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case
 L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale
-U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale
+U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode "locale"
 M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline
 S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline
 X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments

diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
index 94962cb..6cf69c3 100644
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py

@@ -207,9 +207,10 @@
 SRE_FLAG_LOCALE = 4 # honour system locale
 SRE_FLAG_MULTILINE = 8 # treat target as multiline string
 SRE_FLAG_DOTALL = 16 # treat target as a single string
-SRE_FLAG_UNICODE = 32 # use unicode locale
+SRE_FLAG_UNICODE = 32 # use unicode "locale"
 SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments
 SRE_FLAG_DEBUG = 128 # debugging
+SRE_FLAG_ASCII = 256 # use ascii "locale"
 
 # flags for INFO primitive
 SRE_INFO_PREFIX = 1 # has prefix

diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index ffa8902..9d6e631 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py

@@ -64,6 +64,7 @@
     "s": SRE_FLAG_DOTALL,
     "x": SRE_FLAG_VERBOSE,
     # extensions
+    "a": SRE_FLAG_ASCII,
     "t": SRE_FLAG_TEMPLATE,
     "u": SRE_FLAG_UNICODE,
 }
@@ -672,6 +673,18 @@
 
     return subpattern
 
+def fix_flags(src, flags):
+    # Check and fix flags according to the type of pattern (str or bytes)
+    if isinstance(src, str):
+        if not flags & SRE_FLAG_ASCII:
+            flags |= SRE_FLAG_UNICODE
+        elif flags & SRE_FLAG_UNICODE:
+            raise ValueError("ASCII and UNICODE flags are incompatible")
+    else:
+        if flags & SRE_FLAG_UNICODE:
+            raise ValueError("can't use UNICODE flag with a bytes pattern")
+    return flags
+
 def parse(str, flags=0, pattern=None):
     # parse 're' pattern into list of (opcode, argument) tuples
 
@@ -683,6 +696,7 @@
     pattern.str = str
 
     p = _parse_sub(source, pattern, 0)
+    p.pattern.flags = fix_flags(str, p.pattern.flags)
 
     tail = source.get()
     if tail == ")":

diff --git a/Lib/tarfile.py b/Lib/tarfile.py
index ecb32f6..222b433 100644
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py

@@ -1368,7 +1368,7 @@
         # "%d %s=%s\n" % (length, keyword, value). length is the size
         # of the complete record including the length field itself and
         # the newline. keyword and value are both UTF-8 encoded strings.
-        regex = re.compile(r"(\d+) ([^=]+)=", re.U)
+        regex = re.compile(br"(\d+) ([^=]+)=")
         pos = 0
         while True:
             match = regex.match(buf, pos)

diff --git a/Lib/test/re_tests.py b/Lib/test/re_tests.py
index 220301a..d314e20 100755
--- a/Lib/test/re_tests.py
+++ b/Lib/test/re_tests.py

@@ -667,4 +667,4 @@
     (r'\b.\b', 'a', SUCCEED, 'found', 'a'),
     (r'(?u)\b.\b', u, SUCCEED, 'found', u),
     (r'(?u)\w', u, SUCCEED, 'found', u),
-    ])
+])

diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py
index c64ac19..630f862 100644
--- a/Lib/test/test_bytes.py
+++ b/Lib/test/test_bytes.py

@@ -506,7 +506,7 @@
         def by(s):
             return bytearray(map(ord, s))
         b = by("Hello, world")
-        self.assertEqual(re.findall(r"\w+", b), [by("Hello"), by("world")])
+        self.assertEqual(re.findall(br"\w+", b), [by("Hello"), by("world")])
 
     def test_setitem(self):
         b = bytearray([1, 2, 3])

diff --git a/Lib/test/test_mmap.py b/Lib/test/test_mmap.py
index 0b5202a..9fe044f 100644
--- a/Lib/test/test_mmap.py
+++ b/Lib/test/test_mmap.py

@@ -54,7 +54,7 @@
         m.flush()
 
         # Test doing a regular expression match in an mmap'ed file
-        match = re.search('[A-Za-z]+', m)
+        match = re.search(b'[A-Za-z]+', m)
         if match is None:
             self.fail('regex match on mmap failed!')
         else:

diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 60b816e..755cb00 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py

@@ -83,23 +83,6 @@
         self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
                          'abc\ndef\n')
 
-    def test_bug_1140(self):
-        # re.sub(x, y, b'') should return b'', not '', and
-        # re.sub(x, y, '') should return '', not b''.
-        # Also:
-        # re.sub(x, y, str(x)) should return str(y), and
-        # re.sub(x, y, bytes(x)) should return
-        #     str(y) if isinstance(y, str) else unicode(y).
-        for x in 'x',  b'x':
-            for y in 'y', b'y':
-                z = re.sub(x, y, b'')
-                self.assertEqual(z, b'')
-                self.assertEqual(type(z), bytes)
-                #
-                z = re.sub(x, y, '')
-                self.assertEqual(z, '')
-                self.assertEqual(type(z), str)
-
     def test_bug_1661(self):
         # Verify that flags do not get silently ignored with compiled patterns
         pattern = re.compile('.')
@@ -327,7 +310,7 @@
 
     def test_getattr(self):
         self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
-        self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I)
+        self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
         self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
         self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
         self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
@@ -614,8 +597,8 @@
         import array
         for typecode in 'bBuhHiIlLfd':
             a = array.array(typecode)
-            self.assertEqual(re.compile("bla").match(a), None)
-            self.assertEqual(re.compile("").match(a).groups(), ())
+            self.assertEqual(re.compile(b"bla").match(a), None)
+            self.assertEqual(re.compile(b"").match(a).groups(), ())
 
     def test_inline_flags(self):
         # Bug #1700
@@ -658,6 +641,48 @@
         self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
         self.assertEqual(pattern.sub('#', '\n'), '#\n#')
 
+    def test_bytes_str_mixing(self):
+        # Mixing str and bytes is disallowed
+        pat = re.compile('.')
+        bpat = re.compile(b'.')
+        self.assertRaises(TypeError, pat.match, b'b')
+        self.assertRaises(TypeError, bpat.match, 'b')
+        self.assertRaises(TypeError, pat.sub, b'b', 'c')
+        self.assertRaises(TypeError, pat.sub, 'b', b'c')
+        self.assertRaises(TypeError, pat.sub, b'b', b'c')
+        self.assertRaises(TypeError, bpat.sub, b'b', 'c')
+        self.assertRaises(TypeError, bpat.sub, 'b', b'c')
+        self.assertRaises(TypeError, bpat.sub, 'b', 'c')
+
+    def test_ascii_and_unicode_flag(self):
+        # String patterns
+        for flags in (0, re.UNICODE):
+            pat = re.compile('\xc0', flags | re.IGNORECASE)
+            self.assertNotEqual(pat.match('\xe0'), None)
+            pat = re.compile('\w', flags)
+            self.assertNotEqual(pat.match('\xe0'), None)
+        pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
+        self.assertEqual(pat.match('\xe0'), None)
+        pat = re.compile('(?a)\xc0', re.IGNORECASE)
+        self.assertEqual(pat.match('\xe0'), None)
+        pat = re.compile('\w', re.ASCII)
+        self.assertEqual(pat.match('\xe0'), None)
+        pat = re.compile('(?a)\w')
+        self.assertEqual(pat.match('\xe0'), None)
+        # Bytes patterns
+        for flags in (0, re.ASCII):
+            pat = re.compile(b'\xc0', re.IGNORECASE)
+            self.assertEqual(pat.match(b'\xe0'), None)
+            pat = re.compile(b'\w')
+            self.assertEqual(pat.match(b'\xe0'), None)
+        # Incompatibilities
+        self.assertRaises(ValueError, re.compile, b'\w', re.UNICODE)
+        self.assertRaises(ValueError, re.compile, b'(?u)\w')
+        self.assertRaises(ValueError, re.compile, '\w', re.UNICODE | re.ASCII)
+        self.assertRaises(ValueError, re.compile, '(?u)\w', re.ASCII)
+        self.assertRaises(ValueError, re.compile, '(?a)\w', re.UNICODE)
+        self.assertRaises(ValueError, re.compile, '(?au)\w')
+
 
 def run_re_tests():
     from test.re_tests import benchmarks, tests, SUCCEED, FAIL, SYNTAX_ERROR

diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 31366de..ec5a79a 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py

@@ -47,21 +47,23 @@
 def any(*choices): return group(*choices) + '*'
 def maybe(*choices): return group(*choices) + '?'
 
+# Note: we use unicode matching for names ("\w") but ascii matching for
+# number literals.
 Whitespace = r'[ \f\t]*'
 Comment = r'#[^\r\n]*'
 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
 Name = r'[a-zA-Z_]\w*'
 
-Hexnumber = r'0[xX][\da-fA-F]+'
+Hexnumber = r'0[xX][0-9a-fA-F]+'
 Binnumber = r'0[bB][01]+'
 Octnumber = r'0[oO][0-7]+'
-Decnumber = r'(?:0+|[1-9]\d*)'
+Decnumber = r'(?:0+|[1-9][0-9]*)'
 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
-Exponent = r'[eE][-+]?\d+'
-Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
-Expfloat = r'\d+' + Exponent
+Exponent = r'[eE][-+]?[0-9]+'
+Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
+Expfloat = r'[0-9]+' + Exponent
 Floatnumber = group(Pointfloat, Expfloat)
-Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
+Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
 Number = group(Imagnumber, Floatnumber, Intnumber)
 
 # Tail end of ' string.

diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py
index 889d9642..aaab059 100644
--- a/Lib/urllib/request.py
+++ b/Lib/urllib/request.py

@@ -141,7 +141,7 @@
         _opener = None
 
 # copied from cookielib.py
-_cut_port_re = re.compile(r":\d+$")
+_cut_port_re = re.compile(r":\d+$", re.ASCII)
 def request_host(request):
     """Return request-host, as defined by RFC 2965.
commit	fd036451bf0e0ade8783e21df801abf7be96d020	[log] [tgz]
author	Antoine Pitrou <solipsis@pitrou.net>	Tue Aug 19 17:56:33 2008 +0000
committer	Antoine Pitrou <solipsis@pitrou.net>	Tue Aug 19 17:56:33 2008 +0000
tree	e70ff65a9e641d8e790bc091f0dc2507baf344ca
parent	3ad7ba10a20827b24d4b1aa9dd49474db8affbdd [diff]