sre 2.1b2 update:

- take locale into account for word boundary anchors (#410271)
- restored 2.0's *? behaviour (#233283, #408936 and others)
- speed up re.sub/re.subn

commit: b25e1ad253a4d96aea31a7a3fb78522ea354f43a [log] [tgz]
author: Fredrik Lundh <fredrik@pythonware.com> Thu Mar 22 15:50:10 2001 +0000
committer: Fredrik Lundh <fredrik@pythonware.com> Thu Mar 22 15:50:10 2001 +0000
tree: 2cc9dc18021270ffc2d7982ecca15b6942f59413
parent: 8e9972c215ea0b10f0a7516d1cded6f26296ceba [diff]
diff --git a/Lib/sre.py b/Lib/sre.py
index 48d390a..6706fac 100644
--- a/Lib/sre.py
+++ b/Lib/sre.py

@@ -23,6 +23,8 @@
     "U", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE",
     "UNICODE", "error" ]
 
+__version__ = "2.1b2"
+
 # this module works under 1.5.2 and later.  don't use string methods
 import string
 
@@ -90,6 +92,7 @@
 def purge():
     "Clear the regular expression cache"
     _cache.clear()
+    _cache_repl.clear()
 
 def template(pattern, flags=0):
     "Compile a template pattern, returning a pattern object"
@@ -111,6 +114,8 @@
 # internals
 
 _cache = {}
+_cache_repl = {}
+
 _MAXCACHE = 100
 
 def _join(seq, sep):
@@ -134,6 +139,21 @@
     _cache[key] = p
     return p
 
+def _compile_repl(*key):
+    # internal: compile replacement pattern
+    p = _cache_repl.get(key)
+    if p is not None:
+        return p
+    repl, pattern = key
+    try:
+        p = sre_parse.parse_template(repl, pattern)
+    except error, v:
+        raise error, v # invalid expression
+    if len(_cache_repl) >= _MAXCACHE:
+        _cache_repl.clear()
+    _cache_repl[key] = p
+    return p
+
 def _expand(pattern, match, template):
     # internal: match.expand implementation hook
     template = sre_parse.parse_template(template, pattern)
@@ -148,7 +168,7 @@
     if callable(template):
         filter = template
     else:
-        template = sre_parse.parse_template(template, pattern)
+        template = _compile_repl(template, pattern)
         def filter(match, template=template):
             return sre_parse.expand_template(template, match)
     n = i = 0

diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index ab2a2cc..44cb23e 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py

@@ -105,9 +105,12 @@
         elif op is AT:
             emit(OPCODES[op])
             if flags & SRE_FLAG_MULTILINE:
-                emit(ATCODES[AT_MULTILINE.get(av, av)])
-            else:
-                emit(ATCODES[av])
+                av = AT_MULTILINE.get(av, av)
+            if flags & SRE_FLAG_LOCALE:
+                av = AT_LOCALE.get(av, av)
+            elif flags & SRE_FLAG_UNICODE:
+                av = AT_UNICODE.get(av, av)
+            emit(ATCODES[av])
         elif op is BRANCH:
             emit(OPCODES[op])
             tail = []
@@ -124,11 +127,10 @@
         elif op is CATEGORY:
             emit(OPCODES[op])
             if flags & SRE_FLAG_LOCALE:
-                emit(CHCODES[CH_LOCALE[av]])
+                av = CH_LOCALE[av]
             elif flags & SRE_FLAG_UNICODE:
-                emit(CHCODES[CH_UNICODE[av]])
-            else:
-                emit(CHCODES[av])
+                av = CH_UNICODE[av]
+            emit(CHCODES[av])
         elif op is GROUPREF:
             if flags & SRE_FLAG_IGNORECASE:
                 emit(OPCODES[OP_IGNORE[op]])

diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py
index b429a33..bbe7880 100644
--- a/Lib/sre_constants.py
+++ b/Lib/sre_constants.py

@@ -11,7 +11,7 @@
 
 # update when constants are added or removed
 
-MAGIC = 20010115
+MAGIC = 20010320
 
 # max code word in this release
 
@@ -67,6 +67,10 @@
 AT_END = "at_end"
 AT_END_LINE = "at_end_line"
 AT_END_STRING = "at_end_string"
+AT_LOC_BOUNDARY = "at_loc_boundary"
+AT_LOC_NON_BOUNDARY = "at_loc_non_boundary"
+AT_UNI_BOUNDARY = "at_uni_boundary"
+AT_UNI_NON_BOUNDARY = "at_uni_non_boundary"
 
 # categories
 CATEGORY_DIGIT = "category_digit"
@@ -119,7 +123,9 @@
 
 ATCODES = [
     AT_BEGINNING, AT_BEGINNING_LINE, AT_BEGINNING_STRING, AT_BOUNDARY,
-    AT_NON_BOUNDARY, AT_END, AT_END_LINE, AT_END_STRING
+    AT_NON_BOUNDARY, AT_END, AT_END_LINE, AT_END_STRING,
+    AT_LOC_BOUNDARY, AT_LOC_NON_BOUNDARY, AT_UNI_BOUNDARY,
+    AT_UNI_NON_BOUNDARY
 ]
 
 CHCODES = [
@@ -157,6 +163,16 @@
     AT_END: AT_END_LINE
 }
 
+AT_LOCALE = {
+    AT_BOUNDARY: AT_LOC_BOUNDARY,
+    AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY
+}
+
+AT_UNICODE = {
+    AT_BOUNDARY: AT_UNI_BOUNDARY,
+    AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY
+}
+
 CH_LOCALE = {
     CATEGORY_DIGIT: CATEGORY_DIGIT,
     CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT,

diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index 3840365..44626bd 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py

@@ -638,6 +638,16 @@
     s = Tokenizer(source)
     p = []
     a = p.append
+    def literal(literal, p=p):
+        if p and p[-1][0] is LITERAL:
+            p[-1] = LITERAL, p[-1][1] + literal
+        else:
+            p.append((LITERAL, literal))
+    sep = source[:0]
+    if type(sep) is type(""):
+        char = chr
+    else:
+        char = unichr
     while 1:
         this = s.get()
         if this is None:
@@ -681,33 +691,42 @@
                         break
                 if not code:
                     this = this[1:]
-                    code = LITERAL, atoi(this[-6:], 8) & 0xff
-                a(code)
+                    code = LITERAL, char(atoi(this[-6:], 8) & 0xff)
+                if code[0] is LITERAL:
+                    literal(code[1])
+                else:
+                    a(code)
             else:
                 try:
-                    a(ESCAPES[this])
+                    this = char(ESCAPES[this][1])
                 except KeyError:
-                    for c in this:
-                        a((LITERAL, ord(c)))
+                    pass
+                literal(this)
         else:
-            a((LITERAL, ord(this)))
-    return p
+            literal(this)
+    # convert template to groups and literals lists
+    i = 0
+    groups = []
+    literals = []
+    for c, s in p:
+        if c is MARK:
+            groups.append((i, s))
+            literals.append(None)
+        else:
+            literals.append(s)
+        i = i + 1
+    return groups, literals
 
 def expand_template(template, match):
-    # XXX: <fl> this is sooooo slow.  drop in the slicelist code instead
-    p = []
-    a = p.append
+    g = match.group
     sep = match.string[:0]
-    if type(sep) is type(""):
-        char = chr
-    else:
-        char = unichr
-    for c, s in template:
-        if c is LITERAL:
-            a(char(s))
-        elif c is MARK:
-            s = match.group(s)
+    groups, literals = template
+    literals = literals[:]
+    try:
+        for index, group in groups:
+            literals[index] = s = g(group)
             if s is None:
-                raise error, "empty group"
-            a(s)
-    return string.join(p, sep)
+                raise IndexError
+    except IndexError:
+        raise error, "empty group"
+    return string.join(literals, sep)

diff --git a/Lib/test/re_tests.py b/Lib/test/re_tests.py
index aacd916..7c5dc89 100755
--- a/Lib/test/re_tests.py
+++ b/Lib/test/re_tests.py

@@ -639,3 +639,14 @@
     # bug 130748: ^* should be an error (nothing to repeat)
     (r'^*', '', SYNTAX_ERROR),
 ]
+
+try:
+    u = eval("u'\N{LATIN CAPITAL LETTER A WITH DIAERESIS}'")
+except SyntaxError:
+    pass
+else:
+    tests.extend([
+    # bug 410271: \b broken under locales
+    (r'\b.\b', 'a', SUCCEED, 'found', 'a'),
+    (r'(?u)\b.\b', u, SUCCEED, 'found', u),
+    ])

diff --git a/Lib/test/test_sre.py b/Lib/test/test_sre.py
index 88c0d62..031cda6 100644
--- a/Lib/test/test_sre.py
+++ b/Lib/test/test_sre.py

@@ -329,6 +329,8 @@
                 u = unicode(s, "latin-1")
             except NameError:
                 pass
+            except TypeError:
+                continue # skip unicode test strings
             else:
                 result=obj.search(u)
                 if result==None:

diff --git a/Modules/_sre.c b/Modules/_sre.c
index 63e4ef3..8811038 100644
--- a/Modules/_sre.c
+++ b/Modules/_sre.c

@@ -24,8 +24,9 @@
  * 2000-10-24 fl  really fixed assert_not; reset groups in findall
  * 2000-12-21 fl  fixed memory leak in groupdict
  * 2001-01-02 fl  properly reset pointer after failed assertion in MIN_UNTIL
- * 2001-01-15 fl  avoid recursion for MIN_UTIL; fixed uppercase literal bug
+ * 2001-01-15 fl  avoid recursion for MIN_UNTIL; fixed uppercase literal bug
  * 2001-01-16 fl  fixed memory leak in pattern destructor
+ * 2001-03-20 fl  lots of fixes for 2.1b2
  *
  * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
  *
@@ -40,7 +41,7 @@
 
 #ifndef SRE_RECURSIVE
 
-char copyright[] = " SRE 2.1 Copyright (c) 1997-2001 by Secret Labs AB ";
+char copyright[] = " SRE 2.1b2 Copyright (c) 1997-2001 by Secret Labs AB ";
 
 #include "Python.h"
 
@@ -141,11 +142,6 @@
 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
 120, 121, 122, 123, 124, 125, 126, 127 };
 
-static unsigned int sre_lower(unsigned int ch)
-{
-    return ((ch) < 128 ? sre_char_lower[ch] : ch);
-}
-
 #define SRE_IS_DIGIT(ch)\
     ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0)
 #define SRE_IS_SPACE(ch)\
@@ -157,30 +153,39 @@
 #define SRE_IS_WORD(ch)\
     ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0)
 
+static unsigned int sre_lower(unsigned int ch)
+{
+    return ((ch) < 128 ? sre_char_lower[ch] : ch);
+}
+
 /* locale-specific character predicates */
 
-static unsigned int sre_lower_locale(unsigned int ch)
-{
-    return ((ch) < 256 ? tolower((ch)) : ch);
-}
 #define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
 #define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
 #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
 #define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
 #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
 
+static unsigned int sre_lower_locale(unsigned int ch)
+{
+    return ((ch) < 256 ? tolower((ch)) : ch);
+}
+
 /* unicode-specific character predicates */
 
 #if defined(HAVE_UNICODE)
-static unsigned int sre_lower_unicode(unsigned int ch)
-{
-    return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
-}
+
 #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
 #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
 #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
 #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
 #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
+
+static unsigned int sre_lower_unicode(unsigned int ch)
+{
+    return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch));
+}
+
 #endif
 
 LOCAL(int)
@@ -418,6 +423,42 @@
         this = ((void*) ptr < state->end) ?
             SRE_IS_WORD((int) ptr[0]) : 0;
         return this == that;
+
+    case SRE_AT_LOC_BOUNDARY:
+        if (state->beginning == state->end)
+            return 0;
+        that = ((void*) ptr > state->beginning) ?
+            SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
+        this = ((void*) ptr < state->end) ?
+            SRE_LOC_IS_WORD((int) ptr[0]) : 0;
+        return this != that;
+
+    case SRE_AT_LOC_NON_BOUNDARY:
+        if (state->beginning == state->end)
+            return 0;
+        that = ((void*) ptr > state->beginning) ?
+            SRE_LOC_IS_WORD((int) ptr[-1]) : 0;
+        this = ((void*) ptr < state->end) ?
+            SRE_LOC_IS_WORD((int) ptr[0]) : 0;
+        return this == that;
+
+    case SRE_AT_UNI_BOUNDARY:
+        if (state->beginning == state->end)
+            return 0;
+        that = ((void*) ptr > state->beginning) ?
+            SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
+        this = ((void*) ptr < state->end) ?
+            SRE_UNI_IS_WORD((int) ptr[0]) : 0;
+        return this != that;
+
+    case SRE_AT_UNI_NON_BOUNDARY:
+        if (state->beginning == state->end)
+            return 0;
+        that = ((void*) ptr > state->beginning) ?
+            SRE_UNI_IS_WORD((int) ptr[-1]) : 0;
+        this = ((void*) ptr < state->end) ?
+            SRE_UNI_IS_WORD((int) ptr[0]) : 0;
+        return this == that;
     }
 
     return 0;
@@ -1037,7 +1078,8 @@
 
             /* see if the tail matches */
             state->repeat = rp->prev;
-            if (rp->pattern[2] == 65535) {
+            /* FIXME: the following fix doesn't always work (#133283) */
+            if (0 && rp->pattern[2] == 65535) {
                 /* unbounded repeat */
                 for (;;) {
                     i = SRE_MATCH(state, pattern, level + 1);

diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h
index c6850ad..73bcb34 100644
--- a/Modules/sre_constants.h
+++ b/Modules/sre_constants.h

@@ -11,7 +11,7 @@
  * See the _sre.c file for information on usage and redistribution.
  */
 
-#define SRE_MAGIC 20010115
+#define SRE_MAGIC 20010320
 #define SRE_OP_FAILURE 0
 #define SRE_OP_SUCCESS 1
 #define SRE_OP_ANY 2
@@ -49,6 +49,10 @@
 #define SRE_AT_END 5
 #define SRE_AT_END_LINE 6
 #define SRE_AT_END_STRING 7
+#define SRE_AT_LOC_BOUNDARY 8
+#define SRE_AT_LOC_NON_BOUNDARY 9
+#define SRE_AT_UNI_BOUNDARY 10
+#define SRE_AT_UNI_NON_BOUNDARY 11
 #define SRE_CATEGORY_DIGIT 0
 #define SRE_CATEGORY_NOT_DIGIT 1
 #define SRE_CATEGORY_SPACE 2
commit	b25e1ad253a4d96aea31a7a3fb78522ea354f43a	[log] [tgz]
author	Fredrik Lundh <fredrik@pythonware.com>	Thu Mar 22 15:50:10 2001 +0000
committer	Fredrik Lundh <fredrik@pythonware.com>	Thu Mar 22 15:50:10 2001 +0000
tree	2cc9dc18021270ffc2d7982ecca15b6942f59413
parent	8e9972c215ea0b10f0a7516d1cded6f26296ceba [diff]