| Guido van Rossum | 7627c0d | 2000-03-31 14:58:54 +0000 | [diff] [blame] | 1 | # | 
 | 2 | # Secret Labs' Regular Expression Engine | 
| Guido van Rossum | 7627c0d | 2000-03-31 14:58:54 +0000 | [diff] [blame] | 3 | # | 
 | 4 | # re-compatible interface for the sre matching engine | 
 | 5 | # | 
| Fredrik Lundh | 770617b | 2001-01-14 15:06:11 +0000 | [diff] [blame] | 6 | # Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved. | 
| Guido van Rossum | 7627c0d | 2000-03-31 14:58:54 +0000 | [diff] [blame] | 7 | # | 
| Fredrik Lundh | 29c4ba9 | 2000-08-01 18:20:07 +0000 | [diff] [blame] | 8 | # This version of the SRE library can be redistributed under CNRI's | 
 | 9 | # Python 1.6 license.  For any other use, please contact Secret Labs | 
 | 10 | # AB (info@pythonware.com). | 
 | 11 | # | 
| Guido van Rossum | 7627c0d | 2000-03-31 14:58:54 +0000 | [diff] [blame] | 12 | # Portions of this engine have been developed in cooperation with | 
| Fredrik Lundh | 29c4ba9 | 2000-08-01 18:20:07 +0000 | [diff] [blame] | 13 | # CNRI.  Hewlett-Packard provided funding for 1.6 integration and | 
| Guido van Rossum | 7627c0d | 2000-03-31 14:58:54 +0000 | [diff] [blame] | 14 | # other compatibility work. | 
 | 15 | # | 
 | 16 |  | 
| Guido van Rossum | 7627c0d | 2000-03-31 14:58:54 +0000 | [diff] [blame] | 17 | import sre_compile | 
| Fredrik Lundh | 436c3d58 | 2000-06-29 08:58:44 +0000 | [diff] [blame] | 18 | import sre_parse | 
| Guido van Rossum | 7627c0d | 2000-03-31 14:58:54 +0000 | [diff] [blame] | 19 |  | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 20 | # flags | 
| Fredrik Lundh | 770617b | 2001-01-14 15:06:11 +0000 | [diff] [blame] | 21 | I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case | 
 | 22 | L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale | 
 | 23 | U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale | 
 | 24 | M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline | 
 | 25 | S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline | 
 | 26 | X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 27 |  | 
| Fredrik Lundh | 770617b | 2001-01-14 15:06:11 +0000 | [diff] [blame] | 28 | # sre extensions (experimental, don't rely on these) | 
 | 29 | T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE # disable backtracking | 
 | 30 | DEBUG = sre_compile.SRE_FLAG_DEBUG # dump pattern after compilation | 
| Fredrik Lundh | 436c3d58 | 2000-06-29 08:58:44 +0000 | [diff] [blame] | 31 |  | 
 | 32 | # sre exception | 
| Fredrik Lundh | be2211e | 2000-06-29 16:57:40 +0000 | [diff] [blame] | 33 | error = sre_compile.error | 
| Fredrik Lundh | 436c3d58 | 2000-06-29 08:58:44 +0000 | [diff] [blame] | 34 |  | 
| Guido van Rossum | 7627c0d | 2000-03-31 14:58:54 +0000 | [diff] [blame] | 35 | # -------------------------------------------------------------------- | 
 | 36 | # public interface | 
 | 37 |  | 
| Guido van Rossum | 7627c0d | 2000-03-31 14:58:54 +0000 | [diff] [blame] | 38 | def match(pattern, string, flags=0): | 
| Fredrik Lundh | 770617b | 2001-01-14 15:06:11 +0000 | [diff] [blame] | 39 |     """Try to apply the pattern at the start of the string, returning | 
 | 40 |     a match object, or None if no match was found.""" | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 41 |     return _compile(pattern, flags).match(string) | 
| Guido van Rossum | 7627c0d | 2000-03-31 14:58:54 +0000 | [diff] [blame] | 42 |  | 
 | 43 | def search(pattern, string, flags=0): | 
| Fredrik Lundh | 770617b | 2001-01-14 15:06:11 +0000 | [diff] [blame] | 44 |     """Scan through string looking for a match to the pattern, returning | 
 | 45 |     a match object, or None if no match was found.""" | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 46 |     return _compile(pattern, flags).search(string) | 
| Guido van Rossum | 7627c0d | 2000-03-31 14:58:54 +0000 | [diff] [blame] | 47 |  | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 48 | def sub(pattern, repl, string, count=0): | 
| Fredrik Lundh | 770617b | 2001-01-14 15:06:11 +0000 | [diff] [blame] | 49 |     """Return the string obtained by replacing the leftmost | 
 | 50 |     non-overlapping occurrences of the pattern in string by the | 
 | 51 |     replacement repl""" | 
| Fredrik Lundh | 7898c3e | 2000-08-07 20:59:04 +0000 | [diff] [blame] | 52 |     return _compile(pattern, 0).sub(repl, string, count) | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 53 |  | 
 | 54 | def subn(pattern, repl, string, count=0): | 
| Fredrik Lundh | 770617b | 2001-01-14 15:06:11 +0000 | [diff] [blame] | 55 |     """Return a 2-tuple containing (new_string, number). | 
 | 56 |     new_string is the string obtained by replacing the leftmost | 
 | 57 |     non-overlapping occurrences of the pattern in the source | 
 | 58 |     string by the replacement repl.  number is the number of | 
 | 59 |     substitutions that were made.""" | 
| Fredrik Lundh | 7898c3e | 2000-08-07 20:59:04 +0000 | [diff] [blame] | 60 |     return _compile(pattern, 0).subn(repl, string, count) | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 61 |  | 
 | 62 | def split(pattern, string, maxsplit=0): | 
| Fredrik Lundh | 770617b | 2001-01-14 15:06:11 +0000 | [diff] [blame] | 63 |     """Split the source string by the occurrences of the pattern, | 
 | 64 |     returning a list containing the resulting substrings.""" | 
| Fredrik Lundh | 7898c3e | 2000-08-07 20:59:04 +0000 | [diff] [blame] | 65 |     return _compile(pattern, 0).split(string, maxsplit) | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 66 |  | 
 | 67 | def findall(pattern, string, maxsplit=0): | 
| Fredrik Lundh | 770617b | 2001-01-14 15:06:11 +0000 | [diff] [blame] | 68 |     """Return a list of all non-overlapping matches in the string. | 
 | 69 |  | 
 | 70 |     If one or more groups are present in the pattern, return a | 
 | 71 |     list of groups; this will be a list of tuples if the pattern | 
 | 72 |     has more than one group. | 
 | 73 |  | 
 | 74 |     Empty matches are included in the result.""" | 
| Fredrik Lundh | 7898c3e | 2000-08-07 20:59:04 +0000 | [diff] [blame] | 75 |     return _compile(pattern, 0).findall(string, maxsplit) | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 76 |  | 
 | 77 | def compile(pattern, flags=0): | 
| Fredrik Lundh | 770617b | 2001-01-14 15:06:11 +0000 | [diff] [blame] | 78 |     "Compile a regular expression pattern, returning a pattern object." | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 79 |     return _compile(pattern, flags) | 
 | 80 |  | 
| Fredrik Lundh | 8a3ebf8 | 2000-07-23 21:46:17 +0000 | [diff] [blame] | 81 | def purge(): | 
| Fredrik Lundh | 770617b | 2001-01-14 15:06:11 +0000 | [diff] [blame] | 82 |     "Clear the regular expression cache" | 
| Fredrik Lundh | 8a3ebf8 | 2000-07-23 21:46:17 +0000 | [diff] [blame] | 83 |     _cache.clear() | 
 | 84 |  | 
| Fredrik Lundh | 436c3d58 | 2000-06-29 08:58:44 +0000 | [diff] [blame] | 85 | def template(pattern, flags=0): | 
| Fredrik Lundh | 770617b | 2001-01-14 15:06:11 +0000 | [diff] [blame] | 86 |     "Compile a template pattern, returning a pattern object" | 
 | 87 |  | 
| Fredrik Lundh | 436c3d58 | 2000-06-29 08:58:44 +0000 | [diff] [blame] | 88 |     return _compile(pattern, flags|T) | 
 | 89 |  | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 90 | def escape(pattern): | 
| Fredrik Lundh | 770617b | 2001-01-14 15:06:11 +0000 | [diff] [blame] | 91 |     "Escape all non-alphanumeric characters in pattern." | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 92 |     s = list(pattern) | 
 | 93 |     for i in range(len(pattern)): | 
 | 94 |         c = pattern[i] | 
 | 95 |         if not ("a" <= c <= "z" or "A" <= c <= "Z" or "0" <= c <= "9"): | 
 | 96 |             if c == "\000": | 
 | 97 |                 s[i] = "\\000" | 
 | 98 |             else: | 
 | 99 |                 s[i] = "\\" + c | 
| Fredrik Lundh | 8a3ebf8 | 2000-07-23 21:46:17 +0000 | [diff] [blame] | 100 |     return _join(s, pattern) | 
| Guido van Rossum | 7627c0d | 2000-03-31 14:58:54 +0000 | [diff] [blame] | 101 |  | 
 | 102 | # -------------------------------------------------------------------- | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 103 | # internals | 
| Guido van Rossum | 7627c0d | 2000-03-31 14:58:54 +0000 | [diff] [blame] | 104 |  | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 105 | _cache = {} | 
 | 106 | _MAXCACHE = 100 | 
| Guido van Rossum | 7627c0d | 2000-03-31 14:58:54 +0000 | [diff] [blame] | 107 |  | 
| Fredrik Lundh | 8a3ebf8 | 2000-07-23 21:46:17 +0000 | [diff] [blame] | 108 | def _join(seq, sep): | 
 | 109 |     # internal: join into string having the same type as sep | 
| Eric S. Raymond | b08b2d3 | 2001-02-09 11:10:16 +0000 | [diff] [blame] | 110 |     return sep[:0].join(seq) | 
| Fredrik Lundh | 8a3ebf8 | 2000-07-23 21:46:17 +0000 | [diff] [blame] | 111 |  | 
| Fredrik Lundh | 7898c3e | 2000-08-07 20:59:04 +0000 | [diff] [blame] | 112 | def _compile(*key): | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 113 |     # internal: compile pattern | 
| Fredrik Lundh | 7898c3e | 2000-08-07 20:59:04 +0000 | [diff] [blame] | 114 |     p = _cache.get(key) | 
 | 115 |     if p is not None: | 
 | 116 |         return p | 
 | 117 |     pattern, flags = key | 
 | 118 |     if type(pattern) not in sre_compile.STRING_TYPES: | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 119 |         return pattern | 
| Fredrik Lundh | e186983 | 2000-08-01 22:47:49 +0000 | [diff] [blame] | 120 |     try: | 
 | 121 |         p = sre_compile.compile(pattern, flags) | 
 | 122 |     except error, v: | 
 | 123 |         raise error, v # invalid expression | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 124 |     if len(_cache) >= _MAXCACHE: | 
 | 125 |         _cache.clear() | 
 | 126 |     _cache[key] = p | 
 | 127 |     return p | 
 | 128 |  | 
| Fredrik Lundh | 5644b7f | 2000-09-21 17:03:25 +0000 | [diff] [blame] | 129 | def _expand(pattern, match, template): | 
 | 130 |     # internal: match.expand implementation hook | 
 | 131 |     template = sre_parse.parse_template(template, pattern) | 
 | 132 |     return sre_parse.expand_template(template, match) | 
 | 133 |  | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 134 | def _sub(pattern, template, string, count=0): | 
 | 135 |     # internal: pattern.sub implementation hook | 
 | 136 |     return _subn(pattern, template, string, count)[0] | 
 | 137 |  | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 138 | def _subn(pattern, template, string, count=0): | 
 | 139 |     # internal: pattern.subn implementation hook | 
 | 140 |     if callable(template): | 
| Andrew M. Kuchling | e8d52af | 2000-06-18 20:27:10 +0000 | [diff] [blame] | 141 |         filter = template | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 142 |     else: | 
| Fredrik Lundh | 90a0791 | 2000-06-30 07:50:59 +0000 | [diff] [blame] | 143 |         template = sre_parse.parse_template(template, pattern) | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 144 |         def filter(match, template=template): | 
| Fredrik Lundh | 436c3d58 | 2000-06-29 08:58:44 +0000 | [diff] [blame] | 145 |             return sre_parse.expand_template(template, match) | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 146 |     n = i = 0 | 
 | 147 |     s = [] | 
 | 148 |     append = s.append | 
| Fredrik Lundh | be2211e | 2000-06-29 16:57:40 +0000 | [diff] [blame] | 149 |     c = pattern.scanner(string) | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 150 |     while not count or n < count: | 
 | 151 |         m = c.search() | 
 | 152 |         if not m: | 
 | 153 |             break | 
| Fredrik Lundh | 90a0791 | 2000-06-30 07:50:59 +0000 | [diff] [blame] | 154 |         b, e = m.span() | 
| Fredrik Lundh | 01016fe | 2000-06-30 00:27:46 +0000 | [diff] [blame] | 155 |         if i < b: | 
 | 156 |             append(string[i:b]) | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 157 |         append(filter(m)) | 
| Fredrik Lundh | 90a0791 | 2000-06-30 07:50:59 +0000 | [diff] [blame] | 158 |         i = e | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 159 |         n = n + 1 | 
| Fredrik Lundh | 01016fe | 2000-06-30 00:27:46 +0000 | [diff] [blame] | 160 |     append(string[i:]) | 
| Fredrik Lundh | 8a3ebf8 | 2000-07-23 21:46:17 +0000 | [diff] [blame] | 161 |     return _join(s, string[:0]), n | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 162 |  | 
 | 163 | def _split(pattern, string, maxsplit=0): | 
 | 164 |     # internal: pattern.split implementation hook | 
 | 165 |     n = i = 0 | 
 | 166 |     s = [] | 
 | 167 |     append = s.append | 
| Fredrik Lundh | be2211e | 2000-06-29 16:57:40 +0000 | [diff] [blame] | 168 |     extend = s.extend | 
 | 169 |     c = pattern.scanner(string) | 
| Fredrik Lundh | 01016fe | 2000-06-30 00:27:46 +0000 | [diff] [blame] | 170 |     g = pattern.groups | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 171 |     while not maxsplit or n < maxsplit: | 
 | 172 |         m = c.search() | 
 | 173 |         if not m: | 
 | 174 |             break | 
| Fredrik Lundh | 90a0791 | 2000-06-30 07:50:59 +0000 | [diff] [blame] | 175 |         b, e = m.span() | 
 | 176 |         if b == e: | 
 | 177 |             if i >= len(string): | 
 | 178 |                 break | 
 | 179 |             continue | 
| Fredrik Lundh | be2211e | 2000-06-29 16:57:40 +0000 | [diff] [blame] | 180 |         append(string[i:b]) | 
| Fredrik Lundh | 90a0791 | 2000-06-30 07:50:59 +0000 | [diff] [blame] | 181 |         if g and b != e: | 
| Fredrik Lundh | 1c5aa69 | 2001-01-16 07:37:30 +0000 | [diff] [blame] | 182 |             extend(list(m.groups())) | 
| Fredrik Lundh | 90a0791 | 2000-06-30 07:50:59 +0000 | [diff] [blame] | 183 |         i = e | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 184 |         n = n + 1 | 
| Fredrik Lundh | 8094611 | 2000-06-29 18:03:25 +0000 | [diff] [blame] | 185 |     append(string[i:]) | 
| Jeremy Hylton | b1aa195 | 2000-06-01 17:39:12 +0000 | [diff] [blame] | 186 |     return s | 
| Fredrik Lundh | 0640e11 | 2000-06-30 13:55:15 +0000 | [diff] [blame] | 187 |  | 
 | 188 | # register myself for pickling | 
 | 189 |  | 
 | 190 | import copy_reg | 
 | 191 |  | 
 | 192 | def _pickle(p): | 
 | 193 |     return _compile, (p.pattern, p.flags) | 
 | 194 |  | 
| Fredrik Lundh | 7898c3e | 2000-08-07 20:59:04 +0000 | [diff] [blame] | 195 | copy_reg.pickle(type(_compile("", 0)), _pickle, _compile) | 
| Fredrik Lundh | 7cafe4d | 2000-07-02 17:33:27 +0000 | [diff] [blame] | 196 |  | 
 | 197 | # -------------------------------------------------------------------- | 
 | 198 | # experimental stuff (see python-dev discussions for details) | 
 | 199 |  | 
 | 200 | class Scanner: | 
 | 201 |     def __init__(self, lexicon): | 
| Fredrik Lundh | 29c4ba9 | 2000-08-01 18:20:07 +0000 | [diff] [blame] | 202 |         from sre_constants import BRANCH, SUBPATTERN | 
| Fredrik Lundh | 7cafe4d | 2000-07-02 17:33:27 +0000 | [diff] [blame] | 203 |         self.lexicon = lexicon | 
| Fredrik Lundh | 8a3ebf8 | 2000-07-23 21:46:17 +0000 | [diff] [blame] | 204 |         # combine phrases into a compound pattern | 
| Fredrik Lundh | 7cafe4d | 2000-07-02 17:33:27 +0000 | [diff] [blame] | 205 |         p = [] | 
| Fredrik Lundh | 8a3ebf8 | 2000-07-23 21:46:17 +0000 | [diff] [blame] | 206 |         s = sre_parse.Pattern() | 
| Fredrik Lundh | 7cafe4d | 2000-07-02 17:33:27 +0000 | [diff] [blame] | 207 |         for phrase, action in lexicon: | 
| Fredrik Lundh | 8a3ebf8 | 2000-07-23 21:46:17 +0000 | [diff] [blame] | 208 |             p.append(sre_parse.SubPattern(s, [ | 
| Fredrik Lundh | 29c4ba9 | 2000-08-01 18:20:07 +0000 | [diff] [blame] | 209 |                 (SUBPATTERN, (len(p), sre_parse.parse(phrase))), | 
| Fredrik Lundh | 8a3ebf8 | 2000-07-23 21:46:17 +0000 | [diff] [blame] | 210 |                 ])) | 
 | 211 |         p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) | 
 | 212 |         s.groups = len(p) | 
 | 213 |         self.scanner = sre_compile.compile(p) | 
| Fredrik Lundh | 7cafe4d | 2000-07-02 17:33:27 +0000 | [diff] [blame] | 214 |     def scan(self, string): | 
 | 215 |         result = [] | 
 | 216 |         append = result.append | 
 | 217 |         match = self.scanner.match | 
 | 218 |         i = 0 | 
 | 219 |         while 1: | 
 | 220 |             m = match(string, i) | 
 | 221 |             if not m: | 
 | 222 |                 break | 
 | 223 |             j = m.end() | 
 | 224 |             if i == j: | 
 | 225 |                 break | 
| Fredrik Lundh | 019bcb5 | 2000-07-02 22:59:57 +0000 | [diff] [blame] | 226 |             action = self.lexicon[m.lastindex][1] | 
| Fredrik Lundh | 7cafe4d | 2000-07-02 17:33:27 +0000 | [diff] [blame] | 227 |             if callable(action): | 
| Fredrik Lundh | 770617b | 2001-01-14 15:06:11 +0000 | [diff] [blame] | 228 |                 self.match = m | 
| Fredrik Lundh | 7cafe4d | 2000-07-02 17:33:27 +0000 | [diff] [blame] | 229 |                 action = action(self, m.group()) | 
 | 230 |             if action is not None: | 
 | 231 |                 append(action) | 
 | 232 |             i = j | 
 | 233 |         return result, string[i:] |