Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 1 | #!/usr/bin/env python |
| 2 | # -*- mode: python -*- |
| 3 | # $Id$ |
| 4 | |
Guido van Rossum | bf9d353 | 1997-10-06 14:45:17 +0000 | [diff] [blame] | 5 | |
| 6 | import sys |
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 7 | import string |
Guido van Rossum | bf9d353 | 1997-10-06 14:45:17 +0000 | [diff] [blame] | 8 | from pcre import * |
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 9 | |
Guido van Rossum | bf9d353 | 1997-10-06 14:45:17 +0000 | [diff] [blame] | 10 | # |
| 11 | # First, the public part of the interface: |
| 12 | # |
| 13 | |
| 14 | # pcre.error and re.error should be the same, since exceptions can be |
Guido van Rossum | 6af4abd | 1997-08-13 03:25:34 +0000 | [diff] [blame] | 15 | # raised from either module. |
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 16 | |
| 17 | # compilation flags |
| 18 | |
# Single-letter shorthands for the compilation flags provided by pcre.
I, M, S, X = IGNORECASE, MULTILINE, DOTALL, VERBOSE
Guido van Rossum | 09bcfd6 | 1997-07-15 15:38:20 +0000 | [diff] [blame] | 23 | |
| 24 | # |
| 25 | # |
| 26 | # |
| 27 | |
Guido van Rossum | 26d80e6 | 1997-07-15 18:59:04 +0000 | [diff] [blame] | 28 | _cache = {} |
| 29 | _MAXCACHE = 20 |
Guido van Rossum | a0e4c1b | 1997-07-17 14:52:48 +0000 | [diff] [blame] | 30 | |
def _cachecompile(pattern, flags=0):
    """Return the compiled form of pattern, reusing a cached copy if any.

    The cache key is the (pattern, flags) pair.  When the cache already
    holds _MAXCACHE entries it is emptied completely before the newly
    compiled object is stored.
    """
    cache_key = (pattern, flags)
    cached = _cache.get(cache_key)
    if cached is not None:
        return cached
    compiled = compile(pattern, flags)
    if len(_cache) >= _MAXCACHE:
        # Full: throw everything away rather than tracking usage.
        _cache.clear()
    _cache[cache_key] = compiled
    return compiled
| 42 | |
def match(pattern, string, flags=0):
    """Compile pattern (through the cache) and call its match() on string.

    Returns whatever the compiled object's match() returns: a MatchObject
    on success, None otherwise.
    """
    regex = _cachecompile(pattern, flags)
    return regex.match(string)
Guido van Rossum | a0e4c1b | 1997-07-17 14:52:48 +0000 | [diff] [blame] | 45 | |
def search(pattern, string, flags=0):
    """Compile pattern (through the cache) and call its search() on string.

    Returns whatever the compiled object's search() returns: a MatchObject
    on success, None otherwise.
    """
    regex = _cachecompile(pattern, flags)
    return regex.search(string)
Guido van Rossum | a0e4c1b | 1997-07-17 14:52:48 +0000 | [diff] [blame] | 48 | |
def sub(pattern, repl, string, count=0):
    """Return string with matches of pattern replaced by repl.

    pattern may be a pattern string, which is compiled through the cache
    with default flags, or any other object, which is used as the
    compiled pattern directly.  repl, string and count are handed
    unchanged to that object's sub() method.
    """
    if type(pattern) != type(''):
        # Not a string: treat it as an already compiled pattern.
        return pattern.sub(repl, string, count)
    return _cachecompile(pattern).sub(repl, string, count)
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 53 | |
def subn(pattern, repl, string, count=0):
    """Like sub(), but return the result of the pattern object's subn().

    pattern may be a pattern string, which is compiled through the cache
    with default flags, or any other object, which is used as the
    compiled pattern directly.  repl, string and count are handed
    unchanged to that object's subn() method.
    """
    if type(pattern) != type(''):
        # Not a string: treat it as an already compiled pattern.
        return pattern.subn(repl, string, count)
    return _cachecompile(pattern).subn(repl, string, count)
Guido van Rossum | a0e4c1b | 1997-07-17 14:52:48 +0000 | [diff] [blame] | 58 | |
def split(pattern, string, maxsplit=0):
    """Split string around matches of pattern.

    pattern may be a pattern string, which is compiled through the cache
    with default flags, or any other object, which is used as the
    compiled pattern directly.  string and maxsplit are handed unchanged
    to that object's split() method.
    """
    if type(pattern) != type(''):
        # Not a string: treat it as an already compiled pattern.
        return pattern.split(string, maxsplit)
    return _cachecompile(pattern).split(string, maxsplit)
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 63 | |
| 64 | # |
| 65 | # |
| 66 | # |
| 67 | |
| 68 | class RegexObject: |
Guido van Rossum | bf9d353 | 1997-10-06 14:45:17 +0000 | [diff] [blame] | 69 | def __init__(self, pattern, flags, code, groupindex): |
| 70 | self.code = code |
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 71 | self.flags = flags |
| 72 | self.pattern = pattern |
| 73 | self.groupindex = groupindex |
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 74 | def search(self, string, pos=0): |
Guido van Rossum | bf9d353 | 1997-10-06 14:45:17 +0000 | [diff] [blame] | 75 | regs = self.code.match(string, pos, 0) |
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 76 | if regs is None: |
| 77 | return None |
Guido van Rossum | bf9d353 | 1997-10-06 14:45:17 +0000 | [diff] [blame] | 78 | self.num_regs=len(regs) |
Guido van Rossum | a0e4c1b | 1997-07-17 14:52:48 +0000 | [diff] [blame] | 79 | |
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 80 | return MatchObject(self, |
| 81 | string, |
| 82 | pos, |
| 83 | regs) |
Guido van Rossum | a0e4c1b | 1997-07-17 14:52:48 +0000 | [diff] [blame] | 84 | |
Guido van Rossum | 8a9a4a2 | 1997-07-11 20:48:25 +0000 | [diff] [blame] | 85 | def match(self, string, pos=0): |
Guido van Rossum | bf9d353 | 1997-10-06 14:45:17 +0000 | [diff] [blame] | 86 | regs = self.code.match(string, pos, ANCHORED) |
Guido van Rossum | 8a9a4a2 | 1997-07-11 20:48:25 +0000 | [diff] [blame] | 87 | if regs is None: |
| 88 | return None |
Guido van Rossum | af8d2bf | 1997-10-27 18:17:19 +0000 | [diff] [blame] | 89 | self.num_regs=len(regs) |
Guido van Rossum | 8a9a4a2 | 1997-07-11 20:48:25 +0000 | [diff] [blame] | 90 | return MatchObject(self, |
| 91 | string, |
| 92 | pos, |
| 93 | regs) |
Guido van Rossum | a0e4c1b | 1997-07-17 14:52:48 +0000 | [diff] [blame] | 94 | |
Guido van Rossum | 8a9a4a2 | 1997-07-11 20:48:25 +0000 | [diff] [blame] | 95 | def sub(self, repl, string, count=0): |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 96 | return self.subn(repl, string, count)[0] |
Guido van Rossum | a0e4c1b | 1997-07-17 14:52:48 +0000 | [diff] [blame] | 97 | |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 98 | def subn(self, repl, source, count=0): |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 99 | if count < 0: |
Guido van Rossum | bf9d353 | 1997-10-06 14:45:17 +0000 | [diff] [blame] | 100 | raise error, "negative substitution count" |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 101 | if count == 0: |
| 102 | import sys |
| 103 | count = sys.maxint |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 104 | if type(repl) == type(''): |
| 105 | if '\\' in repl: |
Guido van Rossum | bf9d353 | 1997-10-06 14:45:17 +0000 | [diff] [blame] | 106 | repl = lambda m, r=repl: pcre_expand(m, r) |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 107 | else: |
| 108 | repl = lambda m, r=repl: r |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 109 | n = 0 # Number of matches |
| 110 | pos = 0 # Where to start searching |
| 111 | lastmatch = -1 # End of last match |
| 112 | results = [] # Substrings making up the result |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 113 | end = len(source) |
| 114 | while n < count and pos <= end: |
| 115 | m = self.search(source, pos) |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 116 | if not m: |
| 117 | break |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 118 | i, j = m.span(0) |
| 119 | if i == j == lastmatch: |
| 120 | # Empty match adjacent to previous match |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 121 | pos = pos + 1 |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 122 | results.append(source[lastmatch:pos]) |
| 123 | continue |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 124 | if pos < i: |
| 125 | results.append(source[pos:i]) |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 126 | results.append(repl(m)) |
| 127 | pos = lastmatch = j |
| 128 | if i == j: |
| 129 | # Last match was empty; don't try here again |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 130 | pos = pos + 1 |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 131 | results.append(source[lastmatch:pos]) |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 132 | n = n + 1 |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 133 | results.append(source[pos:]) |
| 134 | return (string.join(results, ''), n) |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 135 | |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 136 | def split(self, source, maxsplit=0): |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 137 | if maxsplit < 0: |
| 138 | raise error, "negative split count" |
| 139 | if maxsplit == 0: |
| 140 | import sys |
| 141 | maxsplit = sys.maxint |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 142 | n = 0 |
| 143 | pos = 0 |
| 144 | lastmatch = 0 |
| 145 | results = [] |
| 146 | end = len(source) |
| 147 | while n < maxsplit: |
| 148 | m = self.search(source, pos) |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 149 | if not m: |
| 150 | break |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 151 | i, j = m.span(0) |
| 152 | if i == j: |
| 153 | # Empty match |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 154 | if pos >= end: |
| 155 | break |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 156 | pos = pos+1 |
| 157 | continue |
| 158 | results.append(source[lastmatch:i]) |
| 159 | g = m.group() |
| 160 | if g: |
| 161 | results[len(results):] = list(g) |
| 162 | pos = lastmatch = j |
| 163 | results.append(source[lastmatch:]) |
| 164 | return results |
| 165 | |
class MatchObject:
    """Result of a successful match.

    Attributes:
      re     -- the pattern object that produced the match; only its
                groupindex mapping is consulted here
      string -- the string that was searched
      pos    -- the position the search was started from
      regs   -- sequence of (start, end) pairs, one per group, entry 0
                being the whole match; -1 marks a group with no value
    """

    def __init__(self, re, string, pos, regs):
        self.re = re
        self.string = string
        self.pos = pos
        self.regs = regs

    def _index(self, g):
        # Translate a group name into its number via re.groupindex;
        # anything that is not a string is returned unchanged.
        if type(g) == type(''):
            try:
                g = self.re.groupindex[g]
            except (KeyError, TypeError):
                raise IndexError('group "' + g + '" is undefined')
        return g

    def start(self, g):
        """Return the start offset of group g (a number or a name).

        Raises IndexError for an unknown group name.
        """
        return self.regs[self._index(g)][0]

    def end(self, g):
        """Return the end offset of group g (a number or a name).

        Raises IndexError for an unknown group name.
        """
        return self.regs[self._index(g)][1]

    def span(self, g):
        """Return the regs entry, (start, end), of group g.

        g is a number or a name; an unknown name raises IndexError.
        """
        return self.regs[self._index(g)]

    def group(self, *groups):
        """Return the text matched by the requested groups.

        With no arguments, return a tuple holding every group from 1
        upwards.  With one argument return that group's text; with
        several, a tuple of texts.  A group with no value yields None.
        Raises IndexError for an unknown name or a number beyond the
        last group.
        """
        use_all = len(groups) == 0
        if use_all:
            # Size the range from this match's own registers rather than
            # from state stored on the shared pattern object.
            groups = range(1, len(self.regs))
        result = []
        for g in groups:
            g = self._index(g)
            if len(self.regs) <= g:
                raise IndexError('group "' + str(g) + '" is undefined')
            lo = self.regs[g][0]
            hi = self.regs[g][1]
            if lo == -1 or hi == -1:
                result.append(None)
            else:
                result.append(self.string[lo:hi])
        if use_all or len(result) != 1:
            return tuple(result)
        return result[0]
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 221 | |
def escape(pattern):
    """Return pattern with a backslash put in front of every character
    that is not in string.letters, not in string.digits and not '_'."""
    safe = string.letters + '_' + string.digits
    pieces = []
    for ch in pattern:
        if ch in safe:
            pieces.append(ch)
        else:
            pieces.append('\\' + ch)
    return string.join(pieces, '')
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 230 | |
def compile(pattern, flags=0):
    """Compile pattern with pcre_compile and wrap it in a RegexObject.

    A fresh dictionary is passed to pcre_compile as its third argument
    and then stored on the RegexObject as groupindex.
    """
    names = {}
    return RegexObject(pattern, flags,
                       pcre_compile(pattern, flags, names), names)
| 235 | |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 236 | |