Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 1 | #!/usr/bin/env python |
| 2 | # -*- mode: python -*- |
| 3 | # $Id$ |
| 4 | |
Guido van Rossum | bf9d353 | 1997-10-06 14:45:17 +0000 | [diff] [blame] | 5 | |
| 6 | import sys |
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 7 | import string |
Guido van Rossum | bf9d353 | 1997-10-06 14:45:17 +0000 | [diff] [blame] | 8 | from pcre import * |
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 9 | |
# Parsing contexts understood by _expand_escape().
NORMAL, CHARCLASS, REPLACEMENT = 0, 1, 2

# Token kinds; _expand_escape() returns one of these as the first item
# of its result tuple.
(CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET,
 WORD_BOUNDARY, NOT_WORD_BOUNDARY,
 BEGINNING_OF_BUFFER, END_OF_BUFFER) = 0, 1, 2, 3, 4, 5, 6, 7, 8
| 12 | |
| 13 | # |
| 14 | # First, the public part of the interface: |
| 15 | # |
| 16 | |
| 17 | # pcre.error and re.error should be the same, since exceptions can be |
Guido van Rossum | 6af4abd | 1997-08-13 03:25:34 +0000 | [diff] [blame] | 18 | # raised from either module. |
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 19 | |
| 20 | # compilation flags |
| 21 | |
# One-letter aliases for the long flag names (the long names are not
# defined in this file; presumably they arrive via "from pcre import *").
I, M, S, X = IGNORECASE, MULTILINE, DOTALL, VERBOSE
Guido van Rossum | 09bcfd6 | 1997-07-15 15:38:20 +0000 | [diff] [blame] | 26 | |
| 27 | # |
| 28 | # |
| 29 | # |
| 30 | |
_MAXCACHE = 20      # Entries allowed before the whole cache is flushed
_cache = {}         # Maps (pattern, flags) to the compiled pattern object

def _cachecompile(pattern, flags=0):
    """Return the compiled form of pattern, reusing a cached copy if any.

    Compiled patterns are remembered in _cache under the key
    (pattern, flags).  When the cache already holds _MAXCACHE entries it
    is emptied completely before the new entry is stored.
    """
    key = (pattern, flags)
    if _cache.has_key(key):
        return _cache[key]
    compiled = compile(pattern, flags)
    if len(_cache) >= _MAXCACHE:
        _cache.clear()
    _cache[key] = compiled
    return compiled
| 45 | |
def match(pattern, string, flags=0):
    """Apply pattern at the start of string.

    pattern is either a pattern string, which is compiled with flags
    through the pattern cache, or an already compiled pattern object, in
    which case flags is ignored.  The result is whatever the pattern
    object's match() method returns.
    """
    # Accept compiled patterns too, the same way sub(), subn() and
    # split() do, instead of trying to compile them a second time.
    if type(pattern) == type(''):
        pattern = _cachecompile(pattern, flags)
    return pattern.match(string)
Guido van Rossum | a0e4c1b | 1997-07-17 14:52:48 +0000 | [diff] [blame] | 48 | |
def search(pattern, string, flags=0):
    """Scan string for a place where pattern matches.

    pattern is either a pattern string, which is compiled with flags
    through the pattern cache, or an already compiled pattern object, in
    which case flags is ignored.  The result is whatever the pattern
    object's search() method returns.
    """
    # Accept compiled patterns too, the same way sub(), subn() and
    # split() do, instead of trying to compile them a second time.
    if type(pattern) == type(''):
        pattern = _cachecompile(pattern, flags)
    return pattern.search(string)
Guido van Rossum | a0e4c1b | 1997-07-17 14:52:48 +0000 | [diff] [blame] | 51 | |
def sub(pattern, repl, string, count=0):
    """Return string with matches of pattern replaced by repl.

    pattern may be a pattern string (compiled through the cache) or a
    compiled pattern object; the work is done by its sub() method.
    """
    if type(pattern) != type(''):
        regex = pattern
    else:
        regex = _cachecompile(pattern)
    return regex.sub(repl, string, count)
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 56 | |
def subn(pattern, repl, string, count=0):
    """Like sub(), but return what the pattern object's subn() returns.

    pattern may be a pattern string (compiled through the cache) or a
    compiled pattern object.
    """
    if type(pattern) != type(''):
        regex = pattern
    else:
        regex = _cachecompile(pattern)
    return regex.subn(repl, string, count)
Guido van Rossum | a0e4c1b | 1997-07-17 14:52:48 +0000 | [diff] [blame] | 61 | |
def split(pattern, string, maxsplit=0):
    """Split string by the occurrences of pattern.

    pattern may be a pattern string (compiled through the cache) or a
    compiled pattern object; the work is done by its split() method.
    """
    if type(pattern) != type(''):
        regex = pattern
    else:
        regex = _cachecompile(pattern)
    return regex.split(string, maxsplit)
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 66 | |
| 67 | # |
| 68 | # |
| 69 | # |
| 70 | |
| 71 | class RegexObject: |
Guido van Rossum | bf9d353 | 1997-10-06 14:45:17 +0000 | [diff] [blame] | 72 | def __init__(self, pattern, flags, code, groupindex): |
| 73 | self.code = code |
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 74 | self.flags = flags |
| 75 | self.pattern = pattern |
| 76 | self.groupindex = groupindex |
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 77 | def search(self, string, pos=0): |
Guido van Rossum | bf9d353 | 1997-10-06 14:45:17 +0000 | [diff] [blame] | 78 | regs = self.code.match(string, pos, 0) |
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 79 | if regs is None: |
| 80 | return None |
Guido van Rossum | bf9d353 | 1997-10-06 14:45:17 +0000 | [diff] [blame] | 81 | self.num_regs=len(regs) |
Guido van Rossum | a0e4c1b | 1997-07-17 14:52:48 +0000 | [diff] [blame] | 82 | |
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 83 | return MatchObject(self, |
| 84 | string, |
| 85 | pos, |
| 86 | regs) |
Guido van Rossum | a0e4c1b | 1997-07-17 14:52:48 +0000 | [diff] [blame] | 87 | |
Guido van Rossum | 8a9a4a2 | 1997-07-11 20:48:25 +0000 | [diff] [blame] | 88 | def match(self, string, pos=0): |
Guido van Rossum | bf9d353 | 1997-10-06 14:45:17 +0000 | [diff] [blame] | 89 | regs = self.code.match(string, pos, ANCHORED) |
Guido van Rossum | 8a9a4a2 | 1997-07-11 20:48:25 +0000 | [diff] [blame] | 90 | if regs is None: |
| 91 | return None |
Guido van Rossum | bf9d353 | 1997-10-06 14:45:17 +0000 | [diff] [blame] | 92 | self.num_regs=len(regs)/2 |
Guido van Rossum | 8a9a4a2 | 1997-07-11 20:48:25 +0000 | [diff] [blame] | 93 | return MatchObject(self, |
| 94 | string, |
| 95 | pos, |
| 96 | regs) |
Guido van Rossum | a0e4c1b | 1997-07-17 14:52:48 +0000 | [diff] [blame] | 97 | |
Guido van Rossum | 8a9a4a2 | 1997-07-11 20:48:25 +0000 | [diff] [blame] | 98 | def sub(self, repl, string, count=0): |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 99 | return self.subn(repl, string, count)[0] |
Guido van Rossum | a0e4c1b | 1997-07-17 14:52:48 +0000 | [diff] [blame] | 100 | |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 101 | def subn(self, repl, source, count=0): |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 102 | if count < 0: |
Guido van Rossum | bf9d353 | 1997-10-06 14:45:17 +0000 | [diff] [blame] | 103 | raise error, "negative substitution count" |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 104 | if count == 0: |
| 105 | import sys |
| 106 | count = sys.maxint |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 107 | if type(repl) == type(''): |
| 108 | if '\\' in repl: |
Guido van Rossum | bf9d353 | 1997-10-06 14:45:17 +0000 | [diff] [blame] | 109 | repl = lambda m, r=repl: pcre_expand(m, r) |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 110 | else: |
| 111 | repl = lambda m, r=repl: r |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 112 | n = 0 # Number of matches |
| 113 | pos = 0 # Where to start searching |
| 114 | lastmatch = -1 # End of last match |
| 115 | results = [] # Substrings making up the result |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 116 | end = len(source) |
| 117 | while n < count and pos <= end: |
| 118 | m = self.search(source, pos) |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 119 | if not m: |
| 120 | break |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 121 | i, j = m.span(0) |
| 122 | if i == j == lastmatch: |
| 123 | # Empty match adjacent to previous match |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 124 | pos = pos + 1 |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 125 | results.append(source[lastmatch:pos]) |
| 126 | continue |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 127 | if pos < i: |
| 128 | results.append(source[pos:i]) |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 129 | results.append(repl(m)) |
| 130 | pos = lastmatch = j |
| 131 | if i == j: |
| 132 | # Last match was empty; don't try here again |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 133 | pos = pos + 1 |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 134 | results.append(source[lastmatch:pos]) |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 135 | n = n + 1 |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 136 | results.append(source[pos:]) |
| 137 | return (string.join(results, ''), n) |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 138 | |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 139 | def split(self, source, maxsplit=0): |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 140 | if maxsplit < 0: |
| 141 | raise error, "negative split count" |
| 142 | if maxsplit == 0: |
| 143 | import sys |
| 144 | maxsplit = sys.maxint |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 145 | n = 0 |
| 146 | pos = 0 |
| 147 | lastmatch = 0 |
| 148 | results = [] |
| 149 | end = len(source) |
| 150 | while n < maxsplit: |
| 151 | m = self.search(source, pos) |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 152 | if not m: |
| 153 | break |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 154 | i, j = m.span(0) |
| 155 | if i == j: |
| 156 | # Empty match |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 157 | if pos >= end: |
| 158 | break |
Guido van Rossum | 9e18ec7 | 1997-07-17 22:39:13 +0000 | [diff] [blame] | 159 | pos = pos+1 |
| 160 | continue |
| 161 | results.append(source[lastmatch:i]) |
| 162 | g = m.group() |
| 163 | if g: |
| 164 | results[len(results):] = list(g) |
| 165 | pos = lastmatch = j |
| 166 | results.append(source[lastmatch:]) |
| 167 | return results |
| 168 | |
class MatchObject:
    """The result of a successful match.

    Attributes:
      re     -- the pattern object that produced the match; its
                groupindex dictionary maps group names to numbers
      string -- the string that was searched
      pos    -- the position the search started from
      regs   -- sequence indexed by group number whose items are
                (start, end) pairs; -1 marks a group that did not match
    """

    def __init__(self, re, string, pos, regs):
        self.re = re
        self.string = string
        self.pos = pos
        self.regs = regs

    def _index(self, g):
        """Return the group number for g, translating a group name.

        Non-string values are returned unchanged.  Raises IndexError for
        an unknown group name.
        """
        if type(g) == type(''):
            try:
                g = self.re.groupindex[g]
            except (KeyError, TypeError):
                raise IndexError('group "' + g + '" is undefined')
        return g

    def start(self, g=0):
        """Return the start offset of group g (default: the whole match)."""
        return self.regs[self._index(g)][0]

    def end(self, g=0):
        """Return the end offset of group g (default: the whole match)."""
        return self.regs[self._index(g)][1]

    def span(self, g=0):
        """Return the (start, end) entry of group g (default: whole match)."""
        return self.regs[self._index(g)]

    def group(self, *groups):
        """Return the text matched by one or more groups.

        Groups are given by number or by name.  With one argument the
        result is a single value; with several it is a tuple.  With no
        arguments the result is a tuple of all groups from 1 upwards.
        A group that did not match yields None.

        Raises IndexError for an unknown name or a number that is too
        large.
        """
        use_all = len(groups) == 0
        if use_all:
            # Count the groups from this match's own registers rather
            # than from self.re.num_regs, which every later search() or
            # match() on the same pattern object overwrites.
            groups = range(1, len(self.regs))
        result = []
        for g in groups:
            g = self._index(g)
            if len(self.regs) <= g:
                raise IndexError('group "' + str(g) + '" is undefined')
            reg = self.regs[g]
            if (reg[0] == -1) or (reg[1] == -1):
                result.append(None)
            else:
                result.append(self.string[reg[0]:reg[1]])
        if use_all or len(result) > 1:
            return tuple(result)
        # Exactly one group was requested explicitly.
        return result[0]
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 224 | |
def escape(pattern):
    """Return pattern with all non-alphanumeric characters escaped.

    Letters, digits and the underscore are copied unchanged; every other
    character is preceded by a backslash.  A NUL character is written as
    the octal escape \\000 so that the result never contains a raw NUL.
    """
    alphanum = string.letters + '_' + string.digits
    result = []
    for char in pattern:
        if char in alphanum:
            result.append(char)
        elif char == '\000':
            # Backslash followed by a raw NUL would leave a NUL byte
            # inside the escaped pattern; spell it out instead.
            result.append('\\000')
        else:
            result.append('\\' + char)
    return string.join(result, '')
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 233 | |
def valid_identifier(id):
    """Return 1 if id looks like an identifier, 0 otherwise.

    An identifier is a letter or underscore followed by any number of
    letters, digits and underscores.  The empty string is rejected.
    """
    import string
    if len(id) == 0:
        return 0
    first_chars = string.letters + '_'
    if id[0] not in first_chars:
        return 0
    # The remaining characters are checked against an explicit set; the
    # syntax_table/word names used here before are not defined anywhere
    # in this module.
    other_chars = first_chars + string.digits
    for char in id[1:]:
        if char not in other_chars:
            return 0
    return 1
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 244 | |
def compile(pattern, flags=0):
    """Compile pattern and return it wrapped in a RegexObject.

    A fresh dictionary is handed to pcre_compile() and then stored on
    the RegexObject as its groupindex.
    """
    names = {}
    return RegexObject(pattern, flags,
                       pcre_compile(pattern, flags, names), names)
| 249 | |
| 250 | def _expand(m, repl): |
| 251 | results = [] |
| 252 | index = 0 |
| 253 | size = len(repl) |
| 254 | while index < size: |
| 255 | found = string.find(repl, '\\', index) |
| 256 | if found < 0: |
| 257 | results.append(repl[index:]) |
| 258 | break |
| 259 | if found > index: |
| 260 | results.append(repl[index:found]) |
| 261 | escape_type, value, index = _expand_escape(repl, found+1, REPLACEMENT) |
| 262 | if escape_type == CHAR: |
| 263 | results.append(value) |
| 264 | elif escape_type == MEMORY_REFERENCE: |
| 265 | r = m.group(value) |
| 266 | if r is None: |
| 267 | raise error, ('group "' + str(value) + '" did not contribute ' |
| 268 | 'to the match') |
| 269 | results.append(m.group(value)) |
| 270 | else: |
| 271 | raise error, "bad escape in replacement" |
| 272 | return string.join(results, '') |
Guido van Rossum | 5ca1b71 | 1997-07-10 21:00:31 +0000 | [diff] [blame] | 273 | |
Guido van Rossum | bf9d353 | 1997-10-06 14:45:17 +0000 | [diff] [blame] | 274 | def _expand_escape(pattern, index, context=NORMAL): |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 275 | if index >= len(pattern): |
| 276 | raise error, 'escape ends too soon' |
| 277 | |
| 278 | elif pattern[index] == 't': |
| 279 | return CHAR, chr(9), index + 1 |
| 280 | |
| 281 | elif pattern[index] == 'n': |
| 282 | return CHAR, chr(10), index + 1 |
| 283 | |
Guido van Rossum | 6af4abd | 1997-08-13 03:25:34 +0000 | [diff] [blame] | 284 | elif pattern[index] == 'v': |
| 285 | return CHAR, chr(11), index + 1 |
| 286 | |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 287 | elif pattern[index] == 'r': |
| 288 | return CHAR, chr(13), index + 1 |
| 289 | |
| 290 | elif pattern[index] == 'f': |
| 291 | return CHAR, chr(12), index + 1 |
| 292 | |
| 293 | elif pattern[index] == 'a': |
| 294 | return CHAR, chr(7), index + 1 |
| 295 | |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 296 | elif pattern[index] == 'x': |
| 297 | # CAUTION: this is the Python rule, not the Perl rule! |
Guido van Rossum | 6af4abd | 1997-08-13 03:25:34 +0000 | [diff] [blame] | 298 | end = index + 1 # Skip over the 'x' character |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 299 | while (end < len(pattern)) and (pattern[end] in string.hexdigits): |
| 300 | end = end + 1 |
| 301 | if end == index: |
| 302 | raise error, "\\x must be followed by hex digit(s)" |
| 303 | # let Python evaluate it, so we don't incorrectly 2nd-guess |
| 304 | # what it's doing (and Python in turn passes it on to sscanf, |
| 305 | # so that *it* doesn't incorrectly 2nd-guess what C does!) |
Guido van Rossum | 6af4abd | 1997-08-13 03:25:34 +0000 | [diff] [blame] | 306 | char = eval ('"' + pattern[index-1:end] + '"') |
Guido van Rossum | bf9d353 | 1997-10-06 14:45:17 +0000 | [diff] [blame] | 307 | # assert len(char) == 1 |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 308 | return CHAR, char, end |
| 309 | |
| 310 | elif pattern[index] == 'b': |
| 311 | if context != NORMAL: |
| 312 | return CHAR, chr(8), index + 1 |
| 313 | else: |
| 314 | return WORD_BOUNDARY, '', index + 1 |
| 315 | |
| 316 | elif pattern[index] == 'B': |
| 317 | if context != NORMAL: |
| 318 | return CHAR, 'B', index + 1 |
| 319 | else: |
| 320 | return NOT_WORD_BOUNDARY, '', index + 1 |
| 321 | |
| 322 | elif pattern[index] == 'A': |
| 323 | if context != NORMAL: |
| 324 | return CHAR, 'A', index + 1 |
| 325 | else: |
| 326 | return BEGINNING_OF_BUFFER, '', index + 1 |
| 327 | |
| 328 | elif pattern[index] == 'Z': |
| 329 | if context != NORMAL: |
Guido van Rossum | 6af4abd | 1997-08-13 03:25:34 +0000 | [diff] [blame] | 330 | return CHAR, 'Z', index + 1 |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 331 | else: |
| 332 | return END_OF_BUFFER, '', index + 1 |
| 333 | |
| 334 | elif pattern[index] in 'GluLUQE': |
Guido van Rossum | 6af4abd | 1997-08-13 03:25:34 +0000 | [diff] [blame] | 335 | raise error, ('\\' + pattern[index] + ' is not allowed') |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 336 | |
| 337 | elif pattern[index] == 'w': |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 338 | return CHAR, 'w', index + 1 |
| 339 | |
| 340 | elif pattern[index] == 'W': |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 341 | return CHAR, 'W', index + 1 |
| 342 | |
| 343 | elif pattern[index] == 's': |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 344 | return CHAR, 's', index + 1 |
| 345 | |
| 346 | elif pattern[index] == 'S': |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 347 | return CHAR, 'S', index + 1 |
| 348 | |
| 349 | elif pattern[index] == 'd': |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 350 | return CHAR, 'd', index + 1 |
| 351 | |
| 352 | elif pattern[index] == 'D': |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 353 | return CHAR, 'D', index + 1 |
| 354 | |
| 355 | elif pattern[index] in '0123456789': |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 356 | |
Guido van Rossum | 9f845ec | 1997-07-15 18:11:42 +0000 | [diff] [blame] | 357 | if pattern[index] == '0': |
| 358 | if (index + 1 < len(pattern)) and \ |
| 359 | (pattern[index + 1] in string.octdigits): |
| 360 | if (index + 2 < len(pattern)) and \ |
| 361 | (pattern[index + 2] in string.octdigits): |
| 362 | value = string.atoi(pattern[index:index + 3], 8) |
| 363 | index = index + 3 |
| 364 | |
| 365 | else: |
| 366 | value = string.atoi(pattern[index:index + 2], 8) |
| 367 | index = index + 2 |
| 368 | |
| 369 | else: |
| 370 | value = 0 |
| 371 | index = index + 1 |
| 372 | |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 373 | if value > 255: |
Guido van Rossum | 9f845ec | 1997-07-15 18:11:42 +0000 | [diff] [blame] | 374 | raise error, 'octal value out of range' |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 375 | |
Guido van Rossum | 9f845ec | 1997-07-15 18:11:42 +0000 | [diff] [blame] | 376 | return CHAR, chr(value), index |
| 377 | |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 378 | else: |
Guido van Rossum | 9f845ec | 1997-07-15 18:11:42 +0000 | [diff] [blame] | 379 | if (index + 1 < len(pattern)) and \ |
| 380 | (pattern[index + 1] in string.digits): |
| 381 | if (index + 2 < len(pattern)) and \ |
| 382 | (pattern[index + 2] in string.octdigits) and \ |
| 383 | (pattern[index + 1] in string.octdigits) and \ |
| 384 | (pattern[index] in string.octdigits): |
| 385 | value = string.atoi(pattern[index:index + 3], 8) |
| 386 | if value > 255: |
| 387 | raise error, 'octal value out of range' |
| 388 | |
| 389 | return CHAR, chr(value), index + 3 |
| 390 | |
| 391 | else: |
| 392 | value = string.atoi(pattern[index:index + 2]) |
| 393 | if (value < 1) or (value > 99): |
| 394 | raise error, 'memory reference out of range' |
| 395 | |
| 396 | if context == CHARCLASS: |
| 397 | raise error, ('cannot reference a register from ' |
| 398 | 'inside a character class') |
| 399 | return MEMORY_REFERENCE, value, index + 2 |
| 400 | |
| 401 | else: |
| 402 | if context == CHARCLASS: |
| 403 | raise error, ('cannot reference a register from ' |
| 404 | 'inside a character class') |
| 405 | |
| 406 | value = string.atoi(pattern[index]) |
| 407 | return MEMORY_REFERENCE, value, index + 1 |
| 408 | |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 409 | elif pattern[index] == 'g': |
| 410 | if context != REPLACEMENT: |
| 411 | return CHAR, 'g', index + 1 |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 412 | |
Guido van Rossum | 71fa97c | 1997-07-18 04:26:03 +0000 | [diff] [blame] | 413 | index = index + 1 |
| 414 | if index >= len(pattern): |
| 415 | raise error, 'unfinished symbolic reference' |
| 416 | if pattern[index] != '<': |
| 417 | raise error, 'missing < in symbolic reference' |
| 418 | |
| 419 | index = index + 1 |
| 420 | end = string.find(pattern, '>', index) |
| 421 | if end == -1: |
| 422 | raise error, 'unfinished symbolic reference' |
| 423 | value = pattern[index:end] |
| 424 | if not valid_identifier(value): |
| 425 | raise error, 'illegal symbolic reference' |
| 426 | return MEMORY_REFERENCE, value, end + 1 |
| 427 | |
Guido van Rossum | 04a1d74 | 1997-07-15 14:38:13 +0000 | [diff] [blame] | 428 | else: |
| 429 | return CHAR, pattern[index], index + 1 |
| 430 | |