| #!/usr/bin/env python |
| # -*- mode: python -*- |
| # $Id$ |
| |
| import string |
| import reop |
| |
| error = 're error' |
| |
| # compilation flags |
| |
| IGNORECASE = I = 0x01 |
| |
| MULTILINE = M = 0x02 |
| DOTALL = S = 0x04 |
| VERBOSE = X = 0x08 |
| |
| repetition_operators = ['*', '*?', '+', '+?', '?', '??', '{n}', '{n}?', |
| '{n,}', '{n,}?', '{n,m}', '{n,m}?'] |
| |
| # |
| # |
| # |
| |
| def valid_identifier(id): |
| if len(id) == 0: |
| return 0 |
| if (not reop.syntax_table[id[0]] & reop.word) or \ |
| (reop.syntax_table[id[0]] & reop.digit): |
| return 0 |
| for char in id[1:]: |
| if not reop.syntax_table[char] & reop.word: |
| return 0 |
| return 1 |
| |
| # |
| # |
| # |
| |
| _cache = {} |
| _MAXCACHE = 20 |
| |
| def _cachecompile(pattern, flags=0): |
| key = (pattern, flags) |
| try: |
| return _cache[key] |
| except KeyError: |
| pass |
| value = compile(pattern, flags) |
| if len(_cache) >= _MAXCACHE: |
| _cache.clear() |
| _cache[key] = value |
| return value |
| |
| def match(pattern, string, flags=0): |
| return _cachecompile(pattern, flags).match(string) |
| |
| def search(pattern, string, flags=0): |
| return _cachecompile(pattern, flags).search(string) |
| |
| def sub(pattern, repl, string, count=0): |
| if type(pattern) == type(''): |
| pattern = _cachecompile(pattern) |
| return pattern.sub(repl, string, count) |
| |
| def subn(pattern, repl, string, count=0): |
| if type(pattern) == type(''): |
| pattern = _cachecompile(pattern) |
| return pattern.subn(repl, string, count) |
| |
| def split(pattern, string, maxsplit=0): |
| if type(pattern) == type(''): |
| pattern = _cachecompile(pattern) |
| return pattern.split(string, maxsplit) |
| |
| # |
| # |
| # |
| |
| def _expand(m, repl): |
| results = [] |
| index = 0 |
| size = len(repl) |
| while index < size: |
| found = string.find(repl, '\\', index) |
| if found < 0: |
| results.append(repl[index:]) |
| break |
| if found > index: |
| results.append(repl[index:found]) |
| escape_type, value, index = expand_escape(repl, found+1, REPLACEMENT) |
| if escape_type == CHAR: |
| results.append(value) |
| elif escape_type == MEMORY_REFERENCE: |
| r = m.group(value) |
| if r is None: |
| raise error, ('group "' + str(value) + '" did not contribute ' |
| 'to the match') |
| results.append(m.group(value)) |
| else: |
| raise error, "bad escape in replacement" |
| return string.join(results, '') |
| |
| class RegexObject: |
| def __init__(self, pattern, flags, code, num_regs, groupindex): |
| self.code = code |
| self.num_regs = num_regs |
| self.flags = flags |
| self.pattern = pattern |
| self.groupindex = groupindex |
| self.fastmap = build_fastmap(code) |
| |
| if code[0].name == 'bol': |
| self.anchor = 1 |
| |
| elif code[0].name == 'begbuf': |
| self.anchor = 2 |
| |
| else: |
| self.anchor = 0 |
| |
| self.buffer = assemble(code) |
| def search(self, string, pos=0): |
| regs = reop.search(self.buffer, |
| self.num_regs, |
| self.flags, |
| self.fastmap.can_be_null, |
| self.fastmap.fastmap(), |
| self.anchor, |
| string, |
| pos) |
| if regs is None: |
| return None |
| |
| return MatchObject(self, |
| string, |
| pos, |
| regs) |
| |
| def match(self, string, pos=0): |
| regs = reop.match(self.buffer, |
| self.num_regs, |
| self.flags, |
| self.fastmap.can_be_null, |
| self.fastmap.fastmap(), |
| self.anchor, |
| string, |
| pos) |
| if regs is None: |
| return None |
| |
| return MatchObject(self, |
| string, |
| pos, |
| regs) |
| |
| def sub(self, repl, string, count=0): |
| return self.subn(repl, string, count)[0] |
| |
| def subn(self, repl, source, count=0): |
| if count < 0: |
| raise ValueError, "negative substibution count" |
| if count == 0: |
| import sys |
| count = sys.maxint |
| if type(repl) == type(''): |
| if '\\' in repl: |
| repl = lambda m, r=repl: _expand(m, r) |
| else: |
| repl = lambda m, r=repl: r |
| n = 0 # Number of matches |
| pos = 0 # Where to start searching |
| lastmatch = -1 # End of last match |
| results = [] # Substrings making up the result |
| end = len(source) |
| while n < count and pos <= end: |
| m = self.search(source, pos) |
| if not m: |
| break |
| i, j = m.span(0) |
| if i == j == lastmatch: |
| # Empty match adjacent to previous match |
| pos = pos + 1 |
| results.append(source[lastmatch:pos]) |
| continue |
| if pos < i: |
| results.append(source[pos:i]) |
| results.append(repl(m)) |
| pos = lastmatch = j |
| if i == j: |
| # Last match was empty; don't try here again |
| pos = pos + 1 |
| results.append(source[lastmatch:pos]) |
| n = n + 1 |
| results.append(source[pos:]) |
| return (string.join(results, ''), n) |
| |
| def split(self, source, maxsplit=0): |
| if maxsplit < 0: |
| raise error, "negative split count" |
| if maxsplit == 0: |
| import sys |
| maxsplit = sys.maxint |
| n = 0 |
| pos = 0 |
| lastmatch = 0 |
| results = [] |
| end = len(source) |
| while n < maxsplit: |
| m = self.search(source, pos) |
| if not m: |
| break |
| i, j = m.span(0) |
| if i == j: |
| # Empty match |
| if pos >= end: |
| break |
| pos = pos+1 |
| continue |
| results.append(source[lastmatch:i]) |
| g = m.group() |
| if g: |
| results[len(results):] = list(g) |
| pos = lastmatch = j |
| results.append(source[lastmatch:]) |
| return results |
| |
| class MatchObject: |
| def __init__(self, re, string, pos, regs): |
| self.re = re |
| self.string = string |
| self.pos = pos |
| self.regs = regs |
| |
| def start(self, g): |
| if type(g) == type(''): |
| try: |
| g = self.re.groupindex[g] |
| except (KeyError, TypeError): |
| raise IndexError, ('group "' + g + '" is undefined') |
| return self.regs[g][0] |
| |
| def end(self, g): |
| if type(g) == type(''): |
| try: |
| g = self.re.groupindex[g] |
| except (KeyError, TypeError): |
| raise IndexError, ('group "' + g + '" is undefined') |
| return self.regs[g][1] |
| |
| def span(self, g): |
| if type(g) == type(''): |
| try: |
| g = self.re.groupindex[g] |
| except (KeyError, TypeError): |
| raise IndexError, ('group "' + g + '" is undefined') |
| return self.regs[g] |
| |
| def group(self, *groups): |
| if len(groups) == 0: |
| groups = range(1, self.re.num_regs) |
| use_all = 1 |
| else: |
| use_all = 0 |
| result = [] |
| for g in groups: |
| if type(g) == type(''): |
| try: |
| g = self.re.groupindex[g] |
| except (KeyError, TypeError): |
| raise IndexError, ('group "' + g + '" is undefined') |
| if (self.regs[g][0] == -1) or (self.regs[g][1] == -1): |
| result.append(None) |
| else: |
| result.append(self.string[self.regs[g][0]:self.regs[g][1]]) |
| if use_all or len(result) > 1: |
| return tuple(result) |
| elif len(result) == 1: |
| return result[0] |
| else: |
| return () |
| |
| # |
| # A set of classes to make assembly a bit easier, if a bit verbose. |
| # |
| |
| class Instruction: |
| def __init__(self, opcode, size=1): |
| self.opcode = opcode |
| self.size = size |
| def assemble(self, position, labels): |
| return self.opcode |
| def __repr__(self): |
| return '%-15s' % (self.name) |
| |
| class End(Instruction): |
| name = 'end' |
| def __init__(self): |
| Instruction.__init__(self, chr(0)) |
| |
| class Bol(Instruction): |
| name = 'bol' |
| def __init__(self): |
| self.name = 'bol' |
| Instruction.__init__(self, chr(1)) |
| |
| class Eol(Instruction): |
| name = 'eol' |
| def __init__(self): |
| Instruction.__init__(self, chr(2)) |
| |
| class Set(Instruction): |
| name = 'set' |
| def __init__(self, set): |
| self.set = set |
| Instruction.__init__(self, chr(3), 33) |
| def assemble(self, position, labels): |
| result = self.opcode |
| temp = 0 |
| for i, c in map(lambda x: (x, chr(x)), range(256)): |
| if c in self.set: |
| temp = temp | (1 << (i & 7)) |
| if (i % 8) == 7: |
| result = result + chr(temp) |
| temp = 0 |
| return result |
| def __repr__(self): |
| result = '%-15s' % (self.name) |
| self.set.sort() |
| for char in self.set: |
| result = result + char |
| return result |
| |
| class Exact(Instruction): |
| name = 'exact' |
| def __init__(self, char): |
| self.char = char |
| Instruction.__init__(self, chr(4), 2) |
| def assemble(self, position, labels): |
| return self.opcode + self.char |
| def __repr__(self): |
| return '%-15s %s' % (self.name, `self.char`) |
| |
| class AnyChar(Instruction): |
| name = 'anychar' |
| def __init__(self): |
| Instruction.__init__(self, chr(5)) |
| def assemble(self, position, labels): |
| return self.opcode |
| |
| class MemoryInstruction(Instruction): |
| def __init__(self, opcode, register): |
| self.register = register |
| Instruction.__init__(self, opcode, 2) |
| def assemble(self, position, labels): |
| return self.opcode + chr(self.register) |
| def __repr__(self): |
| return '%-15s %i' % (self.name, self.register) |
| |
| class StartMemory(MemoryInstruction): |
| name = 'start_memory' |
| def __init__(self, register): |
| MemoryInstruction.__init__(self, chr(6), register) |
| |
| class EndMemory(MemoryInstruction): |
| name = 'end_memory' |
| def __init__(self, register): |
| MemoryInstruction.__init__(self, chr(7), register) |
| |
| class MatchMemory(MemoryInstruction): |
| name = 'match_memory' |
| def __init__(self, register): |
| MemoryInstruction.__init__(self, chr(8), register) |
| |
| class JumpInstruction(Instruction): |
| def __init__(self, opcode, label): |
| self.label = label |
| Instruction.__init__(self, opcode, 3) |
| def compute_offset(self, start, dest): |
| return dest - (start + 3) |
| def pack_offset(self, offset): |
| if offset > 32767: |
| raise error, 'offset out of range (pos)' |
| elif offset < -32768: |
| raise error, 'offset out of range (neg)' |
| elif offset < 0: |
| offset = offset + 65536 |
| return chr(offset & 0xff) + chr((offset >> 8) & 0xff) |
| def assemble(self, position, labels): |
| return self.opcode + \ |
| self.pack_offset(self.compute_offset(position, |
| labels[self.label])) |
| def __repr__(self): |
| return '%-15s %i' % (self.name, self.label) |
| |
| class Jump(JumpInstruction): |
| name = 'jump' |
| def __init__(self, label): |
| JumpInstruction.__init__(self, chr(9), label) |
| |
| class StarJump(JumpInstruction): |
| name = 'star_jump' |
| def __init__(self, label): |
| JumpInstruction.__init__(self, chr(10), label) |
| |
| class FailureJump(JumpInstruction): |
| name = 'failure_jump' |
| def __init__(self, label): |
| JumpInstruction.__init__(self, chr(11), label) |
| |
| class UpdateFailureJump(JumpInstruction): |
| name = 'update_failure_jump' |
| def __init__(self, label): |
| JumpInstruction.__init__(self, chr(12), label) |
| |
| class DummyFailureJump(JumpInstruction): |
| name = 'dummy_failure_jump' |
| def __init__(self, label): |
| JumpInstruction.__init__(self, chr(13), label) |
| |
| class BegBuf(Instruction): |
| name = 'begbuf' |
| def __init__(self): |
| Instruction.__init__(self, chr(14)) |
| |
| class EndBuf(Instruction): |
| name = 'endbuf' |
| def __init__(self): |
| Instruction.__init__(self, chr(15)) |
| |
| class WordBeg(Instruction): |
| name = 'wordbeg' |
| def __init__(self): |
| Instruction.__init__(self, chr(16)) |
| |
| class WordEnd(Instruction): |
| name = 'wordend' |
| def __init__(self): |
| Instruction.__init__(self, chr(17)) |
| |
| class WordBound(Instruction): |
| name = 'wordbound' |
| def __init__(self): |
| Instruction.__init__(self, chr(18)) |
| |
| class NotWordBound(Instruction): |
| name = 'notwordbound' |
| def __init__(self): |
| Instruction.__init__(self, chr(18)) |
| |
| class SyntaxSpec(Instruction): |
| name = 'syntaxspec' |
| def __init__(self, syntax): |
| self.syntax = syntax |
| Instruction.__init__(self, chr(20), 2) |
| def assemble(self, postition, labels): |
| return self.opcode + chr(self.syntax) |
| |
| class NotSyntaxSpec(Instruction): |
| name = 'notsyntaxspec' |
| def __init__(self, syntax): |
| self.syntax = syntax |
| Instruction.__init__(self, chr(21), 2) |
| def assemble(self, postition, labels): |
| return self.opcode + chr(self.syntax) |
| |
| class Label(Instruction): |
| name = 'label' |
| def __init__(self, label): |
| self.label = label |
| Instruction.__init__(self, '', 0) |
| def __repr__(self): |
| return '%-15s %i' % (self.name, self.label) |
| |
| class OpenParen(Instruction): |
| name = '(' |
| def __init__(self, register): |
| self.register = register |
| Instruction.__init__(self, '', 0) |
| def assemble(self, position, labels): |
| raise error, 'unmatched open parenthesis' |
| |
| class Alternation(Instruction): |
| name = '|' |
| def __init__(self): |
| Instruction.__init__(self, '', 0) |
| def assemble(self, position, labels): |
| raise error, 'an alternation was not taken care of' |
| |
| # |
| # |
| # |
| |
| def assemble(instructions): |
| labels = {} |
| position = 0 |
| pass1 = [] |
| for instruction in instructions: |
| if instruction.name == 'label': |
| labels[instruction.label] = position |
| else: |
| pass1.append((position, instruction)) |
| position = position + instruction.size |
| pass2 = '' |
| for position, instruction in pass1: |
| pass2 = pass2 + instruction.assemble(position, labels) |
| return pass2 |
| |
| # |
| # |
| # |
| |
| def escape(pattern): |
| result = [] |
| for char in pattern: |
| if not reop.syntax_table[char] & reop.word: |
| result.append('\\') |
| result.append(char) |
| return string.join(result, '') |
| |
| # |
| # |
| # |
| |
| def registers_used(instructions): |
| result = [] |
| for instruction in instructions: |
| if (instruction.name in ['set_memory', 'end_memory']) and \ |
| (instruction.register not in result): |
| result.append(instruction.register) |
| return result |
| |
| # |
| # |
| # |
| |
| class Fastmap: |
| def __init__(self): |
| self.map = ['\000']*256 |
| self.can_be_null = 0 |
| def add(self, char): |
| self.map[ord(char)] = '\001' |
| def fastmap(self): |
| return string.join(self.map, '') |
| def __getitem__(self, char): |
| return ord(self.map[ord(char)]) |
| def __repr__(self): |
| self.map.sort() |
| return 'Fastmap(' + `self.can_be_null` + ', ' + `self.map` + ')' |
| |
| # |
| # |
| # |
| |
| def find_label(code, label): |
| line = 0 |
| for instruction in code: |
| if (instruction.name == 'label') and (instruction.label == label): |
| return line + 1 |
| line = line + 1 |
| |
| def build_fastmap_aux(code, pos, visited, fastmap): |
| if visited[pos]: |
| return |
| while 1: |
| instruction = code[pos] |
| visited[pos] = 1 |
| pos = pos + 1 |
| if instruction.name == 'end': |
| fastmap.can_be_null = 1 |
| return |
| elif instruction.name == 'syntaxspec': |
| for char in map(chr, range(256)): |
| if reop.syntax_table[char] & instruction.syntax: |
| fastmap.add(char) |
| return |
| elif instruction.name == 'notsyntaxspec': |
| for char in map(chr, range(256)): |
| if not reop.syntax_table[char] & instruction.syntax: |
| fastmap.add(char) |
| return |
| elif instruction.name == 'eol': |
| fastmap.add('\n') |
| if fastmap.can_be_null == 0: |
| fastmap.can_be_null = 2 |
| return |
| elif instruction.name == 'set': |
| for char in instruction.set: |
| fastmap.add(char) |
| return |
| elif instruction.name == 'exact': |
| fastmap.add(instruction.char) |
| elif instruction.name == 'anychar': |
| for char in map(chr, range(256)): |
| if char != '\n': |
| fastmap.add(char) |
| return |
| elif instruction.name == 'match_memory': |
| for char in map(chr, range(256)): |
| fastmap.add(char) |
| fastmap.can_be_null = 1 |
| return |
| elif instruction.name in ['jump', 'dummy_failure_jump', \ |
| 'update_failure_jump', 'star_jump']: |
| pos = find_label(code, instruction.label) |
| if visited[pos]: |
| return |
| visited[pos] = 1 |
| elif instruction.name == 'failure_jump': |
| build_fastmap_aux(code, |
| find_label(code, instruction.label), |
| visited, |
| fastmap) |
| |
| def build_fastmap(code, pos=0): |
| visited = [0] * len(code) |
| fastmap = Fastmap() |
| build_fastmap_aux(code, pos, visited, fastmap) |
| return fastmap |
| |
| # |
| # |
| # |
| |
| [NORMAL, CHARCLASS, REPLACEMENT] = range(3) |
| [CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET, WORD_BOUNDARY, |
| NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER] = range(9) |
| |
| def expand_escape(pattern, index, context=NORMAL): |
| if index >= len(pattern): |
| raise error, 'escape ends too soon' |
| |
| elif pattern[index] == 't': |
| return CHAR, chr(9), index + 1 |
| |
| elif pattern[index] == 'n': |
| return CHAR, chr(10), index + 1 |
| |
| elif pattern[index] == 'r': |
| return CHAR, chr(13), index + 1 |
| |
| elif pattern[index] == 'f': |
| return CHAR, chr(12), index + 1 |
| |
| elif pattern[index] == 'a': |
| return CHAR, chr(7), index + 1 |
| |
| elif pattern[index] == 'e': |
| return CHAR, chr(27), index + 1 |
| |
| elif pattern[index] == 'c': |
| if index + 1 >= len(pattern): |
| raise error, '\\c must be followed by another character' |
| elif pattern[index + 1] in 'abcdefghijklmnopqrstuvwxyz': |
| return CHAR, chr(ord(pattern[index + 1]) - ord('a') + 1), index + 2 |
| else: |
| return CHAR, chr(ord(pattern[index + 1]) ^ 64), index + 2 |
| |
| elif pattern[index] == 'x': |
| # CAUTION: this is the Python rule, not the Perl rule! |
| end = index |
| while (end < len(pattern)) and (pattern[end] in string.hexdigits): |
| end = end + 1 |
| if end == index: |
| raise error, "\\x must be followed by hex digit(s)" |
| # let Python evaluate it, so we don't incorrectly 2nd-guess |
| # what it's doing (and Python in turn passes it on to sscanf, |
| # so that *it* doesn't incorrectly 2nd-guess what C does!) |
| char = eval ('"' + pattern[index-2:end] + '"') |
| assert len(char) == 1 |
| return CHAR, char, end |
| |
| elif pattern[index] == 'b': |
| if context != NORMAL: |
| return CHAR, chr(8), index + 1 |
| else: |
| return WORD_BOUNDARY, '', index + 1 |
| |
| elif pattern[index] == 'B': |
| if context != NORMAL: |
| return CHAR, 'B', index + 1 |
| else: |
| return NOT_WORD_BOUNDARY, '', index + 1 |
| |
| elif pattern[index] == 'A': |
| if context != NORMAL: |
| return CHAR, 'A', index + 1 |
| else: |
| return BEGINNING_OF_BUFFER, '', index + 1 |
| |
| elif pattern[index] == 'Z': |
| if context != NORMAL: |
| return 'Z', index + 1 |
| else: |
| return END_OF_BUFFER, '', index + 1 |
| |
| elif pattern[index] in 'GluLUQE': |
| raise error, ('\\' + ch + ' is not allowed') |
| |
| elif pattern[index] == 'w': |
| if context == NORMAL: |
| return SYNTAX, reop.word, index + 1 |
| elif context == CHARCLASS: |
| set = [] |
| for char in reop.syntax_table.keys(): |
| if reop.syntax_table[char] & reop.word: |
| set.append(char) |
| return SET, set, index + 1 |
| else: |
| return CHAR, 'w', index + 1 |
| |
| elif pattern[index] == 'W': |
| if context == NORMAL: |
| return NOT_SYNTAX, reop.word, index + 1 |
| elif context == CHARCLASS: |
| set = [] |
| for char in reop.syntax_table.keys(): |
| if not reop.syntax_table[char] & reop.word: |
| set.append(char) |
| return SET, set, index + 1 |
| else: |
| return CHAR, 'W', index + 1 |
| |
| elif pattern[index] == 's': |
| if context == NORMAL: |
| return SYNTAX, reop.whitespace, index + 1 |
| elif context == CHARCLASS: |
| set = [] |
| for char in reop.syntax_table.keys(): |
| if reop.syntax_table[char] & reop.whitespace: |
| set.append(char) |
| return SET, set, index + 1 |
| else: |
| return CHAR, 's', index + 1 |
| |
| elif pattern[index] == 'S': |
| if context == NORMAL: |
| return NOT_SYNTAX, reop.whitespace, index + 1 |
| elif context == CHARCLASS: |
| set = [] |
| for char in reop.syntax_table.keys(): |
| if not reop.syntax_table[char] & reop.word: |
| set.append(char) |
| return SET, set, index + 1 |
| else: |
| return CHAR, 'S', index + 1 |
| |
| elif pattern[index] == 'd': |
| if context == NORMAL: |
| return SYNTAX, reop.digit, index + 1 |
| elif context == CHARCLASS: |
| set = [] |
| for char in reop.syntax_table.keys(): |
| if reop.syntax_table[char] & reop.digit: |
| set.append(char) |
| return SET, set, index + 1 |
| else: |
| return CHAR, 'd', index + 1 |
| |
| elif pattern[index] == 'D': |
| if context == NORMAL: |
| return NOT_SYNTAX, reop.digit, index + 1 |
| elif context == CHARCLASS: |
| set = [] |
| for char in reop.syntax_table.keys(): |
| if not reop.syntax_table[char] & reop.digit: |
| set.append(char) |
| return SET, set, index + 1 |
| else: |
| return CHAR, 'D', index + 1 |
| |
| elif pattern[index] in '0123456789': |
| |
| if pattern[index] == '0': |
| if (index + 1 < len(pattern)) and \ |
| (pattern[index + 1] in string.octdigits): |
| if (index + 2 < len(pattern)) and \ |
| (pattern[index + 2] in string.octdigits): |
| value = string.atoi(pattern[index:index + 3], 8) |
| index = index + 3 |
| |
| else: |
| value = string.atoi(pattern[index:index + 2], 8) |
| index = index + 2 |
| |
| else: |
| value = 0 |
| index = index + 1 |
| |
| if value > 255: |
| raise error, 'octal value out of range' |
| |
| return CHAR, chr(value), index |
| |
| else: |
| if (index + 1 < len(pattern)) and \ |
| (pattern[index + 1] in string.digits): |
| if (index + 2 < len(pattern)) and \ |
| (pattern[index + 2] in string.octdigits) and \ |
| (pattern[index + 1] in string.octdigits) and \ |
| (pattern[index] in string.octdigits): |
| value = string.atoi(pattern[index:index + 3], 8) |
| if value > 255: |
| raise error, 'octal value out of range' |
| |
| return CHAR, chr(value), index + 3 |
| |
| else: |
| value = string.atoi(pattern[index:index + 2]) |
| if (value < 1) or (value > 99): |
| raise error, 'memory reference out of range' |
| |
| if context == CHARCLASS: |
| raise error, ('cannot reference a register from ' |
| 'inside a character class') |
| return MEMORY_REFERENCE, value, index + 2 |
| |
| else: |
| if context == CHARCLASS: |
| raise error, ('cannot reference a register from ' |
| 'inside a character class') |
| |
| value = string.atoi(pattern[index]) |
| return MEMORY_REFERENCE, value, index + 1 |
| |
| elif pattern[index] == 'g': |
| if context != REPLACEMENT: |
| return CHAR, 'g', index + 1 |
| |
| index = index + 1 |
| if index >= len(pattern): |
| raise error, 'unfinished symbolic reference' |
| if pattern[index] != '<': |
| raise error, 'missing < in symbolic reference' |
| |
| index = index + 1 |
| end = string.find(pattern, '>', index) |
| if end == -1: |
| raise error, 'unfinished symbolic reference' |
| value = pattern[index:end] |
| if not valid_identifier(value): |
| raise error, 'illegal symbolic reference' |
| return MEMORY_REFERENCE, value, end + 1 |
| |
| else: |
| return CHAR, pattern[index], index + 1 |
| |
| def compile(pattern, flags=0): |
| stack = [] |
| label = 0 |
| register = 1 |
| groupindex = {} |
| lastop = '' |
| |
| # look for embedded pattern modifiers at the beginning of the pattern |
| |
| index = 0 |
| |
| if len(pattern) >= 3 and \ |
| (pattern[:2] == '(?') and \ |
| (pattern[2] in 'iImMsSxX'): |
| index = 2 |
| while (index < len(pattern)) and (pattern[index] != ')'): |
| if pattern[index] in 'iI': |
| flags = flags | IGNORECASE |
| elif pattern[index] in 'mM': |
| flags = flags | MULTILINE |
| elif pattern[index] in 'sS': |
| flags = flags | DOTALL |
| elif pattern[index] in 'xX': |
| flags = flags | VERBOSE |
| else: |
| raise error, 'unknown modifier' |
| index = index + 1 |
| index = index + 1 |
| |
| # compile the rest of the pattern |
| |
| while (index < len(pattern)): |
| char = pattern[index] |
| index = index + 1 |
| if char == '\\': |
| escape_type, value, index = expand_escape(pattern, index) |
| |
| if escape_type == CHAR: |
| stack.append([Exact(value)]) |
| lastop = '\\' + value |
| |
| elif escape_type == MEMORY_REFERENCE: |
| if value >= register: |
| raise error, ('cannot reference a register ' |
| 'not yet used') |
| stack.append([MatchMemory(value)]) |
| lastop = '\\1' |
| |
| elif escape_type == BEGINNING_OF_BUFFER: |
| stack.append([BegBuf()]) |
| lastop = '\\A' |
| |
| elif escape_type == END_OF_BUFFER: |
| stack.append([EndBuf()]) |
| lastop = '\\Z' |
| |
| elif escape_type == WORD_BOUNDARY: |
| stack.append([WordBound()]) |
| lastop = '\\b' |
| |
| elif escape_type == NOT_WORD_BOUNDARY: |
| stack.append([NotWordBound()]) |
| lastop = '\\B' |
| |
| elif escape_type == SYNTAX: |
| stack.append([SyntaxSpec(value)]) |
| if value == reop.word: |
| lastop = '\\w' |
| elif value == reop.whitespace: |
| lastop = '\\s' |
| elif value == reop.digit: |
| lastop = '\\d' |
| else: |
| lastop = '\\?' |
| |
| elif escape_type == NOT_SYNTAX: |
| stack.append([NotSyntaxSpec(value)]) |
| if value == reop.word: |
| lastop = '\\W' |
| elif value == reop.whitespace: |
| lastop = '\\S' |
| elif value == reop.digit: |
| lastop = '\\D' |
| else: |
| lastop = '\\?' |
| |
| elif escape_type == SET: |
| raise error, 'cannot use set escape type here' |
| |
| else: |
| raise error, 'unknown escape type' |
| |
| elif char == '|': |
| expr = [] |
| |
| while (len(stack) != 0) and \ |
| (stack[-1][0].name != '(') and \ |
| (stack[-1][0].name != '|'): |
| expr = stack[-1] + expr |
| del stack[-1] |
| stack.append([FailureJump(label)] + \ |
| expr + \ |
| [Jump(-1), |
| Label(label)]) |
| stack.append([Alternation()]) |
| label = label + 1 |
| lastop = '|' |
| |
| elif char == '(': |
| if index >= len(pattern): |
| raise error, 'no matching close paren' |
| |
| elif pattern[index] == '?': |
| # Perl style (?...) extensions |
| index = index + 1 |
| if index >= len(pattern): |
| raise error, 'extension ends prematurely' |
| |
| elif pattern[index] == 'P': |
| # Python extensions |
| index = index + 1 |
| if index >= len(pattern): |
| raise error, 'extension ends prematurely' |
| |
| elif pattern[index] == '<': |
| # Handle Python symbolic group names (?P<...>...) |
| index = index + 1 |
| end = string.find(pattern, '>', index) |
| if end == -1: |
| raise error, 'no end to symbolic group name' |
| name = pattern[index:end] |
| if not valid_identifier(name): |
| raise error, ('symbolic group name must be a ' |
| 'valid identifier') |
| index = end + 1 |
| groupindex[name] = register |
| stack.append([OpenParen(register)]) |
| register = register + 1 |
| lastop = '(' |
| |
| elif pattern[index] == '=': |
| # backreference to symbolic group name |
| if index >= len(pattern): |
| raise error, '(?P= at the end of the pattern' |
| start = index + 1 |
| end = string.find(pattern, ')', start) |
| if end == -1: |
| raise error, 'no ) to end symbolic group name' |
| name = pattern[start:end] |
| if name not in groupindex.keys(): |
| raise error, ('symbolic group name ' + name + \ |
| ' has not been used yet') |
| stack.append([MatchMemory(groupindex[name])]) |
| index = end + 1 |
| lastop = '(?P=)' |
| |
| else: |
| raise error, ('unknown Python extension: ' + \ |
| pattern[index]) |
| |
| elif pattern[index] == ':': |
| # grouping, but no registers |
| index = index + 1 |
| stack.append([OpenParen(-1)]) |
| lastop = '(' |
| |
| elif pattern[index] == '#': |
| # comment |
| index = index + 1 |
| end = string.find(pattern, ')', index) |
| if end == -1: |
| raise error, 'no end to comment' |
| index = end + 1 |
| # do not change lastop |
| |
| elif pattern[index] == '=': |
| raise error, ('zero-width positive lookahead ' |
| 'assertion is unsupported') |
| |
| elif pattern[index] == '!': |
| raise error, ('zero-width negative lookahead ' |
| 'assertion is unsupported') |
| |
| elif pattern[index] in 'iImMsSxX': |
| raise error, ('embedded pattern modifiers are only ' |
| 'allowed at the beginning of the pattern') |
| |
| else: |
| raise error, 'unknown extension' |
| |
| else: |
| stack.append([OpenParen(register)]) |
| register = register + 1 |
| lastop = '(' |
| |
| elif char == ')': |
| # make one expression out of everything on the stack up to |
| # the marker left by the last parenthesis |
| expr = [] |
| while (len(stack) > 0) and (stack[-1][0].name != '('): |
| expr = stack[-1] + expr |
| del stack[-1] |
| |
| if len(stack) == 0: |
| raise error, 'too many close parens' |
| |
| # remove markers left by alternation |
| expr = filter(lambda x: x.name != '|', expr) |
| |
| # clean up jumps inserted by alternation |
| need_label = 0 |
| for i in range(len(expr)): |
| if (expr[i].name == 'jump') and (expr[i].label == -1): |
| expr[i] = Jump(label) |
| need_label = 1 |
| if need_label: |
| expr.append(Label(label)) |
| label = label + 1 |
| |
| if stack[-1][0].register > 0: |
| expr = [StartMemory(stack[-1][0].register)] + \ |
| expr + \ |
| [EndMemory(stack[-1][0].register)] |
| del stack[-1] |
| stack.append(expr) |
| lastop = ')' |
| |
| elif char == '{': |
| if len(stack) == 0: |
| raise error, 'no expression to repeat' |
| end = string.find(pattern, '}', index) |
| if end == -1: |
| raise error, ('no close curly bracket to match' |
| ' open curly bracket') |
| |
| fields = map(string.strip, |
| string.split(pattern[index:end], ',')) |
| index = end + 1 |
| |
| minimal = 0 |
| if (index < len(pattern)) and (pattern[index] == '?'): |
| minimal = 1 |
| index = index + 1 |
| |
| if len(fields) == 1: |
| # {n} or {n}? (there's really no difference) |
| try: |
| count = string.atoi(fields[0]) |
| except ValueError: |
| raise error, ('count must be an integer ' |
| 'inside curly braces') |
| if count > 65535: |
| raise error, 'repeat count out of range' |
| expr = [] |
| while count > 0: |
| expr = expr + stack[-1] |
| count = count - 1 |
| del stack[-1] |
| stack.append(expr) |
| if minimal: |
| lastop = '{n}?' |
| else: |
| lastop = '{n}' |
| |
| elif len(fields) == 2: |
| # {n,} or {n,m} |
| if fields[1] == '': |
| # {n,} |
| try: |
| min = string.atoi(fields[0]) |
| except ValueError: |
| raise error, ('minimum must be an integer ' |
| 'inside curly braces') |
| if min > 65535: |
| raise error, 'minimum repeat count out of range' |
| |
| expr = [] |
| while min > 0: |
| expr = expr + stack[-1] |
| min = min - 1 |
| if minimal: |
| expr = expr + \ |
| ([Jump(label + 1), |
| Label(label)] + \ |
| stack[-1] + \ |
| [Label(label + 1), |
| FailureJump(label)]) |
| lastop = '{n,}?' |
| else: |
| expr = expr + \ |
| ([Label(label), |
| FailureJump(label + 1)] + |
| stack[-1] + |
| [StarJump(label), |
| Label(label + 1)]) |
| lastop = '{n,}' |
| |
| del stack[-1] |
| stack.append(expr) |
| label = label + 2 |
| |
| else: |
| # {n,m} |
| try: |
| min = string.atoi(fields[0]) |
| except ValueError: |
| raise error, ('minimum must be an integer ' |
| 'inside curly braces') |
| try: |
| max = string.atoi(fields[1]) |
| except ValueError: |
| raise error, ('maximum must be an integer ' |
| 'inside curly braces') |
| if min > 65535: |
| raise error, ('minumim repeat count out ' |
| 'of range') |
| if max > 65535: |
| raise error, ('maximum repeat count out ' |
| 'of range') |
| if min > max: |
| raise error, ('minimum repeat count must be ' |
| 'less than the maximum ' |
| 'repeat count') |
| expr = [] |
| while min > 0: |
| expr = expr + stack[-1] |
| min = min - 1 |
| max = max - 1 |
| if minimal: |
| while max > 0: |
| expr = expr + \ |
| [FailureJump(label), |
| Jump(label + 1), |
| Label(label)] + \ |
| stack[-1] + \ |
| [Label(label + 1)] |
| max = max - 1 |
| label = label + 2 |
| del stack[-1] |
| stack.append(expr) |
| lastop = '{n,m}?' |
| else: |
| while max > 0: |
| expr = expr + \ |
| [FailureJump(label)] + \ |
| stack[-1] |
| max = max - 1 |
| del stack[-1] |
| stack.append(expr + [Label(label)]) |
| label = label + 1 |
| lastop = '{n,m}' |
| |
| else: |
| raise error, ('there need to be one or two fields ' |
| 'in a {} expression') |
| |
| elif char == '}': |
| raise error, 'unbalanced close curly brace' |
| |
| elif char == '*': |
| # Kleene closure |
| if len(stack) == 0: |
| raise error, '* needs something to repeat' |
| |
| if lastop in ['(', '|']: |
| raise error, '* needs something to repeat' |
| |
| if lastop in repetition_operators: |
| raise error, 'nested repetition operators' |
| |
| if (index < len(pattern)) and (pattern[index] == '?'): |
| # non-greedy matching |
| expr = [Jump(label + 1), |
| Label(label)] + \ |
| stack[-1] + \ |
| [Label(label + 1), |
| FailureJump(label)] |
| index = index + 1 |
| lastop = '*?' |
| else: |
| # greedy matching |
| expr = [Label(label), |
| FailureJump(label + 1)] + \ |
| stack[-1] + \ |
| [StarJump(label), |
| Label(label + 1)] |
| lastop = '*' |
| del stack[-1] |
| stack.append(expr) |
| label = label + 2 |
| |
| elif char == '+': |
| # positive closure |
| if len(stack) == 0: |
| raise error, '+ needs something to repeat' |
| |
| if lastop in ['(', '|']: |
| raise error, '+ needs something to repeat' |
| |
| if lastop in repetition_operators: |
| raise error, 'nested repetition operators' |
| |
| if (index < len(pattern)) and (pattern[index] == '?'): |
| # non-greedy |
| expr = [Label(label)] + \ |
| stack[-1] + \ |
| [FailureJump(label)] |
| label = label + 1 |
| index = index + 1 |
| lastop = '+?' |
| |
| else: |
| # greedy |
| expr = [DummyFailureJump(label + 1), |
| Label(label), |
| FailureJump(label + 2), |
| Label(label + 1)] + \ |
| stack[-1] + \ |
| [StarJump(label), |
| Label(label + 2)] |
| label = label + 3 |
| lastop = '+' |
| |
| del stack[-1] |
| stack.append(expr) |
| |
| elif char == '?': |
| if len(stack) == 0: |
| raise error, 'need something to be optional' |
| |
| if len(stack) == 0: |
| raise error, '? needs something to repeat' |
| |
| if lastop in ['(', '|']: |
| raise error, '? needs something to repeat' |
| |
| if lastop in repetition_operators: |
| raise error, 'nested repetition operators' |
| |
| if (index < len(pattern)) and (pattern[index] == '?'): |
| # non-greedy matching |
| expr = [FailureJump(label), |
| Jump(label + 1), |
| Label(label)] + \ |
| stack[-1] + \ |
| [Label(label + 1)] |
| label = label + 2 |
| index = index + 1 |
| lastop = '??' |
| |
| else: |
| # greedy matching |
| expr = [FailureJump(label)] + \ |
| stack[-1] + \ |
| [Label(label)] |
| label = label + 1 |
| lastop = '?' |
| |
| del stack[-1] |
| stack.append(expr) |
| |
| elif char == '.': |
| if flags & DOTALL: |
| stack.append([Set(map(chr, range(256)))]) |
| else: |
| stack.append([AnyChar()]) |
| lastop = '.' |
| |
| elif char == '^': |
| if flags & MULTILINE: |
| stack.append([Bol()]) |
| else: |
| stack.append([BegBuf()]) |
| lastop = '^' |
| |
| elif char == '$': |
| if flags & MULTILINE: |
| stack.append([Eol()]) |
| else: |
| stack.append([EndBuf()]) |
| lastop = '$' |
| |
| elif char == '#': |
| if flags & VERBOSE: |
| # comment |
| index = index + 1 |
| end = string.find(pattern, '\n', index) |
| if end == -1: |
| index = len(pattern) |
| else: |
| index = end + 1 |
| # do not change lastop |
| else: |
| stack.append([Exact(char)]) |
| lastop = '#' |
| |
| elif char in string.whitespace: |
| if not (flags & VERBOSE): |
| stack.append([Exact(char)]) |
| lastop = char |
| |
| elif char == '[': |
| # compile character class |
| |
| if index >= len(pattern): |
| raise error, 'unclosed character class' |
| |
| negate = 0 |
| last = '' |
| set = [] |
| |
| if pattern[index] == '^': |
| negate = 1 |
| index = index + 1 |
| if index >= len(pattern): |
| raise error, 'unclosed character class' |
| |
| if pattern[index] == ']': |
| set.append(']') |
| index = index + 1 |
| if index >= len(pattern): |
| raise error, 'unclosed character class' |
| |
| elif pattern[index] == '-': |
| set.append('-') |
| index = index + 1 |
| if index >= len(pattern): |
| raise error, 'unclosed character class' |
| |
| while (index < len(pattern)) and (pattern[index] != ']'): |
| next = pattern[index] |
| index = index + 1 |
| if next == '-': |
| if index >= len(pattern): |
| raise error, 'incomplete range in character class' |
| |
| elif pattern[index] == ']': |
| set.append('-') |
| |
| else: |
| if last == '': |
| raise error, ('improper use of range in ' |
| 'character class') |
| |
| start = last |
| |
| if pattern[index] == '\\': |
| escape_type, |
| value, |
| index = expand_escape(pattern, |
| index + 1, |
| CHARCLASS) |
| |
| if escape_type == CHAR: |
| end = value |
| |
| else: |
| raise error, ('illegal escape in character ' |
| 'class range') |
| else: |
| end = pattern[index] |
| index = index + 1 |
| |
| if start > end: |
| raise error, ('range arguments out of order ' |
| 'in character class') |
| |
| for char in map(chr, range(ord(start), ord(end) + 1)): |
| if char not in set: |
| set.append(char) |
| |
| last = '' |
| |
| elif next == '\\': |
| # expand syntax meta-characters and add to set |
| if index >= len(pattern): |
| raise error, 'incomplete set' |
| |
| escape_type, value, index = expand_escape(pattern, |
| index, |
| CHARCLASS) |
| |
| if escape_type == CHAR: |
| set.append(value) |
| last = value |
| |
| elif escape_type == SET: |
| for char in value: |
| if char not in set: |
| set.append(char) |
| last = '' |
| |
| else: |
| raise error, 'illegal escape type in character class' |
| |
| else: |
| if next not in set: |
| set.append(next) |
| last = next |
| |
| if (index >= len(pattern)) or ( pattern[index] != ']'): |
| raise error, 'incomplete set' |
| |
| index = index + 1 |
| |
| if negate: |
| notset = [] |
| for char in map(chr, range(256)): |
| if char not in set: |
| notset.append(char) |
| if len(notset) == 0: |
| raise error, 'empty negated set' |
| stack.append([Set(notset)]) |
| else: |
| if len(set) == 0: |
| raise error, 'empty set' |
| stack.append([Set(set)]) |
| |
| lastop = '[]' |
| |
| else: |
| stack.append([Exact(char)]) |
| lastop = char |
| |
| code = [] |
| while len(stack) > 0: |
| if stack[-1][0].name == '(': |
| raise error, 'too many open parens' |
| code = stack[-1] + code |
| del stack[-1] |
| if len(code) == 0: |
| raise error, 'no code generated' |
| code = filter(lambda x: x.name != '|', code) |
| need_label = 0 |
| for i in range(len(code)): |
| if (code[i].name == 'jump') and (code[i].label == -1): |
| code[i] = Jump(label) |
| need_label = 1 |
| if need_label: |
| code.append(Label(label)) |
| label = label + 1 |
| code.append(End()) |
| return RegexObject(pattern, flags, code, register, groupindex) |