| # | 
 | # Secret Labs' Regular Expression Engine | 
 | # $Id$ | 
 | # | 
 | # convert re-style regular expression to SRE template.  the current | 
 | # implementation is somewhat incomplete, and not very fast.  should | 
 | # definitely be rewritten before Python 1.6 goes beta. | 
 | # | 
 | # Copyright (c) 1998-2000 by Secret Labs AB.  All rights reserved. | 
 | # | 
 | # This code can only be used for 1.6 alpha testing.  All other use | 
 | # require explicit permission from Secret Labs AB. | 
 | # | 
 | # Portions of this engine have been developed in cooperation with | 
 | # CNRI.  Hewlett-Packard provided funding for 1.6 integration and | 
 | # other compatibility work. | 
 | # | 
 |  | 
 | # FIXME: comments marked with the FIXME tag are open issues.  all such | 
 | # issues should be closed before the final beta. | 
 |  | 
 | import string, sys | 
 |  | 
 | from sre_constants import * | 
 |  | 
 | SPECIAL_CHARS = ".\\[{()*+?^$|" | 
 | REPEAT_CHARS  = "*+?{" | 
 |  | 
 | # FIXME: string in tuple tests may explode with if char is unicode :-( | 
 | DIGITS = tuple(string.digits) | 
 |  | 
 | OCTDIGITS = tuple("01234567") | 
 | HEXDIGITS = tuple("0123456789abcdefABCDEF") | 
 |  | 
 | ESCAPES = { | 
 |     "\\a": (LITERAL, chr(7)), | 
 |     "\\b": (LITERAL, chr(8)), | 
 |     "\\f": (LITERAL, chr(12)), | 
 |     "\\n": (LITERAL, chr(10)), | 
 |     "\\r": (LITERAL, chr(13)), | 
 |     "\\t": (LITERAL, chr(9)), | 
 |     "\\v": (LITERAL, chr(11)) | 
 | } | 
 |  | 
 | CATEGORIES = { | 
 |     "\\A": (AT, AT_BEGINNING), # start of string | 
 |     "\\b": (AT, AT_BOUNDARY), | 
 |     "\\B": (AT, AT_NON_BOUNDARY), | 
 |     "\\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), | 
 |     "\\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), | 
 |     "\\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), | 
 |     "\\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), | 
 |     "\\w": (IN, [(CATEGORY, CATEGORY_WORD)]), | 
 |     "\\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), | 
 |     "\\Z": (AT, AT_END), # end of string | 
 | } | 
 |  | 
 | class Pattern: | 
 |     # FIXME: <fl> rename class, and store flags in here too! | 
 |     def __init__(self): | 
 | 	self.flags = [] | 
 | 	self.groups = 1 | 
 | 	self.groupdict = {} | 
 |     def getgroup(self, name=None): | 
 | 	gid = self.groups | 
 | 	self.groups = gid + 1 | 
 | 	if name: | 
 | 	    self.groupdict[name] = gid | 
 | 	return gid | 
 |     def setflag(self, flag): | 
 | 	if flag in self.flags: | 
 | 	    self.flags.append(flag) | 
 |  | 
 | class SubPattern: | 
 |     # a subpattern, in intermediate form | 
 |     def __init__(self, pattern, data=None): | 
 | 	self.pattern = pattern | 
 | 	if not data: | 
 | 	    data = [] | 
 | 	self.data = data | 
 | 	self.flags = [] | 
 | 	self.width = None | 
 |     def __repr__(self): | 
 | 	return repr(self.data) | 
 |     def __len__(self): | 
 | 	return len(self.data) | 
 |     def __delitem__(self, index): | 
 | 	del self.data[index] | 
 |     def __getitem__(self, index): | 
 | 	return self.data[index] | 
 |     def __setitem__(self, index, code): | 
 | 	self.data[index] = code | 
 |     def __getslice__(self, start, stop): | 
 | 	return SubPattern(self.pattern, self.data[start:stop]) | 
 |     def insert(self, index, code): | 
 | 	self.data.insert(index, code) | 
 |     def append(self, code): | 
 | 	self.data.append(code) | 
 |     def getwidth(self): | 
 | 	# determine the width (min, max) for this subpattern | 
 | 	if self.width: | 
 | 	    return self.width | 
 | 	lo = hi = 0L | 
 | 	for op, av in self.data: | 
 | 	    if op is BRANCH: | 
 | 		l = sys.maxint | 
 | 		h = 0 | 
 | 		for av in av[1]: | 
 | 		    i, j = av.getwidth() | 
 | 		    l = min(l, i) | 
 | 		    h = min(h, j) | 
 | 		lo = lo + i | 
 | 		hi = hi + j | 
 | 	    elif op is CALL: | 
 | 		i, j = av.getwidth() | 
 | 		lo = lo + i | 
 | 		hi = hi + j | 
 | 	    elif op is SUBPATTERN: | 
 | 		i, j = av[1].getwidth() | 
 | 		lo = lo + i | 
 | 		hi = hi + j | 
 | 	    elif op in (MIN_REPEAT, MAX_REPEAT): | 
 | 		i, j = av[2].getwidth() | 
 | 		lo = lo + i * av[0] | 
 | 		hi = hi + j * av[1] | 
 | 	    elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY): | 
 | 		lo = lo + 1 | 
 | 		hi = hi + 1 | 
 | 	    elif op == SUCCESS: | 
 | 		break | 
 | 	self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint)) | 
 | 	return self.width | 
 |     def set(self, flag): | 
 | 	if not flag in self.flags: | 
 | 	    self.flags.append(flag) | 
 |     def reset(self, flag): | 
 | 	if flag in self.flags: | 
 | 	    self.flags.remove(flag) | 
 |  | 
 | class Tokenizer: | 
 |     def __init__(self, string): | 
 | 	self.string = list(string) | 
 | 	self.next = self.__next() | 
 |     def __next(self): | 
 | 	if not self.string: | 
 | 	    return None | 
 | 	char = self.string[0] | 
 | 	if char[0] == "\\": | 
 | 	    try: | 
 | 		c = self.string[1] | 
 | 	    except IndexError: | 
 | 		raise SyntaxError, "bogus escape" | 
 | 	    char = char + c | 
 | 	    try: | 
 | 		if c == "x": | 
 | 		    # hexadecimal constant | 
 | 		    for i in xrange(2, sys.maxint): | 
 | 			c = self.string[i] | 
 | 			if str(c) not in HEXDIGITS: | 
 | 			    break | 
 | 			char = char + c | 
 | 		elif str(c) in DIGITS: | 
 | 		    # decimal (or octal) number | 
 | 		    for i in xrange(2, sys.maxint): | 
 | 			c = self.string[i] | 
 | 			# FIXME: if larger than current number of | 
 | 			# groups, interpret as an octal number  | 
 | 			if str(c) not in DIGITS: | 
 | 			    break | 
 | 			char = char + c | 
 | 	    except IndexError: | 
 | 		pass # use what we've got this far | 
 | 	del self.string[0:len(char)] | 
 | 	return char | 
 |     def match(self, char): | 
 | 	if char == self.next: | 
 | 	    self.next = self.__next() | 
 | 	    return 1 | 
 | 	return 0 | 
 |     def match_set(self, set): | 
 | 	if self.next and self.next in set: | 
 | 	    self.next = self.__next() | 
 | 	    return 1 | 
 | 	return 0 | 
 |     def get(self): | 
 | 	this = self.next | 
 | 	self.next = self.__next() | 
 | 	return this | 
 |  | 
 | def _fixescape(escape, character_class=0): | 
 |     # convert escape to (type, value) | 
 |     if character_class: | 
 | 	# inside a character class, we'll look in the character | 
 | 	# escapes dictionary first | 
 | 	code = ESCAPES.get(escape) | 
 | 	if code: | 
 | 	    return code | 
 | 	code = CATEGORIES.get(escape) | 
 |     else: | 
 | 	code = CATEGORIES.get(escape) | 
 | 	if code: | 
 | 	    return code | 
 | 	code = ESCAPES.get(escape) | 
 |     if code: | 
 | 	return code | 
 |     if not character_class: | 
 | 	try: | 
 | 	    group = int(escape[1:]) | 
 | 	    # FIXME: only valid if group <= current number of groups | 
 | 	    return GROUP, group | 
 | 	except ValueError: | 
 | 	    pass | 
 |     try: | 
 | 	if escape[1:2] == "x": | 
 | 	    escape = escape[2:] | 
 | 	    return LITERAL, chr(int(escape[-2:], 16) & 0xff) | 
 | 	elif str(escape[1:2]) in DIGITS: | 
 | 	    return LITERAL, chr(int(escape[1:], 8) & 0xff) | 
 | 	elif len(escape) == 2: | 
 | 	    return LITERAL, escape[1] | 
 |     except ValueError: | 
 | 	pass | 
 |     raise SyntaxError, "bogus escape: %s" % repr(escape) | 
 |  | 
 | def _branch(subpattern, items): | 
 |  | 
 |     # form a branch operator from a set of items (FIXME: move this | 
 |     # optimization to the compiler module!) | 
 |  | 
 |     # check if all items share a common prefix | 
 |     while 1: | 
 | 	prefix = None | 
 | 	for item in items: | 
 | 	    if not item: | 
 | 		break | 
 | 	    if prefix is None: | 
 | 		prefix = item[0] | 
 | 	    elif item[0] != prefix: | 
 | 		break | 
 | 	else: | 
 | 	    # all subitems start with a common "prefix". | 
 | 	    # move it out of the branch | 
 | 	    for item in items: | 
 | 		del item[0] | 
 | 	    subpattern.append(prefix) | 
 | 	    continue # check next one | 
 | 	break | 
 |  | 
 |     # check if the branch can be replaced by a character set | 
 |     for item in items: | 
 | 	if len(item) != 1 or item[0][0] != LITERAL: | 
 | 	    break | 
 |     else: | 
 | 	# we can store this as a character set instead of a | 
 | 	# branch (FIXME: use a range if possible) | 
 | 	set = [] | 
 | 	for item in items: | 
 | 	    set.append(item[0]) | 
 | 	subpattern.append((IN, set)) | 
 | 	return | 
 |  | 
 |     subpattern.append((BRANCH, (None, items))) | 
 |  | 
 | def _parse(source, pattern, flags=()): | 
 |  | 
 |     # parse regular expression pattern into an operator list. | 
 |  | 
 |     subpattern = SubPattern(pattern) | 
 |  | 
 |     this = None | 
 |  | 
 |     while 1: | 
 |  | 
 | 	if str(source.next) in ("|", ")"): | 
 | 	    break # end of subpattern | 
 | 	this = source.get() | 
 | 	if this is None: | 
 | 	    break # end of pattern | 
 |  | 
 | 	if this and this[0] not in SPECIAL_CHARS: | 
 | 	    subpattern.append((LITERAL, this)) | 
 |  | 
 | 	elif this == "[": | 
 | 	    # character set | 
 | 	    set = [] | 
 | ## 	    if source.match(":"): | 
 | ## 		pass # handle character classes | 
 | 	    if source.match("^"): | 
 | 		set.append((NEGATE, None)) | 
 | 	    # check remaining characters | 
 | 	    start = set[:] | 
 | 	    while 1: | 
 | 		this = source.get() | 
 | 		if this == "]" and set != start: | 
 | 		    break | 
 | 		elif this and this[0] == "\\": | 
 | 		    code1 = _fixescape(this, 1) | 
 | 		elif this: | 
 | 		    code1 = LITERAL, this | 
 | 		else: | 
 | 		    raise SyntaxError, "unexpected end of regular expression" | 
 | 		if source.match("-"): | 
 | 		    # potential range | 
 | 		    this = source.get() | 
 | 		    if this == "]": | 
 | 			set.append(code1) | 
 | 			set.append((LITERAL, "-")) | 
 | 			break | 
 | 		    else: | 
 | 			if this[0] == "\\": | 
 | 			    code2 = _fixescape(this, 1) | 
 | 			else: | 
 | 			    code2 = LITERAL, this | 
 | 			if code1[0] != LITERAL or code2[0] != LITERAL: | 
 | 			    raise SyntaxError, "illegal range" | 
 | 			if len(code1[1]) != 1 or len(code2[1]) != 1: | 
 | 			    raise SyntaxError, "illegal range" | 
 | 			set.append((RANGE, (code1[1], code2[1]))) | 
 | 		else: | 
 | 		    if code1[0] is IN: | 
 | 			code1 = code1[1][0] | 
 | 		    set.append(code1) | 
 |  | 
 | 	    # FIXME: <fl> move set optimization to support function | 
 | 	    if len(set)==1 and set[0][0] is LITERAL: | 
 | 		subpattern.append(set[0]) # optimization | 
 | 	    elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: | 
 | 		subpattern.append((NOT_LITERAL, set[1][1])) # optimization | 
 | 	    else: | 
 | 		# FIXME: <fl> add charmap optimization | 
 | 		subpattern.append((IN, set)) | 
 |  | 
 | 	elif this and this[0] in REPEAT_CHARS: | 
 | 	    # repeat previous item | 
 | 	    if this == "?": | 
 | 		min, max = 0, 1 | 
 | 	    elif this == "*": | 
 | 		min, max = 0, sys.maxint | 
 | 	    elif this == "+": | 
 | 		min, max = 1, sys.maxint | 
 | 	    elif this == "{": | 
 | 		min, max = 0, sys.maxint | 
 | 		lo = hi = "" | 
 | 		while str(source.next) in DIGITS: | 
 | 		    lo = lo + source.get() | 
 | 		if source.match(","): | 
 | 		    while str(source.next) in DIGITS: | 
 | 			hi = hi + source.get() | 
 | 		else: | 
 | 		    hi = lo | 
 | 		if not source.match("}"): | 
 | 		    raise SyntaxError, "bogus range" | 
 | 		if lo: | 
 | 		    min = int(lo) | 
 | 		if hi: | 
 | 		    max = int(hi) | 
 | 		# FIXME: <fl> check that hi >= lo! | 
 | 	    else: | 
 | 		raise SyntaxError, "not supported" | 
 | 	    # figure out which item to repeat | 
 | 	    # FIXME: should back up to the right mark, right? | 
 | 	    if subpattern: | 
 | 		index = len(subpattern)-1 | 
 | 		while subpattern[index][0] is MARK: | 
 | 		    index = index - 1 | 
 | 		item = subpattern[index:index+1] | 
 | 	    else: | 
 | 		raise SyntaxError, "nothing to repeat" | 
 | 	    if source.match("?"): | 
 | 		subpattern[index] = (MIN_REPEAT, (min, max, item)) | 
 | 	    else: | 
 | 		subpattern[index] = (MAX_REPEAT, (min, max, item)) | 
 | 	elif this == ".": | 
 | 	    subpattern.append((ANY, None)) | 
 | 	elif this == "(": | 
 | 	    group = 1 | 
 | 	    name = None | 
 | 	    if source.match("?"): | 
 | 		group = 0 | 
 | 		# options | 
 | 		if source.match("P"): | 
 | 		    # named group: skip forward to end of name | 
 | 		    if source.match("<"): | 
 | 			name = "" | 
 | 			while 1: | 
 | 			    char = source.get() | 
 | 			    if char is None or char == ">": | 
 | 				break | 
 | 			    name = name + char | 
 | 			group = 1 | 
 | 		elif source.match(":"): | 
 | 		    # non-capturing group | 
 | 		    group = 2 | 
 | 		elif source.match_set("iI"): | 
 | 		    pattern.setflag("i") | 
 | 		elif source.match_set("lL"): | 
 | 		    pattern.setflag("l") | 
 | 		elif source.match_set("mM"): | 
 | 		    pattern.setflag("m") | 
 | 		elif source.match_set("sS"): | 
 | 		    pattern.setflag("s") | 
 | 		elif source.match_set("xX"): | 
 | 		    pattern.setflag("x") | 
 | 	    if group: | 
 | 		# parse group contents | 
 | 		b = [] | 
 | 		if group == 2: | 
 | 		    # anonymous group | 
 | 		    group = None | 
 | 		else: | 
 | 		    group = pattern.getgroup(name) | 
 |  		if group: | 
 |  		    subpattern.append((MARK, (group-1)*2)) | 
 | 		while 1: | 
 | 		    p = _parse(source, pattern, flags) | 
 | 		    if source.match(")"): | 
 | 			if b: | 
 | 			    b.append(p) | 
 | 			    _branch(subpattern, b) | 
 | 			else: | 
 | 			    subpattern.append((SUBPATTERN, (group, p))) | 
 | 			break | 
 | 		    elif source.match("|"): | 
 | 			b.append(p) | 
 | 		    else: | 
 | 			raise SyntaxError, "group not properly closed" | 
 |  		if group: | 
 |  		    subpattern.append((MARK, (group-1)*2+1)) | 
 | 	    else: | 
 | 		# FIXME: should this really be a while loop? | 
 | 		while 1: | 
 | 		    char = source.get() | 
 | 		    if char is None or char == ")": | 
 | 			break | 
 |  | 
 | 	elif this == "^": | 
 | 	    subpattern.append((AT, AT_BEGINNING)) | 
 |  | 
 | 	elif this == "$": | 
 | 	    subpattern.append((AT, AT_END)) | 
 |  | 
 | 	elif this and this[0] == "\\": | 
 | 	    code =_fixescape(this) | 
 | 	    subpattern.append(code) | 
 |  | 
 | 	else: | 
 | 	    raise SyntaxError, "parser error" | 
 |  | 
 |     return subpattern | 
 |  | 
 | def parse(source, flags=()): | 
 |     s = Tokenizer(source) | 
 |     g = Pattern() | 
 |     b = [] | 
 |     while 1: | 
 | 	p = _parse(s, g, flags) | 
 | 	tail = s.get() | 
 | 	if tail == "|": | 
 | 	    b.append(p) | 
 | 	elif tail == ")": | 
 | 	    raise SyntaxError, "unbalanced parenthesis" | 
 | 	elif tail is None: | 
 | 	    if b: | 
 | 		b.append(p) | 
 | 		p = SubPattern(g) | 
 | 		_branch(p, b) | 
 | 	    break | 
 | 	else: | 
 | 	    raise SyntaxError, "bogus characters at end of regular expression" | 
 |     return p | 
 |  | 
 | if __name__ == "__main__": | 
 |     from pprint import pprint | 
 |     from testpatterns import PATTERNS | 
 |     a = b = c = 0 | 
 |     for pattern, flags in PATTERNS: | 
 | 	if flags: | 
 | 	    continue | 
 | 	print "-"*68 | 
 | 	try: | 
 | 	    p = parse(pattern) | 
 | 	    print repr(pattern), "->" | 
 | 	    pprint(p.data) | 
 | 	    import sre_compile | 
 | 	    try: | 
 | 		code = sre_compile.compile(p) | 
 | 		c = c + 1 | 
 | 	    except: | 
 | 		pass | 
 | 	    a = a + 1 | 
 | 	except SyntaxError, v: | 
 | 	    print "**", repr(pattern), v | 
 | 	b = b + 1 | 
 |     print "-"*68 | 
 |     print a, "of", b, "patterns successfully parsed" | 
 |     print c, "of", b, "patterns successfully compiled" | 
 |  |