Blame - Lib/dos-8x3/sre_pars.py - platform/external/python/cpython3

blob: 8b68ea1f82258a07a8ce69040658a57e75fdbd67 [file] [log] [blame]

Guido van Rossum	aad6761	2000-05-08 17:31:04 +0000	[diff] [blame]	1	#
				2	# Secret Labs' Regular Expression Engine
				3	# $Id$
				4	#
				5	# convert re-style regular expression to SRE template. the current
				6	# implementation is somewhat incomplete, and not very fast. should
				7	# definitely be rewritten before Python 1.6 goes beta.
				8	#
				9	# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
				10	#
				11	# This code can only be used for 1.6 alpha testing. All other use
				12	# require explicit permission from Secret Labs AB.
				13	#
				14	# Portions of this engine have been developed in cooperation with
				15	# CNRI. Hewlett-Packard provided funding for 1.6 integration and
				16	# other compatibility work.
				17	#
				18
				19	# FIXME: comments marked with the FIXME tag are open issues. all such
				20	# issues should be closed before the final beta.
				21
				22	import string, sys
				23
				24	from sre_constants import *
				25
				26	SPECIAL_CHARS = ".\\[{()*+?^$\|"
				27	REPEAT_CHARS = "*+?{"
				28
				29	# FIXME: string in tuple tests may explode with if char is unicode :-(
				30	DIGITS = tuple(string.digits)
				31
				32	OCTDIGITS = tuple("01234567")
				33	HEXDIGITS = tuple("0123456789abcdefABCDEF")
				34
				35	ESCAPES = {
				36	"\\a": (LITERAL, chr(7)),
				37	"\\b": (LITERAL, chr(8)),
				38	"\\f": (LITERAL, chr(12)),
				39	"\\n": (LITERAL, chr(10)),
				40	"\\r": (LITERAL, chr(13)),
				41	"\\t": (LITERAL, chr(9)),
				42	"\\v": (LITERAL, chr(11))
				43	}
				44
				45	CATEGORIES = {
				46	"\\A": (AT, AT_BEGINNING), # start of string
				47	"\\b": (AT, AT_BOUNDARY),
				48	"\\B": (AT, AT_NON_BOUNDARY),
				49	"\\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
				50	"\\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
				51	"\\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
				52	"\\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
				53	"\\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
				54	"\\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
				55	"\\Z": (AT, AT_END), # end of string
				56	}
				57
				58	class Pattern:
				59	# FIXME: <fl> rename class, and store flags in here too!
				60	def __init__(self):
				61	self.flags = []
				62	self.groups = 1
				63	self.groupdict = {}
				64	def getgroup(self, name=None):
				65	gid = self.groups
				66	self.groups = gid + 1
				67	if name:
				68	self.groupdict[name] = gid
				69	return gid
				70	def setflag(self, flag):
				71	if flag in self.flags:
				72	self.flags.append(flag)
				73
				74	class SubPattern:
				75	# a subpattern, in intermediate form
				76	def __init__(self, pattern, data=None):
				77	self.pattern = pattern
				78	if not data:
				79	data = []
				80	self.data = data
				81	self.flags = []
				82	self.width = None
				83	def __repr__(self):
				84	return repr(self.data)
				85	def __len__(self):
				86	return len(self.data)
				87	def __delitem__(self, index):
				88	del self.data[index]
				89	def __getitem__(self, index):
				90	return self.data[index]
				91	def __setitem__(self, index, code):
				92	self.data[index] = code
				93	def __getslice__(self, start, stop):
				94	return SubPattern(self.pattern, self.data[start:stop])
				95	def insert(self, index, code):
				96	self.data.insert(index, code)
				97	def append(self, code):
				98	self.data.append(code)
				99	def getwidth(self):
				100	# determine the width (min, max) for this subpattern
				101	if self.width:
				102	return self.width
				103	lo = hi = 0L
				104	for op, av in self.data:
				105	if op is BRANCH:
				106	l = sys.maxint
				107	h = 0
				108	for av in av[1]:
				109	i, j = av.getwidth()
				110	l = min(l, i)
				111	h = min(h, j)
				112	lo = lo + i
				113	hi = hi + j
				114	elif op is CALL:
				115	i, j = av.getwidth()
				116	lo = lo + i
				117	hi = hi + j
				118	elif op is SUBPATTERN:
				119	i, j = av[1].getwidth()
				120	lo = lo + i
				121	hi = hi + j
				122	elif op in (MIN_REPEAT, MAX_REPEAT):
				123	i, j = av[2].getwidth()
				124	lo = lo + i * av[0]
				125	hi = hi + j * av[1]
				126	elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
				127	lo = lo + 1
				128	hi = hi + 1
				129	elif op == SUCCESS:
				130	break
				131	self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
				132	return self.width
				133	def set(self, flag):
				134	if not flag in self.flags:
				135	self.flags.append(flag)
				136	def reset(self, flag):
				137	if flag in self.flags:
				138	self.flags.remove(flag)
				139
				140	class Tokenizer:
				141	def __init__(self, string):
				142	self.string = list(string)
				143	self.next = self.__next()
				144	def __next(self):
				145	if not self.string:
				146	return None
				147	char = self.string[0]
				148	if char[0] == "\\":
				149	try:
				150	c = self.string[1]
				151	except IndexError:
				152	raise SyntaxError, "bogus escape"
				153	char = char + c
				154	try:
				155	if c == "x":
				156	# hexadecimal constant
				157	for i in xrange(2, sys.maxint):
				158	c = self.string[i]
				159	if str(c) not in HEXDIGITS:
				160	break
				161	char = char + c
				162	elif str(c) in DIGITS:
				163	# decimal (or octal) number
				164	for i in xrange(2, sys.maxint):
				165	c = self.string[i]
				166	# FIXME: if larger than current number of
				167	# groups, interpret as an octal number
				168	if str(c) not in DIGITS:
				169	break
				170	char = char + c
				171	except IndexError:
				172	pass # use what we've got this far
				173	del self.string[0:len(char)]
				174	return char
				175	def match(self, char):
				176	if char == self.next:
				177	self.next = self.__next()
				178	return 1
				179	return 0
				180	def match_set(self, set):
				181	if self.next and self.next in set:
				182	self.next = self.__next()
				183	return 1
				184	return 0
				185	def get(self):
				186	this = self.next
				187	self.next = self.__next()
				188	return this
				189
				190	def _fixescape(escape, character_class=0):
				191	# convert escape to (type, value)
				192	if character_class:
				193	# inside a character class, we'll look in the character
				194	# escapes dictionary first
				195	code = ESCAPES.get(escape)
				196	if code:
				197	return code
				198	code = CATEGORIES.get(escape)
				199	else:
				200	code = CATEGORIES.get(escape)
				201	if code:
				202	return code
				203	code = ESCAPES.get(escape)
				204	if code:
				205	return code
				206	if not character_class:
				207	try:
				208	group = int(escape[1:])
				209	# FIXME: only valid if group <= current number of groups
				210	return GROUP, group
				211	except ValueError:
				212	pass
				213	try:
				214	if escape[1:2] == "x":
				215	escape = escape[2:]
				216	return LITERAL, chr(int(escape[-2:], 16) & 0xff)
				217	elif str(escape[1:2]) in DIGITS:
				218	return LITERAL, chr(int(escape[1:], 8) & 0xff)
				219	elif len(escape) == 2:
				220	return LITERAL, escape[1]
				221	except ValueError:
				222	pass
				223	raise SyntaxError, "bogus escape: %s" % repr(escape)
				224
				225	def _branch(subpattern, items):
				226
				227	# form a branch operator from a set of items (FIXME: move this
				228	# optimization to the compiler module!)
				229
				230	# check if all items share a common prefix
				231	while 1:
				232	prefix = None
				233	for item in items:
				234	if not item:
				235	break
				236	if prefix is None:
				237	prefix = item[0]
				238	elif item[0] != prefix:
				239	break
				240	else:
				241	# all subitems start with a common "prefix".
				242	# move it out of the branch
				243	for item in items:
				244	del item[0]
				245	subpattern.append(prefix)
				246	continue # check next one
				247	break
				248
				249	# check if the branch can be replaced by a character set
				250	for item in items:
				251	if len(item) != 1 or item[0][0] != LITERAL:
				252	break
				253	else:
				254	# we can store this as a character set instead of a
				255	# branch (FIXME: use a range if possible)
				256	set = []
				257	for item in items:
				258	set.append(item[0])
				259	subpattern.append((IN, set))
				260	return
				261
				262	subpattern.append((BRANCH, (None, items)))
				263
				264	def _parse(source, pattern, flags=()):
				265
				266	# parse regular expression pattern into an operator list.
				267
				268	subpattern = SubPattern(pattern)
				269
				270	this = None
				271
				272	while 1:
				273
				274	if str(source.next) in ("\|", ")"):
				275	break # end of subpattern
				276	this = source.get()
				277	if this is None:
				278	break # end of pattern
				279
				280	if this and this[0] not in SPECIAL_CHARS:
				281	subpattern.append((LITERAL, this))
				282
				283	elif this == "[":
				284	# character set
				285	set = []
				286	## if source.match(":"):
				287	## pass # handle character classes
				288	if source.match("^"):
				289	set.append((NEGATE, None))
				290	# check remaining characters
				291	start = set[:]
				292	while 1:
				293	this = source.get()
				294	if this == "]" and set != start:
				295	break
				296	elif this and this[0] == "\\":
				297	code1 = _fixescape(this, 1)
				298	elif this:
				299	code1 = LITERAL, this
				300	else:
				301	raise SyntaxError, "unexpected end of regular expression"
				302	if source.match("-"):
				303	# potential range
				304	this = source.get()
				305	if this == "]":
				306	set.append(code1)
				307	set.append((LITERAL, "-"))
				308	break
				309	else:
				310	if this[0] == "\\":
				311	code2 = _fixescape(this, 1)
				312	else:
				313	code2 = LITERAL, this
				314	if code1[0] != LITERAL or code2[0] != LITERAL:
				315	raise SyntaxError, "illegal range"
				316	if len(code1[1]) != 1 or len(code2[1]) != 1:
				317	raise SyntaxError, "illegal range"
				318	set.append((RANGE, (code1[1], code2[1])))
				319	else:
				320	if code1[0] is IN:
				321	code1 = code1[1][0]
				322	set.append(code1)
				323
				324	# FIXME: <fl> move set optimization to support function
				325	if len(set)==1 and set[0][0] is LITERAL:
				326	subpattern.append(set[0]) # optimization
				327	elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
				328	subpattern.append((NOT_LITERAL, set[1][1])) # optimization
				329	else:
				330	# FIXME: <fl> add charmap optimization
				331	subpattern.append((IN, set))
				332
				333	elif this and this[0] in REPEAT_CHARS:
				334	# repeat previous item
				335	if this == "?":
				336	min, max = 0, 1
				337	elif this == "*":
				338	min, max = 0, sys.maxint
				339	elif this == "+":
				340	min, max = 1, sys.maxint
				341	elif this == "{":
				342	min, max = 0, sys.maxint
				343	lo = hi = ""
				344	while str(source.next) in DIGITS:
				345	lo = lo + source.get()
				346	if source.match(","):
				347	while str(source.next) in DIGITS:
				348	hi = hi + source.get()
				349	else:
				350	hi = lo
				351	if not source.match("}"):
				352	raise SyntaxError, "bogus range"
				353	if lo:
				354	min = int(lo)
				355	if hi:
				356	max = int(hi)
				357	# FIXME: <fl> check that hi >= lo!
				358	else:
				359	raise SyntaxError, "not supported"
				360	# figure out which item to repeat
				361	# FIXME: should back up to the right mark, right?
				362	if subpattern:
				363	index = len(subpattern)-1
				364	while subpattern[index][0] is MARK:
				365	index = index - 1
				366	item = subpattern[index:index+1]
				367	else:
				368	raise SyntaxError, "nothing to repeat"
				369	if source.match("?"):
				370	subpattern[index] = (MIN_REPEAT, (min, max, item))
				371	else:
				372	subpattern[index] = (MAX_REPEAT, (min, max, item))
				373	elif this == ".":
				374	subpattern.append((ANY, None))
				375	elif this == "(":
				376	group = 1
				377	name = None
				378	if source.match("?"):
				379	group = 0
				380	# options
				381	if source.match("P"):
				382	# named group: skip forward to end of name
				383	if source.match("<"):
				384	name = ""
				385	while 1:
				386	char = source.get()
				387	if char is None or char == ">":
				388	break
				389	name = name + char
				390	group = 1
				391	elif source.match(":"):
				392	# non-capturing group
				393	group = 2
				394	elif source.match_set("iI"):
				395	pattern.setflag("i")
				396	elif source.match_set("lL"):
				397	pattern.setflag("l")
				398	elif source.match_set("mM"):
				399	pattern.setflag("m")
				400	elif source.match_set("sS"):
				401	pattern.setflag("s")
				402	elif source.match_set("xX"):
				403	pattern.setflag("x")
				404	if group:
				405	# parse group contents
				406	b = []
				407	if group == 2:
				408	# anonymous group
				409	group = None
				410	else:
				411	group = pattern.getgroup(name)
				412	if group:
				413	subpattern.append((MARK, (group-1)*2))
				414	while 1:
				415	p = _parse(source, pattern, flags)
				416	if source.match(")"):
				417	if b:
				418	b.append(p)
				419	_branch(subpattern, b)
				420	else:
				421	subpattern.append((SUBPATTERN, (group, p)))
				422	break
				423	elif source.match("\|"):
				424	b.append(p)
				425	else:
				426	raise SyntaxError, "group not properly closed"
				427	if group:
				428	subpattern.append((MARK, (group-1)*2+1))
				429	else:
				430	# FIXME: should this really be a while loop?
				431	while 1:
				432	char = source.get()
				433	if char is None or char == ")":
				434	break
				435
				436	elif this == "^":
				437	subpattern.append((AT, AT_BEGINNING))
				438
				439	elif this == "$":
				440	subpattern.append((AT, AT_END))
				441
				442	elif this and this[0] == "\\":
				443	code =_fixescape(this)
				444	subpattern.append(code)
				445
				446	else:
				447	raise SyntaxError, "parser error"
				448
				449	return subpattern
				450
				451	def parse(source, flags=()):
				452	s = Tokenizer(source)
				453	g = Pattern()
				454	b = []
				455	while 1:
				456	p = _parse(s, g, flags)
				457	tail = s.get()
				458	if tail == "\|":
				459	b.append(p)
				460	elif tail == ")":
				461	raise SyntaxError, "unbalanced parenthesis"
				462	elif tail is None:
				463	if b:
				464	b.append(p)
				465	p = SubPattern(g)
				466	_branch(p, b)
				467	break
				468	else:
				469	raise SyntaxError, "bogus characters at end of regular expression"
				470	return p
				471
				472	if __name__ == "__main__":
				473	from pprint import pprint
				474	from testpatterns import PATTERNS
				475	a = b = c = 0
				476	for pattern, flags in PATTERNS:
				477	if flags:
				478	continue
				479	print "-"*68
				480	try:
				481	p = parse(pattern)
				482	print repr(pattern), "->"
				483	pprint(p.data)
				484	import sre_compile
				485	try:
				486	code = sre_compile.compile(p)
				487	c = c + 1
				488	except:
				489	pass
				490	a = a + 1
				491	except SyntaxError, v:
				492	print "**", repr(pattern), v
				493	b = b + 1
				494	print "-"*68
				495	print a, "of", b, "patterns successfully parsed"
				496	print c, "of", b, "patterns successfully compiled"
				497