Blame - Lib/re1.py - platform/external/python/cpython2

blob: 6c24797ffd851be40ac1b5dee6a143ec425d5d70 [file] [log] [blame]

Guido van Rossum	bf9d353	1997-10-06 14:45:17 +0000	[diff] [blame]	1	#!/usr/bin/env python
				2	# -- mode: python --
				3	# $Id$
				4
				5	import string
				6	import reop
				7
				8	# reop.error and re.error should be the same, since exceptions can be
				9	# raised from either module.
				10	error = reop.error # 're error'
				11
				12	from reop import NORMAL, CHARCLASS, REPLACEMENT
				13	from reop import CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET
				14	from reop import WORD_BOUNDARY, NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER
				15
				16	# compilation flags
				17
				18	IGNORECASE = I = 0x01
				19
				20	MULTILINE = M = 0x02
				21	DOTALL = S = 0x04
				22	VERBOSE = X = 0x08
				23
				24	repetition_operators = ['', '?', '+', '+?', '?', '??', '{n}', '{n}?',
				25	'{n,}', '{n,}?', '{n,m}', '{n,m}?']
				26
				27	#
				28	#
				29	#
				30
				31	def valid_identifier(id):
				32	if len(id) == 0:
				33	return 0
				34	if (not reop.syntax_table[id[0]] & reop.word) or \
				35	(reop.syntax_table[id[0]] & reop.digit):
				36	return 0
				37	for char in id[1:]:
				38	if not reop.syntax_table[char] & reop.word:
				39	return 0
				40	return 1
				41
				42	#
				43	#
				44	#
				45
				46	_cache = {}
				47	_MAXCACHE = 20
				48
				49	def _cachecompile(pattern, flags=0):
				50	key = (pattern, flags)
				51	try:
				52	return _cache[key]
				53	except KeyError:
				54	pass
				55	value = compile(pattern, flags)
				56	if len(_cache) >= _MAXCACHE:
				57	_cache.clear()
				58	_cache[key] = value
				59	return value
				60
				61	def match(pattern, string, flags=0):
				62	return _cachecompile(pattern, flags).match(string)
				63
				64	def search(pattern, string, flags=0):
				65	return _cachecompile(pattern, flags).search(string)
				66
				67	def sub(pattern, repl, string, count=0):
				68	if type(pattern) == type(''):
				69	pattern = _cachecompile(pattern)
				70	return pattern.sub(repl, string, count)
				71
				72	def subn(pattern, repl, string, count=0):
				73	if type(pattern) == type(''):
				74	pattern = _cachecompile(pattern)
				75	return pattern.subn(repl, string, count)
				76
				77	def split(pattern, string, maxsplit=0):
				78	if type(pattern) == type(''):
				79	pattern = _cachecompile(pattern)
				80	return pattern.split(string, maxsplit)
				81
				82	#
				83	#
				84	#
				85
				86	def _expand(m, repl):
				87	results = []
				88	index = 0
				89	size = len(repl)
				90	while index < size:
				91	found = string.find(repl, '\\', index)
				92	if found < 0:
				93	results.append(repl[index:])
				94	break
				95	if found > index:
				96	results.append(repl[index:found])
				97	escape_type, value, index = expand_escape(repl, found+1, REPLACEMENT)
				98	if escape_type == CHAR:
				99	results.append(value)
				100	elif escape_type == MEMORY_REFERENCE:
				101	r = m.group(value)
				102	if r is None:
				103	raise error, ('group "' + str(value) + '" did not contribute '
				104	'to the match')
				105	results.append(m.group(value))
				106	else:
				107	raise error, "bad escape in replacement"
				108	return string.join(results, '')
				109
				110	class RegexObject:
				111	def __init__(self, pattern, flags, code, num_regs, groupindex):
				112	self.code = code
				113	self.num_regs = num_regs
				114	self.flags = flags
				115	self.pattern = pattern
				116	self.groupindex = groupindex
				117	self.fastmap = build_fastmap(code)
				118
				119	if code[0].name == 'bol':
				120	self.anchor = 1
				121
				122	elif code[0].name == 'begbuf':
				123	self.anchor = 2
				124
				125	else:
				126	self.anchor = 0
				127
				128	self.buffer = assemble(code)
				129	def search(self, string, pos=0):
				130	regs = reop.search(self.buffer,
				131	self.num_regs,
				132	self.flags,
				133	self.fastmap.can_be_null,
				134	self.fastmap.fastmap(),
				135	self.anchor,
				136	string,
				137	pos)
				138	if regs is None:
				139	return None
				140
				141	return MatchObject(self,
				142	string,
				143	pos,
				144	regs)
				145
				146	def match(self, string, pos=0):
				147	regs = reop.match(self.buffer,
				148	self.num_regs,
				149	self.flags,
				150	self.fastmap.can_be_null,
				151	self.fastmap.fastmap(),
				152	self.anchor,
				153	string,
				154	pos)
				155	if regs is None:
				156	return None
				157
				158	return MatchObject(self,
				159	string,
				160	pos,
				161	regs)
				162
				163	def sub(self, repl, string, count=0):
				164	return self.subn(repl, string, count)[0]
				165
				166	def subn(self, repl, source, count=0):
				167	if count < 0:
				168	raise ValueError, "negative substibution count"
				169	if count == 0:
				170	import sys
				171	count = sys.maxint
				172	if type(repl) == type(''):
				173	if '\\' in repl:
				174	repl = lambda m, r=repl: _expand(m, r)
				175	else:
				176	repl = lambda m, r=repl: r
				177	n = 0 # Number of matches
				178	pos = 0 # Where to start searching
				179	lastmatch = -1 # End of last match
				180	results = [] # Substrings making up the result
				181	end = len(source)
				182	while n < count and pos <= end:
				183	m = self.search(source, pos)
				184	if not m:
				185	break
				186	i, j = m.span(0)
				187	if i == j == lastmatch:
				188	# Empty match adjacent to previous match
				189	pos = pos + 1
				190	results.append(source[lastmatch:pos])
				191	continue
				192	if pos < i:
				193	results.append(source[pos:i])
				194	results.append(repl(m))
				195	pos = lastmatch = j
				196	if i == j:
				197	# Last match was empty; don't try here again
				198	pos = pos + 1
				199	results.append(source[lastmatch:pos])
				200	n = n + 1
				201	results.append(source[pos:])
				202	return (string.join(results, ''), n)
				203
				204	def split(self, source, maxsplit=0):
				205	if maxsplit < 0:
				206	raise error, "negative split count"
				207	if maxsplit == 0:
				208	import sys
				209	maxsplit = sys.maxint
				210	n = 0
				211	pos = 0
				212	lastmatch = 0
				213	results = []
				214	end = len(source)
				215	while n < maxsplit:
				216	m = self.search(source, pos)
				217	if not m:
				218	break
				219	i, j = m.span(0)
				220	if i == j:
				221	# Empty match
				222	if pos >= end:
				223	break
				224	pos = pos+1
				225	continue
				226	results.append(source[lastmatch:i])
				227	g = m.group()
				228	if g:
				229	results[len(results):] = list(g)
				230	pos = lastmatch = j
				231	results.append(source[lastmatch:])
				232	return results
				233
				234	class MatchObject:
				235	def __init__(self, re, string, pos, regs):
				236	self.re = re
				237	self.string = string
				238	self.pos = pos
				239	self.regs = regs
				240
				241	def start(self, g):
				242	if type(g) == type(''):
				243	try:
				244	g = self.re.groupindex[g]
				245	except (KeyError, TypeError):
				246	raise IndexError, ('group "' + g + '" is undefined')
				247	return self.regs[g][0]
				248
				249	def end(self, g):
				250	if type(g) == type(''):
				251	try:
				252	g = self.re.groupindex[g]
				253	except (KeyError, TypeError):
				254	raise IndexError, ('group "' + g + '" is undefined')
				255	return self.regs[g][1]
				256
				257	def span(self, g):
				258	if type(g) == type(''):
				259	try:
				260	g = self.re.groupindex[g]
				261	except (KeyError, TypeError):
				262	raise IndexError, ('group "' + g + '" is undefined')
				263	return self.regs[g]
				264
				265	def group(self, *groups):
				266	if len(groups) == 0:
				267	groups = range(1, self.re.num_regs)
				268	use_all = 1
				269	else:
				270	use_all = 0
				271	result = []
				272	for g in groups:
				273	if type(g) == type(''):
				274	try:
				275	g = self.re.groupindex[g]
				276	except (KeyError, TypeError):
				277	raise IndexError, ('group "' + g + '" is undefined')
				278	if (self.regs[g][0] == -1) or (self.regs[g][1] == -1):
				279	result.append(None)
				280	else:
				281	result.append(self.string[self.regs[g][0]:self.regs[g][1]])
				282	if use_all or len(result) > 1:
				283	return tuple(result)
				284	elif len(result) == 1:
				285	return result[0]
				286	else:
				287	return ()
				288
				289	#
				290	# A set of classes to make assembly a bit easier, if a bit verbose.
				291	#
				292
				293	class Instruction:
				294	def __init__(self, opcode, size=1):
				295	self.opcode = opcode
				296	self.size = size
				297	def assemble(self, position, labels):
				298	return self.opcode
				299	def __repr__(self):
				300	return '%-15s' % (self.name)
				301
				302	class End(Instruction):
				303	name = 'end'
				304	def __init__(self):
				305	Instruction.__init__(self, chr(0))
				306
				307	class Bol(Instruction):
				308	name = 'bol'
				309	def __init__(self):
				310	self.name = 'bol'
				311	Instruction.__init__(self, chr(1))
				312
				313	class Eol(Instruction):
				314	name = 'eol'
				315	def __init__(self):
				316	Instruction.__init__(self, chr(2))
				317
				318	class Set(Instruction):
				319	name = 'set'
				320	def __init__(self, set, flags=0):
				321	self.set = set
				322	if flags & IGNORECASE: self.set=map(string.lower, self.set)
				323	if len(set)==1:
				324	# If only one element, use the "exact" opcode (it'll be faster)
				325	Instruction.__init__(self, chr(4), 2)
				326	else:
				327	# Use the "set" opcode
				328	Instruction.__init__(self, chr(3), 33)
				329	def assemble(self, position, labels):
				330	if len(self.set)==1:
				331	# If only one character in set, generate an "exact" opcode
				332	return self.opcode + self.set[0]
				333	result = self.opcode
				334	temp = 0
				335	for i, c in map(lambda x: (x, chr(x)), range(256)):
				336	if c in self.set:
				337	temp = temp \| (1 << (i & 7))
				338	if (i % 8) == 7:
				339	result = result + chr(temp)
				340	temp = 0
				341	return result
				342	def __repr__(self):
				343	result = '%-15s' % (self.name)
				344	self.set.sort()
				345	# XXX this should print more intelligently
				346	for char in self.set:
				347	result = result + char
				348	return result
				349
				350	class Exact(Instruction):
				351	name = 'exact'
				352	def __init__(self, char, flags):
				353	self.char = char
				354	if flags & IGNORECASE: self.char=string.lower(self.char)
				355	Instruction.__init__(self, chr(4), 2)
				356	def assemble(self, position, labels):
				357	return self.opcode + self.char
				358	def __repr__(self):
				359	return '%-15s %s' % (self.name, `self.char`)
				360
				361	class AnyChar(Instruction):
				362	name = 'anychar'
				363	def __init__(self):
				364	Instruction.__init__(self, chr(5))
				365	def assemble(self, position, labels):
				366	return self.opcode
				367
				368	class MemoryInstruction(Instruction):
				369	def __init__(self, opcode, register):
				370	self.register = register
				371	Instruction.__init__(self, opcode, 2)
				372	def assemble(self, position, labels):
				373	return self.opcode + chr(self.register)
				374	def __repr__(self):
				375	return '%-15s %i' % (self.name, self.register)
				376
				377	class StartMemory(MemoryInstruction):
				378	name = 'start_memory'
				379	def __init__(self, register):
				380	MemoryInstruction.__init__(self, chr(6), register)
				381
				382	class EndMemory(MemoryInstruction):
				383	name = 'end_memory'
				384	def __init__(self, register):
				385	MemoryInstruction.__init__(self, chr(7), register)
				386
				387	class MatchMemory(MemoryInstruction):
				388	name = 'match_memory'
				389	def __init__(self, register):
				390	MemoryInstruction.__init__(self, chr(8), register)
				391
				392	class JumpInstruction(Instruction):
				393	def __init__(self, opcode, label):
				394	self.label = label
				395	Instruction.__init__(self, opcode, 3)
				396	def compute_offset(self, start, dest):
				397	return dest - (start + 3)
				398	def pack_offset(self, offset):
				399	if offset > 32767:
				400	raise error, 'offset out of range (pos)'
				401	elif offset < -32768:
				402	raise error, 'offset out of range (neg)'
				403	elif offset < 0:
				404	offset = offset + 65536
				405	return chr(offset & 0xff) + chr((offset >> 8) & 0xff)
				406	def assemble(self, position, labels):
				407	return self.opcode + \
				408	self.pack_offset(self.compute_offset(position,
				409	labels[self.label]))
				410	def __repr__(self):
				411	return '%-15s %i' % (self.name, self.label)
				412
				413	class Jump(JumpInstruction):
				414	name = 'jump'
				415	def __init__(self, label):
				416	JumpInstruction.__init__(self, chr(9), label)
				417
				418	class StarJump(JumpInstruction):
				419	name = 'star_jump'
				420	def __init__(self, label):
				421	JumpInstruction.__init__(self, chr(10), label)
				422
				423	class FailureJump(JumpInstruction):
				424	name = 'failure_jump'
				425	def __init__(self, label):
				426	JumpInstruction.__init__(self, chr(11), label)
				427
				428	class UpdateFailureJump(JumpInstruction):
				429	name = 'update_failure_jump'
				430	def __init__(self, label):
				431	JumpInstruction.__init__(self, chr(12), label)
				432
				433	class DummyFailureJump(JumpInstruction):
				434	name = 'dummy_failure_jump'
				435	def __init__(self, label):
				436	JumpInstruction.__init__(self, chr(13), label)
				437
				438	class BegBuf(Instruction):
				439	name = 'begbuf'
				440	def __init__(self):
				441	Instruction.__init__(self, chr(14))
				442
				443	class EndBuf(Instruction):
				444	name = 'endbuf'
				445	def __init__(self):
				446	Instruction.__init__(self, chr(15))
				447
				448	class WordBeg(Instruction):
				449	name = 'wordbeg'
				450	def __init__(self):
				451	Instruction.__init__(self, chr(16))
				452
				453	class WordEnd(Instruction):
				454	name = 'wordend'
				455	def __init__(self):
				456	Instruction.__init__(self, chr(17))
				457
				458	class WordBound(Instruction):
				459	name = 'wordbound'
				460	def __init__(self):
				461	Instruction.__init__(self, chr(18))
				462
				463	class NotWordBound(Instruction):
				464	name = 'notwordbound'
				465	def __init__(self):
				466	Instruction.__init__(self, chr(19))
				467
				468	class SyntaxSpec(Instruction):
				469	name = 'syntaxspec'
				470	def __init__(self, syntax):
				471	self.syntax = syntax
				472	Instruction.__init__(self, chr(20), 2)
				473	def assemble(self, postition, labels):
				474	return self.opcode + chr(self.syntax)
				475
				476	class NotSyntaxSpec(Instruction):
				477	name = 'notsyntaxspec'
				478	def __init__(self, syntax):
				479	self.syntax = syntax
				480	Instruction.__init__(self, chr(21), 2)
				481	def assemble(self, postition, labels):
				482	return self.opcode + chr(self.syntax)
				483
				484	class Label(Instruction):
				485	name = 'label'
				486	def __init__(self, label):
				487	self.label = label
				488	Instruction.__init__(self, '', 0)
				489	def __repr__(self):
				490	return '%-15s %i' % (self.name, self.label)
				491
				492	class OpenParen(Instruction):
				493	name = '('
				494	def __init__(self, register):
				495	self.register = register
				496	Instruction.__init__(self, '', 0)
				497	def assemble(self, position, labels):
				498	raise error, 'unmatched open parenthesis'
				499
				500	class Alternation(Instruction):
				501	name = '\|'
				502	def __init__(self):
				503	Instruction.__init__(self, '', 0)
				504	def assemble(self, position, labels):
				505	raise error, 'an alternation was not taken care of'
				506
				507	#
				508	#
				509	#
				510
				511	def assemble(instructions):
				512	labels = {}
				513	position = 0
				514	pass1 = []
				515	for instruction in instructions:
				516	if instruction.name == 'label':
				517	labels[instruction.label] = position
				518	else:
				519	pass1.append((position, instruction))
				520	position = position + instruction.size
				521	pass2 = ''
				522	for position, instruction in pass1:
				523	pass2 = pass2 + instruction.assemble(position, labels)
				524	return pass2
				525
				526	#
				527	#
				528	#
				529
				530	def escape(pattern):
				531	result = []
				532	for char in pattern:
				533	if not reop.syntax_table[char] & reop.word:
				534	result.append('\\')
				535	result.append(char)
				536	return string.join(result, '')
				537
				538	#
				539	#
				540	#
				541
				542	def registers_used(instructions):
				543	result = []
				544	for instruction in instructions:
				545	if (instruction.name in ['set_memory', 'end_memory']) and \
				546	(instruction.register not in result):
				547	result.append(instruction.register)
				548	return result
				549
				550	#
				551	#
				552	#
				553
				554	class Fastmap:
				555	def __init__(self):
				556	self.map = ['\000']*256
				557	self.can_be_null = 0
				558	def add(self, char):
				559	self.map[ord(char)] = '\001'
				560	def fastmap(self):
				561	return string.join(self.map, '')
				562	def __getitem__(self, char):
				563	return ord(self.map[ord(char)])
				564	def __repr__(self):
				565	self.map.sort()
				566	return 'Fastmap(' + `self.can_be_null` + ', ' + `self.map` + ')'
				567
				568	#
				569	#
				570	#
				571
				572	def find_label(code, label):
				573	line = 0
				574	for instruction in code:
				575	if (instruction.name == 'label') and (instruction.label == label):
				576	return line + 1
				577	line = line + 1
				578
				579	def build_fastmap_aux(code, pos, visited, fastmap):
				580	if visited[pos]:
				581	return
				582	while 1:
				583	instruction = code[pos]
				584	visited[pos] = 1
				585	pos = pos + 1
				586	if instruction.name == 'end':
				587	fastmap.can_be_null = 1
				588	return
				589	elif instruction.name == 'syntaxspec':
				590	for char in map(chr, range(256)):
				591	if reop.syntax_table[char] & instruction.syntax:
				592	fastmap.add(char)
				593	return
				594	elif instruction.name == 'notsyntaxspec':
				595	for char in map(chr, range(256)):
				596	if not reop.syntax_table[char] & instruction.syntax:
				597	fastmap.add(char)
				598	return
				599	elif instruction.name == 'eol':
				600	fastmap.add('\n')
				601	if fastmap.can_be_null == 0:
				602	fastmap.can_be_null = 2
				603	return
				604	elif instruction.name == 'set':
				605	for char in instruction.set:
				606	fastmap.add(char)
				607	return
				608	elif instruction.name == 'exact':
				609	fastmap.add(instruction.char)
				610	elif instruction.name == 'anychar':
				611	for char in map(chr, range(256)):
				612	if char != '\n':
				613	fastmap.add(char)
				614	return
				615	elif instruction.name == 'match_memory':
				616	for char in map(chr, range(256)):
				617	fastmap.add(char)
				618	fastmap.can_be_null = 1
				619	return
				620	elif instruction.name in ['jump', 'dummy_failure_jump', \
				621	'update_failure_jump', 'star_jump']:
				622	pos = find_label(code, instruction.label)
				623	if visited[pos]:
				624	return
				625	visited[pos] = 1
				626	elif instruction.name == 'failure_jump':
				627	build_fastmap_aux(code,
				628	find_label(code, instruction.label),
				629	visited,
				630	fastmap)
				631
				632	def build_fastmap(code, pos=0):
				633	visited = [0] * len(code)
				634	fastmap = Fastmap()
				635	build_fastmap_aux(code, pos, visited, fastmap)
				636	return fastmap
				637
				638	#
				639	#
				640	#
				641
				642	#[NORMAL, CHARCLASS, REPLACEMENT] = range(3)
				643	#[CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET, WORD_BOUNDARY,
				644	# NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER] = range(9)
				645
				646	def expand_escape(pattern, index, context=NORMAL):
				647	if index >= len(pattern):
				648	raise error, 'escape ends too soon'
				649
				650	elif pattern[index] == 't':
				651	return CHAR, chr(9), index + 1
				652
				653	elif pattern[index] == 'n':
				654	return CHAR, chr(10), index + 1
				655
				656	elif pattern[index] == 'v':
				657	return CHAR, chr(11), index + 1
				658
				659	elif pattern[index] == 'r':
				660	return CHAR, chr(13), index + 1
				661
				662	elif pattern[index] == 'f':
				663	return CHAR, chr(12), index + 1
				664
				665	elif pattern[index] == 'a':
				666	return CHAR, chr(7), index + 1
				667
				668	elif pattern[index] == 'x':
				669	# CAUTION: this is the Python rule, not the Perl rule!
				670	end = index + 1 # Skip over the 'x' character
				671	while (end < len(pattern)) and (pattern[end] in string.hexdigits):
				672	end = end + 1
				673	if end == index:
				674	raise error, "\\x must be followed by hex digit(s)"
				675	# let Python evaluate it, so we don't incorrectly 2nd-guess
				676	# what it's doing (and Python in turn passes it on to sscanf,
				677	# so that it doesn't incorrectly 2nd-guess what C does!)
				678	char = eval ('"' + pattern[index-1:end] + '"')
				679	assert len(char) == 1
				680	return CHAR, char, end
				681
				682	elif pattern[index] == 'b':
				683	if context != NORMAL:
				684	return CHAR, chr(8), index + 1
				685	else:
				686	return WORD_BOUNDARY, '', index + 1
				687
				688	elif pattern[index] == 'B':
				689	if context != NORMAL:
				690	return CHAR, 'B', index + 1
				691	else:
				692	return NOT_WORD_BOUNDARY, '', index + 1
				693
				694	elif pattern[index] == 'A':
				695	if context != NORMAL:
				696	return CHAR, 'A', index + 1
				697	else:
				698	return BEGINNING_OF_BUFFER, '', index + 1
				699
				700	elif pattern[index] == 'Z':
				701	if context != NORMAL:
				702	return CHAR, 'Z', index + 1
				703	else:
				704	return END_OF_BUFFER, '', index + 1
				705
				706	elif pattern[index] in 'GluLUQE':
				707	raise error, ('\\' + pattern[index] + ' is not allowed')
				708
				709	elif pattern[index] == 'w':
				710	if context == NORMAL:
				711	return SYNTAX, reop.word, index + 1
				712	elif context == CHARCLASS:
				713	set = []
				714	for char in reop.syntax_table.keys():
				715	if reop.syntax_table[char] & reop.word:
				716	set.append(char)
				717	return SET, set, index + 1
				718	else:
				719	return CHAR, 'w', index + 1
				720
				721	elif pattern[index] == 'W':
				722	if context == NORMAL:
				723	return NOT_SYNTAX, reop.word, index + 1
				724	elif context == CHARCLASS:
				725	set = []
				726	for char in reop.syntax_table.keys():
				727	if not reop.syntax_table[char] & reop.word:
				728	set.append(char)
				729	return SET, set, index + 1
				730	else:
				731	return CHAR, 'W', index + 1
				732
				733	elif pattern[index] == 's':
				734	if context == NORMAL:
				735	return SYNTAX, reop.whitespace, index + 1
				736	elif context == CHARCLASS:
				737	set = []
				738	for char in reop.syntax_table.keys():
				739	if reop.syntax_table[char] & reop.whitespace:
				740	set.append(char)
				741	return SET, set, index + 1
				742	else:
				743	return CHAR, 's', index + 1
				744
				745	elif pattern[index] == 'S':
				746	if context == NORMAL:
				747	return NOT_SYNTAX, reop.whitespace, index + 1
				748	elif context == CHARCLASS:
				749	set = []
				750	for char in reop.syntax_table.keys():
				751	if not reop.syntax_table[char] & reop.whitespace:
				752	set.append(char)
				753	return SET, set, index + 1
				754	else:
				755	return CHAR, 'S', index + 1
				756
				757	elif pattern[index] == 'd':
				758	if context == NORMAL:
				759	return SYNTAX, reop.digit, index + 1
				760	elif context == CHARCLASS:
				761	set = []
				762	for char in reop.syntax_table.keys():
				763	if reop.syntax_table[char] & reop.digit:
				764	set.append(char)
				765	return SET, set, index + 1
				766	else:
				767	return CHAR, 'd', index + 1
				768
				769	elif pattern[index] == 'D':
				770	if context == NORMAL:
				771	return NOT_SYNTAX, reop.digit, index + 1
				772	elif context == CHARCLASS:
				773	set = []
				774	for char in reop.syntax_table.keys():
				775	if not reop.syntax_table[char] & reop.digit:
				776	set.append(char)
				777	return SET, set, index + 1
				778	else:
				779	return CHAR, 'D', index + 1
				780
				781	elif pattern[index] in '0123456789':
				782
				783	if pattern[index] == '0':
				784	if (index + 1 < len(pattern)) and \
				785	(pattern[index + 1] in string.octdigits):
				786	if (index + 2 < len(pattern)) and \
				787	(pattern[index + 2] in string.octdigits):
				788	value = string.atoi(pattern[index:index + 3], 8)
				789	index = index + 3
				790
				791	else:
				792	value = string.atoi(pattern[index:index + 2], 8)
				793	index = index + 2
				794
				795	else:
				796	value = 0
				797	index = index + 1
				798
				799	if value > 255:
				800	raise error, 'octal value out of range'
				801
				802	return CHAR, chr(value), index
				803
				804	else:
				805	if (index + 1 < len(pattern)) and \
				806	(pattern[index + 1] in string.digits):
				807	if (index + 2 < len(pattern)) and \
				808	(pattern[index + 2] in string.octdigits) and \
				809	(pattern[index + 1] in string.octdigits) and \
				810	(pattern[index] in string.octdigits):
				811	value = string.atoi(pattern[index:index + 3], 8)
				812	if value > 255:
				813	raise error, 'octal value out of range'
				814
				815	return CHAR, chr(value), index + 3
				816
				817	else:
				818	value = string.atoi(pattern[index:index + 2])
				819	if (value < 1) or (value > 99):
				820	raise error, 'memory reference out of range'
				821
				822	if context == CHARCLASS:
				823	raise error, ('cannot reference a register from '
				824	'inside a character class')
				825	return MEMORY_REFERENCE, value, index + 2
				826
				827	else:
				828	if context == CHARCLASS:
				829	raise error, ('cannot reference a register from '
				830	'inside a character class')
				831
				832	value = string.atoi(pattern[index])
				833	return MEMORY_REFERENCE, value, index + 1
				834
				835	elif pattern[index] == 'g':
				836	if context != REPLACEMENT:
				837	return CHAR, 'g', index + 1
				838
				839	index = index + 1
				840	if index >= len(pattern):
				841	raise error, 'unfinished symbolic reference'
				842	if pattern[index] != '<':
				843	raise error, 'missing < in symbolic reference'
				844
				845	index = index + 1
				846	end = string.find(pattern, '>', index)
				847	if end == -1:
				848	raise error, 'unfinished symbolic reference'
				849	value = pattern[index:end]
				850	if not valid_identifier(value):
				851	raise error, 'illegal symbolic reference'
				852	return MEMORY_REFERENCE, value, end + 1
				853
				854	else:
				855	return CHAR, pattern[index], index + 1
				856
				857	def compile(pattern, flags=0):
				858	stack = []
				859	label = 0
				860	register = 1
				861	groupindex = {}
				862	lastop = ''
				863
				864	# look for embedded pattern modifiers at the beginning of the pattern
				865
				866	index = 0
				867
				868	if len(pattern) >= 3 and \
				869	(pattern[:2] == '(?') and \
				870	(pattern[2] in 'iImMsSxX'):
				871	index = 2
				872	while (index < len(pattern)) and (pattern[index] != ')'):
				873	if pattern[index] in 'iI':
				874	flags = flags \| IGNORECASE
				875	elif pattern[index] in 'mM':
				876	flags = flags \| MULTILINE
				877	elif pattern[index] in 'sS':
				878	flags = flags \| DOTALL
				879	elif pattern[index] in 'xX':
				880	flags = flags \| VERBOSE
				881	else:
				882	raise error, 'unknown modifier'
				883	index = index + 1
				884	index = index + 1
				885
				886	# compile the rest of the pattern
				887
				888	while (index < len(pattern)):
				889	char = pattern[index]
				890	index = index + 1
				891	if char == '\\':
				892	escape_type, value, index = expand_escape(pattern, index)
				893
				894	if escape_type == CHAR:
				895	stack.append([Exact(value, flags)])
				896	lastop = '\\' + value
				897
				898	elif escape_type == MEMORY_REFERENCE:
				899	if value >= register:
				900	raise error, ('cannot reference a register '
				901	'not yet used')
				902	stack.append([MatchMemory(value)])
				903	lastop = '\\1'
				904
				905	elif escape_type == BEGINNING_OF_BUFFER:
				906	stack.append([BegBuf()])
				907	lastop = '\\A'
				908
				909	elif escape_type == END_OF_BUFFER:
				910	stack.append([EndBuf()])
				911	lastop = '\\Z'
				912
				913	elif escape_type == WORD_BOUNDARY:
				914	stack.append([WordBound()])
				915	lastop = '\\b'
				916
				917	elif escape_type == NOT_WORD_BOUNDARY:
				918	stack.append([NotWordBound()])
				919	lastop = '\\B'
				920
				921	elif escape_type == SYNTAX:
				922	stack.append([SyntaxSpec(value)])
				923	if value == reop.word:
				924	lastop = '\\w'
				925	elif value == reop.whitespace:
				926	lastop = '\\s'
				927	elif value == reop.digit:
				928	lastop = '\\d'
				929	else:
				930	lastop = '\\?'
				931
				932	elif escape_type == NOT_SYNTAX:
				933	stack.append([NotSyntaxSpec(value)])
				934	if value == reop.word:
				935	lastop = '\\W'
				936	elif value == reop.whitespace:
				937	lastop = '\\S'
				938	elif value == reop.digit:
				939	lastop = '\\D'
				940	else:
				941	lastop = '\\?'
				942
				943	elif escape_type == SET:
				944	raise error, 'cannot use set escape type here'
				945
				946	else:
				947	raise error, 'unknown escape type'
				948
				949	elif char == '\|':
				950	expr = []
				951
				952	while (len(stack) != 0) and \
				953	(stack[-1][0].name != '(') and \
				954	(stack[-1][0].name != '\|'):
				955	expr = stack[-1] + expr
				956	del stack[-1]
				957	stack.append([FailureJump(label)] + \
				958	expr + \
				959	[Jump(-1),
				960	Label(label)])
				961	stack.append([Alternation()])
				962	label = label + 1
				963	lastop = '\|'
				964
				965	elif char == '(':
				966	if index >= len(pattern):
				967	raise error, 'no matching close paren'
				968
				969	elif pattern[index] == '?':
				970	# Perl style (?...) extensions
				971	index = index + 1
				972	if index >= len(pattern):
				973	raise error, 'extension ends prematurely'
				974
				975	elif pattern[index] == 'P':
				976	# Python extensions
				977	index = index + 1
				978	if index >= len(pattern):
				979	raise error, 'extension ends prematurely'
				980
				981	elif pattern[index] == '<':
				982	# Handle Python symbolic group names (?P<...>...)
				983	index = index + 1
				984	end = string.find(pattern, '>', index)
				985	if end == -1:
				986	raise error, 'no end to symbolic group name'
				987	name = pattern[index:end]
				988	if not valid_identifier(name):
				989	raise error, ('symbolic group name must be a '
				990	'valid identifier')
				991	index = end + 1
				992	groupindex[name] = register
				993	stack.append([OpenParen(register)])
				994	register = register + 1
				995	lastop = '('
				996
				997	elif pattern[index] == '=':
				998	# backreference to symbolic group name
				999	if index >= len(pattern):
				1000	raise error, '(?P= at the end of the pattern'
				1001	start = index + 1
				1002	end = string.find(pattern, ')', start)
				1003	if end == -1:
				1004	raise error, 'no ) to end symbolic group name'
				1005	name = pattern[start:end]
				1006	if name not in groupindex.keys():
				1007	raise error, ('symbolic group name ' + name + \
				1008	' has not been used yet')
				1009	stack.append([MatchMemory(groupindex[name])])
				1010	index = end + 1
				1011	lastop = '(?P=)'
				1012
				1013	else:
				1014	raise error, ('unknown Python extension: ' + \
				1015	pattern[index])
				1016
				1017	elif pattern[index] == ':':
				1018	# grouping, but no registers
				1019	index = index + 1
				1020	stack.append([OpenParen(-1)])
				1021	lastop = '('
				1022
				1023	elif pattern[index] == '#':
				1024	# comment
				1025	index = index + 1
				1026	end = string.find(pattern, ')', index)
				1027	if end == -1:
				1028	raise error, 'no end to comment'
				1029	index = end + 1
				1030	# do not change lastop
				1031
				1032	elif pattern[index] == '=':
				1033	raise error, ('zero-width positive lookahead '
				1034	'assertion is unsupported')
				1035
				1036	elif pattern[index] == '!':
				1037	raise error, ('zero-width negative lookahead '
				1038	'assertion is unsupported')
				1039
				1040	elif pattern[index] in 'iImMsSxX':
				1041	raise error, ('embedded pattern modifiers are only '
				1042	'allowed at the beginning of the pattern')
				1043
				1044	else:
				1045	raise error, 'unknown extension'
				1046
				1047	else:
				1048	stack.append([OpenParen(register)])
				1049	register = register + 1
				1050	lastop = '('
				1051
				1052	elif char == ')':
				1053	# make one expression out of everything on the stack up to
				1054	# the marker left by the last parenthesis
				1055	expr = []
				1056	while (len(stack) > 0) and (stack[-1][0].name != '('):
				1057	expr = stack[-1] + expr
				1058	del stack[-1]
				1059
				1060	if len(stack) == 0:
				1061	raise error, 'too many close parens'
				1062
				1063	# remove markers left by alternation
				1064	expr = filter(lambda x: x.name != '\|', expr)
				1065
				1066	# clean up jumps inserted by alternation
				1067	need_label = 0
				1068	for i in range(len(expr)):
				1069	if (expr[i].name == 'jump') and (expr[i].label == -1):
				1070	expr[i] = Jump(label)
				1071	need_label = 1
				1072	if need_label:
				1073	expr.append(Label(label))
				1074	label = label + 1
				1075
				1076	if stack[-1][0].register > 0:
				1077	expr = [StartMemory(stack[-1][0].register)] + \
				1078	expr + \
				1079	[EndMemory(stack[-1][0].register)]
				1080	del stack[-1]
				1081	stack.append(expr)
				1082	lastop = ')'
				1083
				1084	elif char == '{':
				1085	if len(stack) == 0:
				1086	raise error, 'no expression to repeat'
				1087	end = string.find(pattern, '}', index)
				1088	if end == -1:
				1089	raise error, ('no close curly bracket to match'
				1090	' open curly bracket')
				1091
				1092	fields = map(string.strip,
				1093	string.split(pattern[index:end], ','))
				1094	index = end + 1
				1095
				1096	minimal = 0
				1097	if (index < len(pattern)) and (pattern[index] == '?'):
				1098	minimal = 1
				1099	index = index + 1
				1100
				1101	if len(fields) == 1:
				1102	# {n} or {n}? (there's really no difference)
				1103	try:
				1104	count = string.atoi(fields[0])
				1105	except ValueError:
				1106	raise error, ('count must be an integer '
				1107	'inside curly braces')
				1108	if count > 65535:
				1109	raise error, 'repeat count out of range'
				1110	expr = []
				1111	while count > 0:
				1112	expr = expr + stack[-1]
				1113	count = count - 1
				1114	del stack[-1]
				1115	stack.append(expr)
				1116	if minimal:
				1117	lastop = '{n}?'
				1118	else:
				1119	lastop = '{n}'
				1120
				1121	elif len(fields) == 2:
				1122	# {n,} or {n,m}
				1123	if fields[1] == '':
				1124	# {n,}
				1125	try:
				1126	min = string.atoi(fields[0])
				1127	except ValueError:
				1128	raise error, ('minimum must be an integer '
				1129	'inside curly braces')
				1130	if min > 65535:
				1131	raise error, 'minimum repeat count out of range'
				1132
				1133	expr = []
				1134	while min > 0:
				1135	expr = expr + stack[-1]
				1136	min = min - 1
				1137	if minimal:
				1138	expr = expr + \
				1139	([Jump(label + 1),
				1140	Label(label)] + \
				1141	stack[-1] + \
				1142	[Label(label + 1),
				1143	FailureJump(label)])
				1144	lastop = '{n,}?'
				1145	else:
				1146	expr = expr + \
				1147	([Label(label),
				1148	FailureJump(label + 1)] +
				1149	stack[-1] +
				1150	[StarJump(label),
				1151	Label(label + 1)])
				1152	lastop = '{n,}'
				1153
				1154	del stack[-1]
				1155	stack.append(expr)
				1156	label = label + 2
				1157
				1158	else:
				1159	# {n,m}
				1160	try:
				1161	min = string.atoi(fields[0])
				1162	except ValueError:
				1163	raise error, ('minimum must be an integer '
				1164	'inside curly braces')
				1165	try:
				1166	max = string.atoi(fields[1])
				1167	except ValueError:
				1168	raise error, ('maximum must be an integer '
				1169	'inside curly braces')
				1170	if min > 65535:
				1171	raise error, ('minumim repeat count out '
				1172	'of range')
				1173	if max > 65535:
				1174	raise error, ('maximum repeat count out '
				1175	'of range')
				1176	if min > max:
				1177	raise error, ('minimum repeat count must be '
				1178	'less than the maximum '
				1179	'repeat count')
				1180	expr = []
				1181	while min > 0:
				1182	expr = expr + stack[-1]
				1183	min = min - 1
				1184	max = max - 1
				1185	if minimal:
				1186	while max > 0:
				1187	expr = expr + \
				1188	[FailureJump(label),
				1189	Jump(label + 1),
				1190	Label(label)] + \
				1191	stack[-1] + \
				1192	[Label(label + 1)]
				1193	max = max - 1
				1194	label = label + 2
				1195	del stack[-1]
				1196	stack.append(expr)
				1197	lastop = '{n,m}?'
				1198	else:
				1199	while max > 0:
				1200	expr = expr + \
				1201	[FailureJump(label)] + \
				1202	stack[-1]
				1203	max = max - 1
				1204	del stack[-1]
				1205	stack.append(expr + [Label(label)])
				1206	label = label + 1
				1207	lastop = '{n,m}'
				1208
				1209	else:
				1210	raise error, ('there need to be one or two fields '
				1211	'in a {} expression')
				1212
				1213	elif char == '}':
				1214	raise error, 'unbalanced close curly brace'
				1215
				1216	elif char == '*':
				1217	# Kleene closure
				1218	if len(stack) == 0:
				1219	raise error, '* needs something to repeat'
				1220
				1221	if lastop in ['(', '\|']:
				1222	raise error, '* needs something to repeat'
				1223
				1224	if lastop in repetition_operators:
				1225	raise error, 'nested repetition operators'
				1226
				1227	if (index < len(pattern)) and (pattern[index] == '?'):
				1228	# non-greedy matching
				1229	expr = [Jump(label + 1),
				1230	Label(label)] + \
				1231	stack[-1] + \
				1232	[Label(label + 1),
				1233	FailureJump(label)]
				1234	index = index + 1
				1235	lastop = '*?'
				1236	else:
				1237	# greedy matching
				1238	expr = [Label(label),
				1239	FailureJump(label + 1)] + \
				1240	stack[-1] + \
				1241	[StarJump(label),
				1242	Label(label + 1)]
				1243	lastop = '*'
				1244	del stack[-1]
				1245	stack.append(expr)
				1246	label = label + 2
				1247
				1248	elif char == '+':
				1249	# positive closure
				1250	if len(stack) == 0:
				1251	raise error, '+ needs something to repeat'
				1252
				1253	if lastop in ['(', '\|']:
				1254	raise error, '+ needs something to repeat'
				1255
				1256	if lastop in repetition_operators:
				1257	raise error, 'nested repetition operators'
				1258
				1259	if (index < len(pattern)) and (pattern[index] == '?'):
				1260	# non-greedy
				1261	expr = [Label(label)] + \
				1262	stack[-1] + \
				1263	[FailureJump(label)]
				1264	label = label + 1
				1265	index = index + 1
				1266	lastop = '+?'
				1267
				1268	else:
				1269	# greedy
				1270	expr = [DummyFailureJump(label + 1),
				1271	Label(label),
				1272	FailureJump(label + 2),
				1273	Label(label + 1)] + \
				1274	stack[-1] + \
				1275	[StarJump(label),
				1276	Label(label + 2)]
				1277	label = label + 3
				1278	lastop = '+'
				1279
				1280	del stack[-1]
				1281	stack.append(expr)
				1282
				1283	elif char == '?':
				1284	if len(stack) == 0:
				1285	raise error, 'need something to be optional'
				1286
				1287	if len(stack) == 0:
				1288	raise error, '? needs something to repeat'
				1289
				1290	if lastop in ['(', '\|']:
				1291	raise error, '? needs something to repeat'
				1292
				1293	if lastop in repetition_operators:
				1294	raise error, 'nested repetition operators'
				1295
				1296	if (index < len(pattern)) and (pattern[index] == '?'):
				1297	# non-greedy matching
				1298	expr = [FailureJump(label),
				1299	Jump(label + 1),
				1300	Label(label)] + \
				1301	stack[-1] + \
				1302	[Label(label + 1)]
				1303	label = label + 2
				1304	index = index + 1
				1305	lastop = '??'
				1306
				1307	else:
				1308	# greedy matching
				1309	expr = [FailureJump(label)] + \
				1310	stack[-1] + \
				1311	[Label(label)]
				1312	label = label + 1
				1313	lastop = '?'
				1314
				1315	del stack[-1]
				1316	stack.append(expr)
				1317
				1318	elif char == '.':
				1319	if flags & DOTALL:
				1320	stack.append([Set(map(chr, range(256)), flags)])
				1321	else:
				1322	stack.append([AnyChar()])
				1323	lastop = '.'
				1324
				1325	elif char == '^':
				1326	if flags & MULTILINE:
				1327	stack.append([Bol()])
				1328	else:
				1329	stack.append([BegBuf()])
				1330	lastop = '^'
				1331
				1332	elif char == '$':
				1333	if flags & MULTILINE:
				1334	stack.append([Eol()])
				1335	else:
				1336	stack.append([EndBuf()])
				1337	lastop = '$'
				1338
				1339	elif char == '#':
				1340	if flags & VERBOSE:
				1341	# comment
				1342	index = index + 1
				1343	end = string.find(pattern, '\n', index)
				1344	if end == -1:
				1345	index = len(pattern)
				1346	else:
				1347	index = end + 1
				1348	# do not change lastop
				1349	else:
				1350	stack.append([Exact(char, flags)])
				1351	lastop = '#'
				1352
				1353	elif char in string.whitespace:
				1354	if not (flags & VERBOSE):
				1355	stack.append([Exact(char, flags)])
				1356	lastop = char
				1357
				1358	elif char == '[':
				1359	# compile character class
				1360
				1361	if index >= len(pattern):
				1362	raise error, 'unclosed character class'
				1363
				1364	negate = 0
				1365	last = ''
				1366	set = []
				1367
				1368	if pattern[index] == '^':
				1369	negate = 1
				1370	index = index + 1
				1371	if index >= len(pattern):
				1372	raise error, 'unclosed character class'
				1373
				1374	if pattern[index] == ']':
				1375	set.append(']')
				1376	index = index + 1
				1377	if index >= len(pattern):
				1378	raise error, 'unclosed character class'
				1379
				1380	elif pattern[index] == '-':
				1381	set.append('-')
				1382	index = index + 1
				1383	if index >= len(pattern):
				1384	raise error, 'unclosed character class'
				1385
				1386	while (index < len(pattern)) and (pattern[index] != ']'):
				1387	next = pattern[index]
				1388	index = index + 1
				1389	if next == '-':
				1390	if index >= len(pattern):
				1391	raise error, 'incomplete range in character class'
				1392
				1393	elif pattern[index] == ']':
				1394	set.append('-')
				1395
				1396	else:
				1397	if last == '':
				1398	raise error, ('improper use of range in '
				1399	'character class')
				1400
				1401	start = last
				1402
				1403	if pattern[index] == '\\':
				1404	escape_type,
				1405	value,
				1406	index = expand_escape(pattern,
				1407	index + 1,
				1408	CHARCLASS)
				1409
				1410	if escape_type == CHAR:
				1411	end = value
				1412
				1413	else:
				1414	raise error, ('illegal escape in character '
				1415	'class range')
				1416	else:
				1417	end = pattern[index]
				1418	index = index + 1
				1419
				1420	if start > end:
				1421	raise error, ('range arguments out of order '
				1422	'in character class')
				1423
				1424	for char in map(chr, range(ord(start), ord(end) + 1)):
				1425	if char not in set:
				1426	set.append(char)
				1427
				1428	last = ''
				1429
				1430	elif next == '\\':
				1431	# expand syntax meta-characters and add to set
				1432	if index >= len(pattern):
				1433	raise error, 'incomplete set'
				1434
				1435	escape_type, value, index = expand_escape(pattern,
				1436	index,
				1437	CHARCLASS)
				1438
				1439	if escape_type == CHAR:
				1440	set.append(value)
				1441	last = value
				1442
				1443	elif escape_type == SET:
				1444	for char in value:
				1445	if char not in set:
				1446	set.append(char)
				1447	last = ''
				1448
				1449	else:
				1450	raise error, 'illegal escape type in character class'
				1451
				1452	else:
				1453	if next not in set:
				1454	set.append(next)
				1455	last = next
				1456
				1457	if (index >= len(pattern)) or ( pattern[index] != ']'):
				1458	raise error, 'incomplete set'
				1459
				1460	index = index + 1
				1461
				1462	if negate:
				1463	# If case is being ignored, then both upper- and lowercase
				1464	# versions of the letters must be excluded.
				1465	if flags & IGNORECASE: set=set+map(string.upper, set)
				1466	notset = []
				1467	for char in map(chr, range(256)):
				1468	if char not in set:
				1469	notset.append(char)
				1470	if len(notset) == 0:
				1471	raise error, 'empty negated set'
				1472	stack.append([Set(notset, flags)])
				1473	else:
				1474	if len(set) == 0:
				1475	raise error, 'empty set'
				1476	stack.append([Set(set, flags)])
				1477
				1478	lastop = '[]'
				1479
				1480	else:
				1481	stack.append([Exact(char, flags)])
				1482	lastop = char
				1483
				1484	code = []
				1485	while len(stack) > 0:
				1486	if stack[-1][0].name == '(':
				1487	raise error, 'too many open parens'
				1488	code = stack[-1] + code
				1489	del stack[-1]
				1490	if len(code) == 0:
				1491	raise error, 'no code generated'
				1492	code = filter(lambda x: x.name != '\|', code)
				1493	need_label = 0
				1494	for i in range(len(code)):
				1495	if (code[i].name == 'jump') and (code[i].label == -1):
				1496	code[i] = Jump(label)
				1497	need_label = 1
				1498	if need_label:
				1499	code.append(Label(label))
				1500	label = label + 1
				1501	code.append(End())
				1502	# print code
				1503	return RegexObject(pattern, flags, code, register, groupindex)
				1504
				1505	# Replace expand_escape and _expand functions with their C equivalents.
				1506	# If you suspect bugs in the C versions, comment out the next two lines
				1507	expand_escape = reop.expand_escape
				1508	_expand = reop._expand