Jeffrey's next installment

commit: a4f1a78b6eaed4d5d2c609d2e3160ec64535d22a [log] [tgz]
author: Guido van Rossum <guido@python.org> Thu Jul 17 22:38:10 1997 +0000
committer: Guido van Rossum <guido@python.org> Thu Jul 17 22:38:10 1997 +0000
tree: 64c0aae44cba4c7f03b7d8fbbad8760d75736df2
parent: 32d6f3c0ce22c07d512c70f34af83403acb75068 [diff] [blame]
diff --git a/Lib/re.py b/Lib/re.py
index 7ff53ca..8509b5f 100644
--- a/Lib/re.py
+++ b/Lib/re.py

@@ -15,30 +15,8 @@
 DOTALL = S = 0x04
 VERBOSE = X = 0x08
 
-#
-# Initialize syntax table. This information should really come from the
-# syntax table in regexpr.c rather than being duplicated here.
-#
-
-syntax_table = {}
-
-for char in map(chr, range(0, 256)):
-    syntax_table[char] = []
-
-for char in string.lowercase:
-    syntax_table[char].append('word')
-
-for char in string.uppercase:
-    syntax_table[char].append('word')
-
-for char in string.digits:
-    syntax_table[char].append('word')
-    syntax_table[char].append('digit')
-
-for char in string.whitespace:
-    syntax_table[char].append('whitespace')
-
-syntax_table['_'].append('word')
+repetition_operators = ['*', '*?', '+', '+?', '?', '??', '{n}', '{n}?',
+			'{n,}', '{n,}?', '{n,m}', '{n,m}?']
 
 #
 #
@@ -47,10 +25,11 @@
 def valid_identifier(id):
     if len(id) == 0:
 	return 0
-    if ('word' not in syntax_table[id[0]]) or ('digit' in syntax_table[id[0]]):
+    if (not reop.syntax_table[id[0]] & reop.word) or \
+       (reop.syntax_table[id[0]] & reop.digit):
 	return 0
     for char in id[1:]:
-	if 'word' not in syntax_table[char]:
+	if not reop.syntax_table[char] & reop.word:
 	    return 0
     return 1
 
@@ -394,7 +373,6 @@
 	self.syntax = syntax
 	Instruction.__init__(self, chr(20), 2)
     def assemble(self, postition, labels):
-	# XXX
 	return self.opcode + chr(self.syntax)
 
 class NotSyntaxSpec(Instruction):
@@ -403,7 +381,6 @@
 	self.syntax = syntax
 	Instruction.__init__(self, chr(21), 2)
     def assemble(self, postition, labels):
-	# XXX
 	return self.opcode + chr(self.syntax)
 
 class Label(Instruction):
@@ -455,7 +432,7 @@
 def escape(pattern):
     result = []
     for char in pattern:
-	if 'word' not in syntax_table[char]:
+	if not reop.syntax_table[char] & reop.word:
 	    result.append('\\')
 	result.append(char)
     return string.join(result, '')
@@ -513,12 +490,12 @@
 	    return
 	elif instruction.name == 'syntaxspec':
 	    for char in map(chr, range(256)):
-		if instruction.syntax in syntax_table[char]:
+		if reop.syntax_table[char] & instruction.syntax:
 		    fastmap.add(char)
 	    return
 	elif instruction.name == 'notsyntaxspec':
 	    for char in map(chr, range(256)):
-		if instruction.syntax not in syntax_table[char]:
+		if not reop.syntax_table[char] & instruction.syntax:
 		    fastmap.add(char)
 	    return
 	elif instruction.name == 'eol':
@@ -570,8 +547,8 @@
 #
 
 [NORMAL, CHARCLASS, REPLACEMENT] = range(3)
-[CHAR, MEMORY_REFERENCE, SYNTAX, SET, WORD_BOUNDARY, NOT_WORD_BOUNDARY,
- BEGINNING_OF_BUFFER, END_OF_BUFFER] = range(8)
+[CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET, WORD_BOUNDARY,
+ NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER] = range(9)
 
 def expand_escape(pattern, index, context=NORMAL):
     if index >= len(pattern):
@@ -646,11 +623,11 @@
     
     elif pattern[index] == 'w':
 	if context == NORMAL:
-	    return SYNTAX, 'word', index + 1
+	    return SYNTAX, reop.word, index + 1
 	elif context == CHARCLASS:
 	    set = []
-	    for char in syntax_table.keys():
-		if 'word' in syntax_table[char]:
+	    for char in reop.syntax_table.keys():
+		if reop.syntax_table[char] & reop.word:
 		    set.append(char)
 	    return SET, set, index + 1
 	else:
@@ -658,11 +635,11 @@
 	
     elif pattern[index] == 'W':
 	if context == NORMAL:
-	    return NOT_SYNTAX, 'word', index + 1
+	    return NOT_SYNTAX, reop.word, index + 1
 	elif context == CHARCLASS:
 	    set = []
-	    for char in syntax_table.keys():
-		if 'word' not in syntax_table[char]:
+	    for char in reop.syntax_table.keys():
+		if not reop.syntax_table[char] & reop.word:
 		    set.append(char)
 	    return SET, set, index + 1
 	else:
@@ -670,11 +647,11 @@
 	
     elif pattern[index] == 's':
 	if context == NORMAL:
-	    return SYNTAX, 'whitespace', index + 1
+	    return SYNTAX, reop.whitespace, index + 1
 	elif context == CHARCLASS:
 	    set = []
-	    for char in syntax_table.keys():
-		if 'whitespace' in syntax_table[char]:
+	    for char in reop.syntax_table.keys():
+		if reop.syntax_table[char] & reop.whitespace:
 		    set.append(char)
 	    return SET, set, index + 1
 	else:
@@ -682,11 +659,11 @@
 	
     elif pattern[index] == 'S':
 	if context == NORMAL:
-	    return NOT_SYNTAX, 'whitespace', index + 1
+	    return NOT_SYNTAX, reop.whitespace, index + 1
 	elif context == CHARCLASS:
 	    set = []
-	    for char in syntax_table.keys():
-		if 'whitespace' not in syntax_table[char]:
+	    for char in reop.syntax_table.keys():
+		if not reop.syntax_table[char] & reop.whitespace:
 		    set.append(char)
 	    return SET, set, index + 1
 	else:
@@ -694,11 +671,11 @@
 	
     elif pattern[index] == 'd':
 	if context == NORMAL:
-	    return SYNTAX, 'digit', index + 1
+	    return SYNTAX, reop.digit, index + 1
 	elif context == CHARCLASS:
 	    set = []
-	    for char in syntax_table.keys():
-		if 'digit' in syntax_table[char]:
+	    for char in reop.syntax_table.keys():
+		if reop.syntax_table[char] & reop.digit:
 		    set.append(char)
 	    return SET, set, index + 1
 	else:
@@ -706,11 +683,11 @@
 	
     elif pattern[index] == 'D':
 	if context == NORMAL:
-	    return NOT_SYNTAX, 'digit', index + 1
+	    return NOT_SYNTAX, reop.digit, index + 1
 	elif context == CHARCLASS:
 	    set = []
-	    for char in syntax_table.keys():
-		if 'digit' not in syntax_table[char]:
+	    for char in reop.syntax_table.keys():
+		if not reop.syntax_table[char] & reop.digit:
 		    set.append(char)
 	    return SET, set, index + 1
 	else:
@@ -783,29 +760,31 @@
     register = 1
     groupindex = {}
     callouts = []
+    lastop = ''
 
-    # preprocess the pattern looking for embedded pattern modifiers
+    # look for embedded pattern modifiers at the beginning of the pattern
 
     index = 0
-    while (index != -1):
-	index = string.find(pattern, '(?', index)
-	if index != -1:
-	    index = index + 2
-	    if (index < len(pattern)) and (pattern[index] in 'iImMsSxX'):
-		while (index < len(pattern)) and (pattern[index] != ')'):
-		    if pattern[index] in 'iI':
-			flags = flags | IGNORECASE
-		    elif pattern[index] in 'mM':
-			flags = flags | MULTILINE
-		    elif pattern[index] in 'sS':
-			flags = flags | DOTALL
-		    elif pattern[index] in 'xX':
-			flags = flags | VERBOSE
-		    else:
-			raise error, 'unknown flag'
-		    index = index + 1
 
-    index = 0
+    if len(pattern) >= 3 and \
+       (pattern[:2] == '(?') and \
+       (pattern[2] in 'iImMsSxX'):
+	index = 2
+	while (index < len(pattern)) and (pattern[index] != ')'):
+	    if pattern[index] in 'iI':
+		flags = flags | IGNORECASE
+	    elif pattern[index] in 'mM':
+		flags = flags | MULTILINE
+	    elif pattern[index] in 'sS':
+		flags = flags | DOTALL
+	    elif pattern[index] in 'xX':
+		flags = flags | VERBOSE
+	    else:
+		raise error, 'unknown modifier'
+	    index = index + 1
+	index = index + 1
+
+    # compile the rest of the pattern
     
     while (index < len(pattern)):
 	char = pattern[index]
@@ -815,30 +794,52 @@
 
 	    if escape_type == CHAR:
 		stack.append([Exact(value)])
+		lastop = '\\' + value
 		
 	    elif escape_type == MEMORY_REFERENCE:
 		if value >= register:
 		    raise error, ('cannot reference a register '
 				  'not yet used')
 		stack.append([MatchMemory(value)])
+		lastop = '\\1'
 		
 	    elif escape_type == BEGINNING_OF_BUFFER:
 		stack.append([BegBuf()])
+		lastop = '\\A'
 		
 	    elif escape_type == END_OF_BUFFER:
 		stack.append([EndBuf()])
+		lastop = '\\Z'
 		
 	    elif escape_type == WORD_BOUNDARY:
 		stack.append([WordBound()])
+		lastop = '\\b'
 		
 	    elif escape_type == NOT_WORD_BOUNDARY:
 		stack.append([NotWordBound()])
+		lastop = '\\B'
 		
 	    elif escape_type == SYNTAX:
 		stack.append([SyntaxSpec(value)])
-		
+		if value == reop.word:
+		    lastop = '\\w'
+		elif value == reop.whitespace:
+		    lastop = '\\s'
+		elif value == reop.digit:
+		    lastop = '\\d'
+		else:
+		    lastop = '\\?'
+		    
 	    elif escape_type == NOT_SYNTAX:
 		stack.append([NotSyntaxSpec(value)])
+		if value == reop.word:
+		    lastop = '\\W'
+		elif value == reop.whitespace:
+		    lastop = '\\S'
+		elif value == reop.digit:
+		    lastop = '\\D'
+		else:
+		    lastop = '\\?'
 		
 	    elif escape_type == SET:
 		raise error, 'cannot use set escape type here'
@@ -860,7 +861,8 @@
 			  Label(label)])
 	    stack.append([Alternation()])
 	    label = label + 1
-
+	    lastop = '|'
+	    
 	elif char == '(':
 	    if index >= len(pattern):
 		raise error, 'no matching close paren'
@@ -891,6 +893,7 @@
 			groupindex[name] = register
 			stack.append([OpenParen(register)])
 			register = register + 1
+			lastop = '('
 
 		    elif pattern[index] == '=':
 			# backreference to symbolic group name
@@ -906,6 +909,7 @@
 					  ' has not been used yet')
 			stack.append([MatchMemory(groupindex[name])])
 			index = end + 1
+			lastop = '(?P=)'
 			
 		    elif pattern[index] == '!':
 			# function callout
@@ -920,7 +924,8 @@
 			    raise error, ('function callout name not listed '
 					  'in callouts dict')
 			stack.append([FunctionCallout(name)])
-
+			lastop = '(?P!)'
+			
 		    else:
 			raise error, ('unknown Python extension: ' + \
 				      pattern[index])
@@ -929,7 +934,8 @@
 		    # grouping, but no registers
 		    index = index + 1
 		    stack.append([OpenParen(-1)])
-
+		    lastop = '('
+		    
 		elif pattern[index] == '#':
 		    # comment
 		    index = index + 1
@@ -937,7 +943,8 @@
 		    if end == -1:
 			raise error, 'no end to comment'
 		    index = end + 1
-
+		    # do not change lastop
+		    
 		elif pattern[index] == '=':
 		    raise error, ('zero-width positive lookahead '
 				  'assertion is unsupported')
@@ -947,19 +954,16 @@
 				  'assertion is unsupported')
 
 		elif pattern[index] in 'iImMsSxX':
-		    # ignore embedded pattern modifiers here, they
-		    # have already been taken care of in the
-		    # preprocessing
-		    while (index < len(pattern)) and (pattern[index] != ')'):
-			index = index + 1
-		    index = index + 1
-		    
+		    raise error, ('embedded pattern modifiers are only '
+				  'allowed at the beginning of the pattern')
+
 		else:
 		    raise error, 'unknown extension'
 
 	    else:
 		stack.append([OpenParen(register)])
 		register = register + 1
+		lastop = '('
 
 	elif char == ')':
 	    # make one expression out of everything on the stack up to
@@ -991,6 +995,7 @@
 		       [EndMemory(stack[-1][0].register)]
 	    del stack[-1]
 	    stack.append(expr)
+	    lastop = ')'
 
 	elif char == '{':
 	    if len(stack) == 0:
@@ -1024,6 +1029,10 @@
 		    count = count - 1
 		del stack[-1]
 		stack.append(expr)
+		if minimal:
+		    lastop = '{n}?'
+		else:
+		    lastop = '{n}'
 
 	    elif len(fields) == 2:
 		# {n,} or {n,m}
@@ -1048,6 +1057,7 @@
 				stack[-1] + \
 				[Label(label + 1),
 				 FailureJump(label)])
+			lastop = '{n,}?'
 		    else:
 			expr = expr + \
 			       ([Label(label),
@@ -1055,6 +1065,7 @@
 				stack[-1] +
 				[StarJump(label),
 				 Label(label + 1)])
+			lastop = '{n,}'
 
 		    del stack[-1]
 		    stack.append(expr)
@@ -1099,6 +1110,7 @@
 			    label = label + 2
 			del stack[-1]
 			stack.append(expr)
+			lastop = '{n,m}?'
 		    else:
 			while max > 0:
 			    expr = expr + \
@@ -1108,11 +1120,11 @@
 			del stack[-1]
 			stack.append(expr + [Label(label)])
 			label = label + 1
+			lastop = '{n,m}'
 
 	    else:
 		raise error, ('there need to be one or two fields '
 			      'in a {} expression')
-	    index = end + 1
 
 	elif char == '}':
 	    raise error, 'unbalanced close curly brace'
@@ -1121,9 +1133,13 @@
 	    # Kleene closure
 	    if len(stack) == 0:
 		raise error, '* needs something to repeat'
-	    if (stack[-1][0].name == '(') or (stack[-1][0].name == '|'):
+
+	    if lastop in ['(', '|']:
 		raise error, '* needs something to repeat'
-	    registers = registers_used(stack[-1])
+
+	    if lastop in repetition_operators:
+		raise error, 'nested repetition operators'
+	    
 	    if (index < len(pattern)) and (pattern[index] == '?'):
 		# non-greedy matching
 		expr = [Jump(label + 1),
@@ -1132,6 +1148,7 @@
 		       [Label(label + 1),
 			FailureJump(label)]
 		index = index + 1
+		lastop = '*?'
 	    else:
 		# greedy matching
 		expr = [Label(label),
@@ -1139,6 +1156,7 @@
 		       stack[-1] + \
 		       [StarJump(label),
 			Label(label + 1)]
+		lastop = '*'
 	    del stack[-1]
 	    stack.append(expr)
 	    label = label + 2
@@ -1148,9 +1166,12 @@
 	    if len(stack) == 0:
 		raise error, '+ needs something to repeat'
 	    
-	    if (stack[-1][0].name == '(') or (stack[-1][0].name == '|'):
+	    if lastop in ['(', '|']:
 		raise error, '+ needs something to repeat'
-	    
+
+	    if lastop in repetition_operators:
+		raise error, 'nested repetition operators'
+
 	    if (index < len(pattern)) and (pattern[index] == '?'):
 		# non-greedy
 		expr = [Label(label)] + \
@@ -1158,6 +1179,8 @@
 		       [FailureJump(label)]
 		label = label + 1
 		index = index + 1
+		lastop = '+?'
+		
 	    else:
 		# greedy
 		expr = [DummyFailureJump(label + 1),
@@ -1168,12 +1191,24 @@
 		       [StarJump(label),
 			Label(label + 2)]
 		label = label + 3
+		lastop = '+'
+		
 	    del stack[-1]
 	    stack.append(expr)
 
 	elif char == '?':
 	    if len(stack) == 0:
 		raise error, 'need something to be optional'
+	    
+	    if len(stack) == 0:
+		raise error, '? needs something to repeat'
+	    
+	    if lastop in ['(', '|']:
+		raise error, '? needs something to repeat'
+
+	    if lastop in repetition_operators:
+		raise error, 'nested repetition operators'
+
 	    if (index < len(pattern)) and (pattern[index] == '?'):
 		# non-greedy matching
 		expr = [FailureJump(label),
@@ -1183,12 +1218,16 @@
 		       [Label(label + 1)]
 		label = label + 2
 		index = index + 1
+		lastop = '??'
+		
 	    else:
 		# greedy matching
 		expr = [FailureJump(label)] + \
 		       stack[-1] + \
 		       [Label(label)]
 		label = label + 1
+		lastop = '?'
+		
 	    del stack[-1]
 	    stack.append(expr)
 
@@ -1197,18 +1236,21 @@
 		stack.append([Set(map(chr, range(256)))])
 	    else:
 		stack.append([AnyChar()])
+	    lastop = '.'
 
 	elif char == '^':
 	    if flags & MULTILINE:
 		stack.append([Bol()])
 	    else:
 		stack.append([BegBuf()])
+	    lastop = '^'
 
 	elif char == '$':
 	    if flags & MULTILINE:
 		stack.append([Eol()])
 	    else:
 		stack.append([EndBuf()])
+	    lastop = '$'
 
 	elif char == '#':
 	    if flags & VERBOSE:
@@ -1219,12 +1261,15 @@
 		    index = len(pattern)
 		else:
 		    index = end + 1
+		# do not change lastop
 	    else:
 		stack.append([Exact(char)])
+		lastop = '#'
 
 	elif char in string.whitespace:
 	    if not (flags & VERBOSE):
 		stack.append([Exact(char)])
+		lastop = char
 
 	elif char == '[':
 	    # compile character class
@@ -1343,8 +1388,11 @@
 		    raise error, 'empty set'
 		stack.append([Set(set)])
 
+	    lastop = '[]'
+
 	else:
 	    stack.append([Exact(char)])
+	    lastop = char
 
     code = []
     while len(stack) > 0:
commit	a4f1a78b6eaed4d5d2c609d2e3160ec64535d22a	[log] [tgz]
author	Guido van Rossum <guido@python.org>	Thu Jul 17 22:38:10 1997 +0000
committer	Guido van Rossum <guido@python.org>	Thu Jul 17 22:38:10 1997 +0000
tree	64c0aae44cba4c7f03b7d8fbbad8760d75736df2
parent	32d6f3c0ce22c07d512c70f34af83403acb75068 [diff] [blame]