# This module compiles a regular expression that recognizes Python tokens.
# It is designed to match the workings of the Python tokenizer exactly.
# It takes care of everything except indentation;
# note that un-escaped newlines are tokens, too.
# tokenprog.regs[3] gives the location of the token without leading whitespace.
# It also defines various subexpressions, but doesn't compile them.
# See the function test() below for an example of how to use it.
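#
# A minimal usage sketch (the same steps test() below performs); it assumes
# `line` holds one line of source text and `i` is the offset to scan from:
#
#	j = tokenprog.match(line, i)	# length matched, or -1 if no token here
#	if j >= 0:
#		a, b = tokenprog.regs[3]	# bounds of the token, whitespace excluded
#		token = line[a:b]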
import regex

# Note: to get a quoted backslash in a regexp, it must be quadrupled.
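# For example, the four characters \\\\ in a pattern source string become the
# two characters \\ once Python's string escapes are applied, which the regex
# engine then reads as a single escaped (literal) backslash.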
Ignore = '[ \t]*\(\\\\\n[ \t]*\)*\(#.*\)?'

Name = '[a-zA-Z_][a-zA-Z0-9_]*'

Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
Octnumber = '0[0-7]*[lL]?'
Decnumber = '[1-9][0-9]*[lL]?'
Intnumber = Hexnumber + '\|' + Octnumber + '\|' + Decnumber
Exponent = '[eE][-+]?[0-9]+'
Pointfloat = '\([0-9]+\.[0-9]*\|\.[0-9]+\)\(' + Exponent + '\)?'
Expfloat = '[0-9]+' + Exponent
Floatnumber = Pointfloat + '\|' + Expfloat
Number = Floatnumber + '\|' + Intnumber
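# Illustrative literals these patterns are intended to accept: 0xFF and 0x1fL
# (Hexnumber), 0 and 0777L (Octnumber), 42 and 1000000L (Decnumber), and
# 3.14, .5, 10. and 1e-3 (Floatnumber).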
String = '\'\(\\\\.\|[^\\\n\']\)*\'' + '\|' + '"\(\\\\.\|[^\\\n"]\)*"'
# Note: this module *recognizes* double quotes, but for backward
# compatibility, it doesn't *use* them!
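# Illustrative matches for String: 'spam', 'don\'t' and "say \"hi\"";
# an un-escaped newline inside the quotes is not allowed.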
Operator = '~\|\+\|-\|\*\|/\|%\|\^\|&\||\|<<\|>>\|==\|<=\|<>\|!=\|>=\|=\|<\|>'
Bracket = '[][(){}]'
Special = '[:;.,`\n]'
Funny = Operator + '\|' + Bracket + '\|' + Special

PlainToken = Name + '\|' + Number + '\|' + String + '\|' + Funny

Token = Ignore + '\(' + PlainToken + '\)'
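# Group numbering in the compiled Token pattern: groups 1 and 2 come from
# Ignore (the line-continuation and comment subexpressions), and group 3 is
# the \(...\) wrapped around PlainToken here, which is why tokenprog.regs[3]
# gives the bounds of the token with the leading whitespace stripped off.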
try:
	save_syntax = regex.set_syntax(0) # Use default syntax
	tokenprog = regex.compile(Token)
finally:
	if save_syntax != 0:
		dummy = regex.set_syntax(save_syntax) # Restore original syntax
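# In the default syntax selected above, \( \) delimit groups and \| marks
# alternation, which is what every pattern in this module relies on.  Under
# the later re module those operators are written unescaped, so a port would
# rewrite each pattern; e.g. Ignore would read (an untested sketch only):
#
#	Ignore = r'[ \t]*(\\\n[ \t]*)*(#.*)?'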
def test(file):
	f = open(file, 'r')
	while 1:
		line = f.readline()
		if not line: break
		i, n = 0, len(line)
		while i < n:
			j = tokenprog.match(line, i)	# length matched, including leading whitespace
			if j < 0:
				print 'No token at', `line[i:i+20]` + '...'
				i = i+1
			else:
				i = i+j
				a, b = tokenprog.regs[3]	# bounds of the token proper
				if a < b:
					print 'Token:', `line[a:b]`
	f.close()