Blame - Lib/tokenize.py - platform/external/python/cpython2

blob: 6b3d991a960c570ef278ebbb89a72cbaaca44275 [file] [log] [blame]

Guido van Rossum	4d8e859	1992-01-01 19:34:47 +0000	[diff] [blame^]	1	# This module compiles a regular expression that recognizes Python tokens.
				2	# It is designed to match the working of the Python tokenizer exactly.
				3	# It takes care of everything except indentation;
				4	# note that un-escaped newlines are tokens, too.
				5	# tokenprog.regs[3] gives the location of the token without whitespace.
				6	# It also defines various subexpressions, but doesn't compile them.
				7	# See the function test() below for an example of how to use it.
				8
				9	import regex
				10
				11	# Note: to get a quoted backslash in a regexp, it must be quadrupled.
				12
				13	Ignore = '[ \t]\(\\\\\n[ \t]\)\(#.\)?'
				14
				15	Name = '[a-zA-Z_][a-zA-Z0-9_]*'
				16
				17	Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
				18	Octnumber = '0[0-7]*[lL]?'
				19	Decnumber = '[1-9][0-9]*[lL]?'
				20	Intnumber = Hexnumber + '\\|' + Octnumber + '\\|' + Decnumber
				21	Exponent = '[eE][-+]?[0-9]+'
				22	Pointfloat = '\([0-9]+\.[0-9]*\\|\.[0-9]+\)\(' + Exponent + '\)?'
				23	Expfloat = '[0-9]+' + Exponent
				24	Floatnumber = Pointfloat + '\\|' + Expfloat
				25	Number = Intnumber + '\\|' + Floatnumber
				26
				27	String = '\'\(\\\\.\\|[^\\\n\']\)*\''
				28
				29	Operator = '~\\|\+\\|-\\|\*\\|/\\|%\\|\^\\|&\\|\|\\|<<\\|>>\\|==\\|<=\\|<>\\|!=\\|>=\\|=\\|<\\|>'
				30	Bracket = '[][(){}]'
				31	Special = '[:;.,`\n]'
				32	Funny = Operator + '\\|' + Bracket + '\\|' + Special
				33
				34	PlainToken = Name + '\\|' + Number + '\\|' + String + '\\|' + Funny
				35
				36	Token = Ignore + '\(' + PlainToken + '\)'
				37
				38	try:
				39	save_syntax = regex.set_syntax(0) # Use default syntax
				40	tokenprog = regex.compile(Token)
				41	finally:
				42	dummy = regex.set_syntax(save_syntax) # Restore original syntax
				43
				44
				45	def test(file):
				46	f = open(file, 'r')
				47	while 1:
				48	line = f.readline()
				49	if not line: break
				50	i, n = 0, len(line)
				51	while i < n:
				52	j = tokenprog.match(line, i)
				53	if j < 0:
				54	print 'No token at', `line[i:i+20]` + '...'
				55	i = i+1
				56	else:
				57	i = i+j
				58	a, b = tokenprog.regs[3]
				59	if a < b:
				60	print 'Token:', `line[a:b]`