Lib/tokenize.py - platform/external/python/cpython2 - Gitiles

 # This module compiles a regular expression that recognizes Python tokens.
 # It is designed to match the working of the Python tokenizer exactly.
 # It takes care of everything except indentation;
 # note that un-escaped newlines are tokens, too.
 # tokenprog.regs[3] gives the location of the token without whitespace.
 # It also defines various subexpressions, but doesn't compile them.
 # See the function test() below for an example of how to use it.

 import regex

 # Note: to get a quoted backslash in a regexp, it must be quadrupled.
 # The composite patterns below are assembled by substituting the
 # simpler patterns into '\|' (alternation) templates.

 # Material skipped in front of a token: blanks, backslash-newline
 # continuations and an optional comment.  It holds groups 1 and 2.
 Ignore = '[ \t]*\(\\\\\n[ \t]*\)*\(#.*\)?'

 # Identifiers and keywords.
 Name = '[a-zA-Z_][a-zA-Z0-9_]*'

 # Integer literals, each with an optional long suffix.
 Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
 Octnumber = '0[0-7]*[lL]?'
 Decnumber = '[1-9][0-9]*[lL]?'
 Intnumber = '%s\|%s\|%s' % (Hexnumber, Octnumber, Decnumber)

 # Floating point literals; floats are tried before integers.
 Exponent = '[eE][-+]?[0-9]+'
 Pointfloat = '\([0-9]+\.[0-9]*\|\.[0-9]+\)\(%s\)?' % Exponent
 Expfloat = '[0-9]+%s' % Exponent
 Floatnumber = '%s\|%s' % (Pointfloat, Expfloat)
 Number = '%s\|%s' % (Floatnumber, Intnumber)

 # Single-quoted or double-quoted string on one line; a backslash
 # escapes the character that follows it.
 String = '%s\|%s' % (
 	'\'\(\\\\.\|[^\\\n\']\)*\'',
 	'"\(\\\\.\|[^\\\n"]\)*"')
 # Note: this module *recognizes* double quotes, but for backward
 # compatibility, it doesn't *use* them!

 # Operators (two-character ones listed before their one-character
 # prefixes), brackets, and the remaining punctuation incl. newline.
 Operator = '~\|\+\|-\|\*\|/\|%\|\^\|&\||\|<<\|>>\|==\|<=\|<>\|!=\|>=\|=\|<\|>'
 Bracket = '[][(){}]'
 Special = '[:;.,`\n]'
 Funny = '%s\|%s\|%s' % (Operator, Bracket, Special)

 PlainToken = '%s\|%s\|%s\|%s' % (Name, Number, String, Funny)

 # The token proper is wrapped in its own group, which follows the two
 # groups opened inside Ignore.
 Token = '%s\(%s\)' % (Ignore, PlainToken)

 # Compile Token under the default regex syntax, then put back the
 # syntax that was in effect before (if it was a different one).
 # The syntax switch is done before entering the try block so that
 # save_syntax is always bound when the finally clause runs; with the
 # call inside the try, a failure in set_syntax() would be masked by
 # a NameError raised from the finally clause.
 save_syntax = regex.set_syntax(0) # Use default syntax
 try:
 	tokenprog = regex.compile(Token)
 finally:
 	if save_syntax != 0:
 		# The return value is bound to a throwaway name.
 		dummy = regex.set_syntax(save_syntax) # Restore original syntax


 def test(file):
 	f = open(file, 'r')
 	while 1:
 		line = f.readline()
 		if not line: break
 		i, n = 0, len(line)
 		while i < n:
 			j = tokenprog.match(line, i)
 			if j < 0:
 				print 'No token at', `line[i:i+20]` + '...'
 				i = i+1
 			else:
 				i = i+j
 				a, b = tokenprog.regs[3]
 				if a < b:
 					print 'Token:', `line[a:b]`
	# This module compiles a regular expression that recognizes Python tokens.
	# It is designed to match the working of the Python tokenizer exactly.
	# It takes care of everything except indentation;
	# note that un-escaped newlines are tokens, too.
	# tokenprog.regs[3] gives the location of the token without whitespace.
	# It also defines various subexpressions, but doesn't compile them.
	# See the function test() below for an example of how to use it.

	import regex

	# Note: to get a quoted backslash in a regexp, it must be quadrupled.

	# Material skipped in front of a token: any number of blanks, any
	# number of backslash-newline continuations, and an optional
	# comment.  It holds groups 1 and 2.
	Ignore = '[ \t]*\(\\\\\n[ \t]*\)*\(#.*\)?'

	# Identifiers and keywords.
	Name = '[a-zA-Z_][a-zA-Z0-9_]*'

	# Integer literals, each with an optional long suffix.
	Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
	Octnumber = '0[0-7]*[lL]?'
	Decnumber = '[1-9][0-9]*[lL]?'
	Intnumber = Hexnumber + '\\|' + Octnumber + '\\|' + Decnumber
	# Floating point literals; floats are tried before integers.
	Exponent = '[eE][-+]?[0-9]+'
	Pointfloat = '\([0-9]+\.[0-9]*\\|\.[0-9]+\)\(' + Exponent + '\)?'
	Expfloat = '[0-9]+' + Exponent
	Floatnumber = Pointfloat + '\\|' + Expfloat
	Number = Floatnumber + '\\|' + Intnumber

	# Single-quoted or double-quoted string on one line.  The body is
	# repeated zero or more times; a backslash escapes the character
	# that follows it.
	String = '\'\(\\\\.\\|[^\\\n\']\)*\'' + '\\|' + '"\(\\\\.\\|[^\\\n"]\)*"'
	# Note: this module *recognizes* double quotes, but for backward
	# compatibility, it doesn't *use* them!

	# Operators; two-character ones are listed before their
	# one-character prefixes.  The bare | between & and << is the
	# bitwise-or operator itself, not an alternation marker.
	Operator = '~\\|\+\\|-\\|\*\\|/\\|%\\|\^\\|&\\||\\|<<\\|>>\\|==\\|<=\\|<>\\|!=\\|>=\\|=\\|<\\|>'
	Bracket = '[][(){}]'
	Special = '[:;.,`\n]'
	Funny = Operator + '\\|' + Bracket + '\\|' + Special

	PlainToken = Name + '\\|' + Number + '\\|' + String + '\\|' + Funny

	# The token proper is wrapped in its own group, which follows the
	# two groups opened inside Ignore.
	Token = Ignore + '\(' + PlainToken + '\)'

	# Compile Token under the default regex syntax, then put back the
	# syntax that was in effect before (if it was a different one).
	# The syntax switch is done before entering the try block so that
	# save_syntax is always bound when the finally clause runs; with the
	# call inside the try, a failure in set_syntax() would be masked by
	# a NameError raised from the finally clause.
	save_syntax = regex.set_syntax(0) # Use default syntax
	try:
		tokenprog = regex.compile(Token)
	finally:
		if save_syntax != 0:
			# The return value is bound to a throwaway name.
			dummy = regex.set_syntax(save_syntax) # Restore original syntax


	def test(file):
	f = open(file, 'r')
	while 1:
	line = f.readline()
	if not line: break
	i, n = 0, len(line)
	while i < n:
	j = tokenprog.match(line, i)
	if j < 0:
	print 'No token at', `line[i:i+20]` + '...'
	i = i+1
	else:
	i = i+j
	a, b = tokenprog.regs[3]
	if a < b:
	print 'Token:', `line[a:b]`