| """Tokenization help for Python programs. |
| |
| This module compiles a regular expression that recognizes Python |
| tokens in individual lines of text. The regular expression handles |
| everything except indentation, continuations, and triple-quoted |
| strings. The function 'tokenize.tokenize()' takes care of these |
| things for streams of text. It accepts a readline-like function which |
| is called repeatedly to come up with the next input line (or "" for |
| EOF), and a "token-eater" function which is called for each token |
| found, passing its type, a string containing the token, the line |
| number, the line, and the starting and ending positions of the token |
| within the line. It is designed to match the working of the Python |
| tokenizer exactly. |
| |
| """ |

__version__ = "Ka-Ping Yee, 4 March 1997, updated by GvR, 10 March 1997"

import string, regex
from token import *

def group(*choices): return '\(' + string.join(choices, '\|') + '\)'

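# Regular expression fragments, written for the 'regex' module's default
# (emacs-style) syntax: \( ... \) groups and \| separates alternatives,
# while bare parentheses and '|' match themselves.  Ignore matches
# whitespace, backslash-newline continuations, and comments; Name matches
# an identifier; the patterns below it match numeric literals.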
Ignore = '[ \f\t]*\([\]\r?\n[ \t]*\)*\(#.*\)?'
Name = '[a-zA-Z_][a-zA-Z0-9_]*'

ImagZero = '0[jJ]'      # This is not caught by any of the following
Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
Octnumber = '0[0-7]*[lL]?'
Decnumber = '[1-9][0-9]*[lLjJ]?'
Intnumber = group(ImagZero, Hexnumber, Octnumber, Decnumber)
Exponent = '[eE][-+]?[0-9]+'
Pointfloat = group('[0-9]+\.[0-9]*', '\.[0-9]+') + group(Exponent) + '?'
Expfloat = '[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat) + "[jJ]?"
Number = group(Floatnumber, Intnumber)

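# String patterns.  Single, Double, Tsingle, and Tdouble locate the closing
# quote of a string continued from an earlier line (a quote at the start of
# the line, or one not preceded by a backslash).  Triple matches a
# triple-quote opener; String matches a string that either closes on the
# same line or ends in a backslash-newline continuation.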
Single = group('^\'', '[^\]\'')
Double = group('^"', '[^\]"')
Tsingle = group('^\'\'\'', '[^\]\'\'\'')
Tdouble = group('^"""', '[^\]"""')
Triple = group('\'\'\'', '"""')
String = group('\'' + group('[\].', '[^\'\]') + '*' + group('\'', '[\]\n'),
               '"' + group('[\].', '[^"\]') + '*' + group('"', '[\]\n'))

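# Operators, brackets, and 'special' tokens (plain or backslash-continued
# line endings and miscellaneous punctuation), combined into Funny.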
Operator = group('\+', '\-', '\*\*', '\*', '\^', '~', '/', '%', '&', '|',
                 '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
Bracket = '[][(){}]'
Special = group('[\]?\r?\n', '[:;.,`\f]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Name, Number, Triple, String, Funny)
Token = Ignore + PlainToken

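# Compile the master pattern and the string-terminator patterns under the
# regex module's default syntax, then restore whatever syntax was in effect
# before.  endprogs maps a string opener to the pattern that finds where
# that kind of string ends.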
try:
    save_syntax = regex.set_syntax(0)   # use default syntax
    tokenprog = regex.compile(Token)
    endprogs = {'\'': regex.compile(Single), '"': regex.compile(Double),
                '\'\'\'': regex.compile(Tsingle), '"""': regex.compile(Tdouble)}
finally:
    regex.set_syntax(save_syntax)       # restore original syntax

tabsize = 8
TokenError = 'TokenError'               # string exception raised on premature EOF
def printtoken(type, string, linenum, line, start, end):   # for testing
    print `linenum` + ':', tok_name[type], repr(string)
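
# tokenize(readline, tokeneater) drives the scanner: it reads one line at a
# time with 'readline', tracks indentation, parenthesis nesting, backslash
# continuations, and strings that span lines, and calls 'tokeneater' once
# for every token found.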

def tokenize(readline, tokeneater = printtoken):
    linenum = parenlev = continued = 0
    namechars, numchars = string.letters + '_', string.digits
    contstr = ''
    indents = [0]
    while 1:                                    # loop over lines in stream
        line = readline()
        linenum = linenum + 1
        if line[-2:] == '\r\n': line = line[:-2] + '\n'
        pos, max = 0, len(line)

        if contstr:                             # continued string
            if not line: raise TokenError, "EOF within multi-line string"
            if contstr[-2:] == '\\\n': contstr = contstr[:-2] + '\n'
            if endprog.search(line) >= 0:
                pos = end = endprog.regs[0][1]
                tokeneater(STRING, contstr + line[:end], linenum, line, 0, 0)
                contstr = ''
            else:
                contstr = contstr + line
                continue

        elif parenlev == 0 and not continued:   # this is a new statement
            if not line: break
            column = 0
            while 1:                            # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1) * tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if line[pos] in '#\n': continue     # skip comments or blank lines

            if column > indents[-1]:            # count indents or dedents
                indents.append(column)
                tokeneater(INDENT, '\t', linenum, line, 0, 0)
            while column < indents[-1]:
                indents = indents[:-1]
                tokeneater(DEDENT, '\t', linenum, line, 0, 0)

        else:                                   # continued statement
            if not line: raise TokenError, "EOF within multi-line statement"
            continued = 0

        while pos < max:
            if tokenprog.match(line, pos) > 0:  # scan for tokens
                start, end = tokenprog.regs[3]
                token = line[start:end]
                pos = end

                if token[0] in namechars:       # ordinary name
                    tokeneater(NAME, token, linenum, line, start, end)
                elif token[0] in numchars:      # ordinary number
                    tokeneater(NUMBER, token, linenum, line, start, end)

                elif token in ('\'\'\'', '"""'):        # triple-quoted
                    endprog = endprogs[token]
                    if endprog.search(line, pos) >= 0:  # all on one line
                        pos = endprog.regs[0][1]
                        token = line[start:pos]
                        tokeneater(STRING, token, linenum, line, start, pos)
                    else:
                        contstr = line[start:]          # multiple lines
                        break
                elif token[0] in '\'"':
                    if token[-1] == '\n':               # continued string
                        endprog, contstr = endprogs[token[0]], line[start:]
                        break
                    else:                               # ordinary string
                        tokeneater(STRING, token, linenum, line, start, end)

                elif token[0] == '\n':
                    tokeneater(NEWLINE, token, linenum, line, start, end)
                elif token[0] == '\\':                  # continued stmt
                    continued = 1

                else:
                    if token[0] in '([{': parenlev = parenlev + 1
                    if token[0] in ')]}': parenlev = parenlev - 1
                    tokeneater(OP, token, linenum, line, start, end)
            else:
                tokeneater(ERRORTOKEN, line[pos], linenum, line, pos, pos + 1)
                pos = pos + 1

    for indent in indents[1:]:                  # pop remaining indent levels
        tokeneater(DEDENT, '\t', linenum, line, 0, 0)

if __name__ == '__main__':                      # testing
    import sys
    file = open(sys.argv[-1])                   # the named file, or this script itself if none given
    tokenize(file.readline)