Lib/tokenize.py - platform/external/python/cpython2 - Gitiles

 """Tokenization help for Python programs.

 This module exports a function called 'tokenize()' that breaks a stream of
 text into Python tokens.  It accepts a readline-like method which is called
 repeatedly to get the next line of input (or "" for EOF) and a "token-eater"
 function which is called once for each token found.  The latter function is
 passed the token type, a string containing the token, the starting and
 ending (row, column) coordinates of the token, and the original line.  It is
 designed to match the working of the Python tokenizer exactly, except that
 it produces COMMENT tokens for comments and gives type OP for all operators."""

 __version__ = "Ka-Ping Yee, 26 October 1997; patched, GvR 3/30/98"

 import string, re
 from token import *

 # Two extra token types defined by this module.  They are numbered N_TOKENS
 # and N_TOKENS + 1 and registered in tok_name so they can be printed by name.
 COMMENT = N_TOKENS                 # passed to the token-eater for '#' comments
 tok_name[COMMENT] = 'COMMENT'
 NL = N_TOKENS + 1                  # blank lines, and newlines inside brackets
 tok_name[NL] = 'NL'


 # Changes from 1.3:
 #     Ignore now accepts \f as whitespace.  Operator now includes '**'.
 #     Ignore and Special now accept \n or \r\n at the end of a line.
 #     Imagnumber is new.  Expfloat is corrected to reject '0e4'.
 # Note: to quote a backslash in a regex, it must be doubled in a r'aw' string.

 def group(*choices):
     """Return a regex source string matching any one of *choices*.

     The alternatives are joined with '|' and wrapped in one capturing group.
     """
     # The str.join method replaces the deprecated string.join() function.
     return '(' + '|'.join(choices) + ')'
 def any(*choices):
     """Return a regex matching zero or more repetitions of any of *choices*."""
     # Star-args call syntax replaces the deprecated apply() built-in.
     return group(*choices) + '*'
 def maybe(*choices):
     """Return a regex matching at most one occurrence of any of *choices*."""
     # Star-args call syntax replaces the deprecated apply() built-in.
     return group(*choices) + '?'

 # Regex source fragments; the token patterns below are assembled from these.
 Whitespace = r'[ \f\t]*'            # spaces, form feeds and tabs (possibly none)
 Comment = r'#[^\r\n]*'              # '#' up to, but not including, the line end
 # Leading whitespace, any number of backslash-newline continuations, then an
 # optional comment.
 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
 Name = r'[a-zA-Z_]\w*'

 # Integer literals take an optional l/L suffix.  The hex digit run may be
 # empty, and Octnumber is also what matches a plain '0'.
 Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
 Octnumber = r'0[0-7]*[lL]?'
 Decnumber = r'[1-9]\d*[lL]?'
 Intnumber = group(Hexnumber, Octnumber, Decnumber)
 Exponent = r'[eE][-+]?\d+'
 Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
 # The mantissa must start with a nonzero digit, so '0e4' is rejected.
 Expfloat = r'[1-9]\d*' + Exponent
 Floatnumber = group(Pointfloat, Expfloat)
 Imagnumber = group(r'0[jJ]', r'[1-9]\d*[jJ]', Floatnumber + r'[jJ]')
 # Alternatives are listed imaginary, then float, then integer.
 Number = group(Imagnumber, Floatnumber, Intnumber)

 # The four "tail end" patterns match the remainder of a string after its
 # opening quote, up to and including the closing quote; a backslash escapes
 # the character that follows it.
 # Tail end of ' string.
 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
 # Tail end of " string.
 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
 # Tail end of ''' string.
 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
 # Tail end of """ string.
 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
 # Opening of a triple-quoted string, with optional u/U and r/R prefixes.
 Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
 # Single-line ' or " string.
 String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

 # Because of leftmost-then-longest match semantics, be sure to put the
 # longest operators first (e.g., if = came before ==, == would get
 # recognized as two instances of =).
 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                  r"[+\-*/%&|^=<>]=?",
                  r"~")

 # A character class: ']' comes first and '[' second, so both are literal.
 Bracket = '[][(){}]'
 # Line ends and the remaining single-character punctuation.
 Special = group(r'\r?\n', r'[:;.,`]')
 Funny = group(Operator, Bracket, Special)

 # A complete token preceded by ignorable text.  This is compiled as
 # tokenprog further down; tokenize() itself matches with pseudoprog.
 PlainToken = group(Number, Funny, String, Name)
 Token = Ignore + PlainToken

 # First (or only) line of ' or " string.
 # It ends either at the closing quote or at a backslash-newline continuation.
 ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                 group("'", r'\\\r?\n'),
                 r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                 group('"', r'\\\r?\n'))
 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
 # Group 1 of PseudoToken is the token text without its leading whitespace.
 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

 tokenprog, pseudoprog, single3prog, double3prog = map(
     re.compile, (Token, PseudoToken, Single3, Double3))
 # Maps a string opener to the compiled pattern for the rest of that string.
 # The bare prefix letters map to None so that tokenize() can chain lookups on
 # the first characters of a token with 'or' until it reaches the quote.
 endprogs = {"'": re.compile(Single), '"': re.compile(Double),
             "'''": single3prog, '"""': double3prog,
             "r'''": single3prog, 'r"""': double3prog,
             "u'''": single3prog, 'u"""': double3prog,
             "ur'''": single3prog, 'ur"""': double3prog,
             "R'''": single3prog, 'R"""': double3prog,
             "U'''": single3prog, 'U"""': double3prog,
             "uR'''": single3prog, 'uR"""': double3prog,
             "Ur'''": single3prog, 'Ur"""': double3prog,
             "UR'''": single3prog, 'UR"""': double3prog,
             'r': None, 'R': None, 'u': None, 'U': None}

 # Width of a tab stop, used by tokenize() when measuring indentation.
 tabsize = 8

 class TokenError(Exception):
     """Raised by tokenize() when input ends inside a multi-line string or statement."""

 def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
     """Default token-eater: print one token's position, type name and text.

     srow_scol and erow_ecol are the (row, column) pairs where the token
     starts and ends.  line is accepted so the function fits the token-eater
     calling convention, but it is not printed.
     """
     # Unpack in the body instead of the signature: tuple parameters cannot be
     # passed by keyword, so callers are unaffected, and this form is also
     # accepted by interpreters that dropped tuple parameters.
     srow, scol = srow_scol
     erow, ecol = erow_ecol
     # A single parenthesised expression prints the same text whether print
     # is a statement or a function.
     print("%d,%d-%d,%d:\t%s\t%s" %
           (srow, scol, erow, ecol, tok_name[type], repr(token)))

 def tokenize(readline, tokeneater=printtoken):
     """Split the text supplied by readline into tokens and pass each to tokeneater.

     readline is called with no arguments once per line and must return the
     next line of input, or an empty string at end of input.  tokeneater is
     called as tokeneater(type, token, (srow, scol), (erow, ecol), line) for
     every token found, then for one DEDENT per indentation level still open,
     and finally for ENDMARKER.

     Raises TokenError if the input ends inside a multi-line string or inside
     a continued (bracketed or backslash-continued) statement.
     """
     # lnum: current line number; parenlev: bracket nesting depth;
     # continued: set when the previous line ended with a backslash.
     lnum = parenlev = continued = 0
     namechars, numchars = string.letters + '_', string.digits
     # contstr: text of a string still open from earlier lines; needcont: that
     # string is single-quoted, so each of its lines must end in a backslash;
     # contline: the physical lines read so far for that string;
     # indents: stack of the indentation columns currently open.
     contstr, needcont = '', 0
     contline = None
     indents = [0]

     while 1:                                   # loop over lines in stream
         line = readline()
         lnum = lnum + 1
         pos, max = 0, len(line)

         if contstr:                            # continued string
             if not line:
                 raise TokenError, ("EOF in multi-line string", strstart)
             # endprog was chosen when the string was opened; it matches the
             # rest of the string up to and including the closing quote.
             endmatch = endprog.match(line)
             if endmatch:
                 pos = end = endmatch.end(0)
                 tokeneater(STRING, contstr + line[:end],
                            strstart, (lnum, end), contline + line)
                 contstr, needcont = '', 0
                 contline = None
                 # no 'continue' here: the rest of the line is scanned below
             elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                 # a backslash continuation was required and this line has none
                 tokeneater(ERRORTOKEN, contstr + line,
                            strstart, (lnum, len(line)), contline)
                 contstr = ''
                 contline = None
                 continue
             else:
                 contstr = contstr + line
                 contline = contline + line
                 continue

         elif parenlev == 0 and not continued:  # new statement
             if not line: break
             column = 0
             # A tab advances to the next multiple of tabsize; a form feed
             # resets the column count.
             while pos < max:                   # measure leading whitespace
                 if line[pos] == ' ': column = column + 1
                 elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                 elif line[pos] == '\f': column = 0
                 else: break
                 pos = pos + 1
             # Nothing but whitespace and no newline: handled as end of input.
             if pos == max: break

             if line[pos] in '#\r\n':           # skip comments or blank lines
                 # The boolean indexes the pair: COMMENT for '#', NL otherwise.
                 # The token text is the rest of the line, newline included.
                 tokeneater((NL, COMMENT)[line[pos] == '#'], line[pos:],
                            (lnum, pos), (lnum, len(line)), line)
                 continue

             if column > indents[-1]:           # count indents or dedents
                 indents.append(column)
                 tokeneater(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
             # One DEDENT per stacked level deeper than this line's column.
             while column < indents[-1]:
                 indents = indents[:-1]
                 tokeneater(DEDENT, '', (lnum, pos), (lnum, pos), line)

         else:                                  # continued statement
             if not line:
                 raise TokenError, ("EOF in multi-line statement", (lnum, 0))
             continued = 0

         while pos < max:
             pseudomatch = pseudoprog.match(line, pos)
             if pseudomatch:                                # scan for tokens
                 # Group 1 is the token itself, after any leading whitespace.
                 start, end = pseudomatch.span(1)
                 spos, epos, pos = (lnum, start), (lnum, end), end
                 token, initial = line[start:end], line[start]

                 if initial in numchars \
                     or (initial == '.' and token != '.'):  # ordinary number
                     tokeneater(NUMBER, token, spos, epos, line)
                 elif initial in '\r\n':
                     # Inside brackets a line end does not end the statement.
                     tokeneater(parenlev > 0 and NL or NEWLINE,
                                token, spos, epos, line)
                 elif initial == '#':
                     tokeneater(COMMENT, token, spos, epos, line)
                 elif token in ("'''", '"""',               # triple-quoted
                                "r'''", 'r"""', "R'''", 'R"""',
                                "u'''", 'u"""', "U'''", 'U"""',
                                "ur'''", 'ur"""', "Ur'''", 'Ur"""',
                                "uR'''", 'uR"""', "UR'''", 'UR"""'):
                     endprog = endprogs[token]
                     endmatch = endprog.match(line, pos)
                     if endmatch:                           # all on one line
                         pos = endmatch.end(0)
                         token = line[start:pos]
                         tokeneater(STRING, token, spos, (lnum, pos), line)
                     else:
                         strstart = (lnum, start)           # multiple lines
                         contstr = line[start:]
                         contline = line
                         break
                 elif initial in ("'", '"') or \
                     token[:2] in ("r'", 'r"', "R'", 'R"',
                                   "u'", 'u"', "U'", 'U"') or \
                     token[:3] in ("ur'", 'ur"', "Ur'", 'Ur"',
                                   "uR'", 'uR"', "UR'", 'UR"' ):
                     if token[-1] == '\n':                  # continued string
                         strstart = (lnum, start)
                         # Prefix letters map to None in endprogs, so the
                         # chain stops at the first quote character.
                         endprog = (endprogs[initial] or endprogs[token[1]] or
                                    endprogs[token[2]])
                         contstr, needcont = line[start:], 1
                         contline = line
                         break
                     else:                                  # ordinary string
                         tokeneater(STRING, token, spos, epos, line)
                 elif initial in namechars:                 # ordinary name
                     tokeneater(NAME, token, spos, epos, line)
                 elif initial == '\\':                      # continued stmt
                     continued = 1
                 else:
                     if initial in '([{': parenlev = parenlev + 1
                     elif initial in ')]}': parenlev = parenlev - 1
                     tokeneater(OP, token, spos, epos, line)
             else:
                 # No pattern matched here: report one character and move on.
                 tokeneater(ERRORTOKEN, line[pos],
                            (lnum, pos), (lnum, pos+1), line)
                 pos = pos + 1

     for indent in indents[1:]:                 # pop remaining indent levels
         tokeneater(DEDENT, '', (lnum, 0), (lnum, 0), '')
     tokeneater(ENDMARKER, '', (lnum, 0), (lnum, 0), '')

 if __name__ == '__main__':                     # testing
     # Run as a script: tokenize the file named by the first command-line
     # argument, or standard input when no argument is given.
     import sys
     if len(sys.argv) > 1:
         source = open(sys.argv[1])
     else:
         source = sys.stdin
     tokenize(source.readline)
	"""Tokenization help for Python programs.

	This module exports a function called 'tokenize()' that breaks a stream of
	text into Python tokens. It accepts a readline-like method which is called
	repeatedly to get the next line of input (or "" for EOF) and a "token-eater"
	function which is called once for each token found. The latter function is
	passed the token type, a string containing the token, the starting and
	ending (row, column) coordinates of the token, and the original line. It is
	designed to match the working of the Python tokenizer exactly, except that
	it produces COMMENT tokens for comments and gives type OP for all operators."""

	__version__ = "Ka-Ping Yee, 26 October 1997; patched, GvR 3/30/98"

	import string, re
	from token import *

	# Two extra token types defined by this module.  They are numbered N_TOKENS
	# and N_TOKENS + 1 and registered in tok_name so they can be printed by name.
	COMMENT = N_TOKENS
	tok_name[COMMENT] = 'COMMENT'
	NL = N_TOKENS + 1
	tok_name[NL] = 'NL'


	# Changes from 1.3:
	# Ignore now accepts \f as whitespace. Operator now includes '**'.
	# Ignore and Special now accept \n or \r\n at the end of a line.
	# Imagnumber is new. Expfloat is corrected to reject '0e4'.
	# Note: to quote a backslash in a regex, it must be doubled in a r'aw' string.

	def group(*choices):
	    """Return a regex source string matching any one of *choices*.

	    The alternatives are joined with '|' and wrapped in one capturing group.
	    """
	    # The separator must be a bare '|'.  A backslash-escaped pipe would match
	    # a literal '|' character instead of separating the alternatives.
	    return '(' + '|'.join(choices) + ')'
	def any(*choices):
	    """Return a regex matching zero or more repetitions of any of *choices*."""
	    # choices must be collected with star-args so that a single pattern
	    # string is one alternative, not a sequence of characters.  The '*'
	    # suffix is what makes the group repeatable.
	    return group(*choices) + '*'
	def maybe(*choices):
	    """Return a regex matching at most one occurrence of any of *choices*."""
	    # Star-args call syntax replaces the deprecated apply() built-in.
	    return group(*choices) + '?'

	# Regex source fragments; the token patterns below are assembled from these.
	Whitespace = r'[ \f\t]*'
	Comment = r'#[^\r\n]*'
	# Leading whitespace, any number of backslash-newline continuations, then an
	# optional comment.
	Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
	Name = r'[a-zA-Z_]\w*'

	# Integer literals take an optional l/L suffix.  The hex digit run may be
	# empty, and Octnumber is also what matches a plain '0'.
	Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
	Octnumber = r'0[0-7]*[lL]?'
	Decnumber = r'[1-9]\d*[lL]?'
	Intnumber = group(Hexnumber, Octnumber, Decnumber)
	Exponent = r'[eE][-+]?\d+'
	Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
	# The mantissa must start with a nonzero digit, so '0e4' is rejected.
	Expfloat = r'[1-9]\d*' + Exponent
	Floatnumber = group(Pointfloat, Expfloat)
	Imagnumber = group(r'0[jJ]', r'[1-9]\d*[jJ]', Floatnumber + r'[jJ]')
	# Alternatives are listed imaginary, then float, then integer.
	Number = group(Imagnumber, Floatnumber, Intnumber)

	# The four "tail end" patterns match the remainder of a string after its
	# opening quote, up to and including the closing quote; a backslash escapes
	# the character that follows it.  Each character class needs its '*'
	# quantifier so that runs of ordinary characters of any length are accepted.
	# Tail end of ' string.
	Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
	# Tail end of " string.
	Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
	# Tail end of ''' string.
	Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
	# Tail end of """ string.
	Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
	# Opening of a triple-quoted string, with optional u/U and r/R prefixes.
	Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
	# Single-line ' or " string.
	# The '*' quantifiers let the body hold any number of ordinary characters
	# and escapes, including none at all.
	String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
	               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

	# Because of leftmost-then-longest match semantics, be sure to put the
	# longest operators first (e.g., if = came before ==, == would get
	# recognized as two instances of =).
	# The first alternative is '**' with an optional '=', written with each
	# star escaped.
	Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
	                 r"[+\-*/%&|^=<>]=?",
	                 r"~")

	# A character class: ']' comes first and '[' second, so both are literal.
	Bracket = '[][(){}]'
	# Line ends and the remaining single-character punctuation.
	Special = group(r'\r?\n', r'[:;.,`]')
	Funny = group(Operator, Bracket, Special)

	# A complete token preceded by ignorable text.
	PlainToken = group(Number, Funny, String, Name)
	Token = Ignore + PlainToken

	# First (or only) line of ' or " string.
	# It ends either at the closing quote or at a backslash-newline continuation.
	ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
	                group("'", r'\\\r?\n'),
	                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
	                group('"', r'\\\r?\n'))
	PseudoExtras = group(r'\\\r?\n', Comment, Triple)
	# Group 1 of PseudoToken is the token text without its leading whitespace.
	PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

	tokenprog, pseudoprog, single3prog, double3prog = map(
	    re.compile, (Token, PseudoToken, Single3, Double3))
	# Maps a string opener to the compiled pattern for the rest of that string.
	# The bare prefix letters map to None so that tokenize() can chain lookups on
	# the first characters of a token with 'or' until it reaches the quote.
	endprogs = {"'": re.compile(Single), '"': re.compile(Double),
	            "'''": single3prog, '"""': double3prog,
	            "r'''": single3prog, 'r"""': double3prog,
	            "u'''": single3prog, 'u"""': double3prog,
	            "ur'''": single3prog, 'ur"""': double3prog,
	            "R'''": single3prog, 'R"""': double3prog,
	            "U'''": single3prog, 'U"""': double3prog,
	            "uR'''": single3prog, 'uR"""': double3prog,
	            "Ur'''": single3prog, 'Ur"""': double3prog,
	            "UR'''": single3prog, 'UR"""': double3prog,
	            'r': None, 'R': None, 'u': None, 'U': None}

	# Width of a tab stop, used by tokenize() when measuring indentation.
	tabsize = 8

	class TokenError(Exception):
	    """Raised by tokenize() when input ends inside a multi-line string or statement."""
	    pass

	def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
	    """Default token-eater: print one token's position, type name and text.

	    srow_scol and erow_ecol are the (row, column) pairs where the token
	    starts and ends.  line is accepted so the function fits the token-eater
	    calling convention, but it is not printed.
	    """
	    # Unpack in the body instead of the signature: tuple parameters cannot be
	    # passed by keyword, so callers are unaffected, and this form is also
	    # accepted by interpreters that dropped tuple parameters.
	    srow, scol = srow_scol
	    erow, ecol = erow_ecol
	    # A single parenthesised expression prints the same text whether print
	    # is a statement or a function.
	    print("%d,%d-%d,%d:\t%s\t%s" %
	          (srow, scol, erow, ecol, tok_name[type], repr(token)))

	def tokenize(readline, tokeneater=printtoken):
	lnum = parenlev = continued = 0
	namechars, numchars = string.letters + '_', string.digits
	contstr, needcont = '', 0
	contline = None
	indents = [0]

	while 1: # loop over lines in stream
	line = readline()
	lnum = lnum + 1
	pos, max = 0, len(line)

	if contstr: # continued string
	if not line:
	raise TokenError, ("EOF in multi-line string", strstart)
	endmatch = endprog.match(line)
	if endmatch:
	pos = end = endmatch.end(0)
	tokeneater(STRING, contstr + line[:end],
	strstart, (lnum, end), contline + line)
	contstr, needcont = '', 0
	contline = None
	elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
	tokeneater(ERRORTOKEN, contstr + line,
	strstart, (lnum, len(line)), contline)
	contstr = ''
	contline = None
	continue
	else:
	contstr = contstr + line
	contline = contline + line
	continue

	elif parenlev == 0 and not continued: # new statement
	if not line: break
	column = 0
	while pos < max: # measure leading whitespace
	if line[pos] == ' ': column = column + 1
	elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
	elif line[pos] == '\f': column = 0
	else: break
	pos = pos + 1
	if pos == max: break

	if line[pos] in '#\r\n': # skip comments or blank lines
	tokeneater((NL, COMMENT)[line[pos] == '#'], line[pos:],
	(lnum, pos), (lnum, len(line)), line)
	continue

	if column > indents[-1]: # count indents or dedents
	indents.append(column)
	tokeneater(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
	while column < indents[-1]:
	indents = indents[:-1]
	tokeneater(DEDENT, '', (lnum, pos), (lnum, pos), line)

	else: # continued statement
	if not line:
	raise TokenError, ("EOF in multi-line statement", (lnum, 0))
	continued = 0

	while pos < max:
	pseudomatch = pseudoprog.match(line, pos)
	if pseudomatch: # scan for tokens
	start, end = pseudomatch.span(1)
	spos, epos, pos = (lnum, start), (lnum, end), end
	token, initial = line[start:end], line[start]

	if initial in numchars \
	or (initial == '.' and token != '.'): # ordinary number
	tokeneater(NUMBER, token, spos, epos, line)
	elif initial in '\r\n':
	tokeneater(parenlev > 0 and NL or NEWLINE,
	token, spos, epos, line)
	elif initial == '#':
	tokeneater(COMMENT, token, spos, epos, line)
	elif token in ("'''", '"""', # triple-quoted
	"r'''", 'r"""', "R'''", 'R"""',
	"u'''", 'u"""', "U'''", 'U"""',
	"ur'''", 'ur"""', "Ur'''", 'Ur"""',
	"uR'''", 'uR"""', "UR'''", 'UR"""'):
	endprog = endprogs[token]
	endmatch = endprog.match(line, pos)
	if endmatch: # all on one line
	pos = endmatch.end(0)
	token = line[start:pos]
	tokeneater(STRING, token, spos, (lnum, pos), line)
	else:
	strstart = (lnum, start) # multiple lines
	contstr = line[start:]
	contline = line
	break
	elif initial in ("'", '"') or \
	token[:2] in ("r'", 'r"', "R'", 'R"',
	"u'", 'u"', "U'", 'U"') or \
	token[:3] in ("ur'", 'ur"', "Ur'", 'Ur"',
	"uR'", 'uR"', "UR'", 'UR"' ):
	if token[-1] == '\n': # continued string
	strstart = (lnum, start)
	endprog = (endprogs[initial] or endprogs[token[1]] or
	endprogs[token[2]])
	contstr, needcont = line[start:], 1
	contline = line
	break
	else: # ordinary string
	tokeneater(STRING, token, spos, epos, line)
	elif initial in namechars: # ordinary name
	tokeneater(NAME, token, spos, epos, line)
	elif initial == '\\': # continued stmt
	continued = 1
	else:
	if initial in '([{': parenlev = parenlev + 1
	elif initial in ')]}': parenlev = parenlev - 1
	tokeneater(OP, token, spos, epos, line)
	else:
	tokeneater(ERRORTOKEN, line[pos],
	(lnum, pos), (lnum, pos+1), line)
	pos = pos + 1

	for indent in indents[1:]: # pop remaining indent levels
	tokeneater(DEDENT, '', (lnum, 0), (lnum, 0), '')
	tokeneater(ENDMARKER, '', (lnum, 0), (lnum, 0), '')

	if __name__ == '__main__': # testing
	    # Run as a script: tokenize the file named by the first command-line
	    # argument, or standard input when no argument is given.
	    import sys
	    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
	    else: tokenize(sys.stdin.readline)