Lib/tokenize.py - platform/external/python/cpython3 - Gitiles

 """Tokenization help for Python programs.

 generate_tokens(readline) is a generator that breaks a stream of
 text into Python tokens.  It accepts a readline-like method which is called
 repeatedly to get the next line of input (or "" for EOF).  It generates
 5-tuples with these members:

     the token type (see token.py)
     the token (a string)
     the starting (row, column) indices of the token (a 2-tuple of ints)
     the ending (row, column) indices of the token (a 2-tuple of ints)
     the original line (string)

 It is designed to match the working of the Python tokenizer exactly, except
 that it produces COMMENT tokens for comments, NL tokens for line breaks that
 do not end a statement, and gives type OP for all operators.

 Older entry points
     tokenize_loop(readline, tokeneater)
     tokenize(readline, tokeneater=printtoken)
 are the same, except instead of generating tokens, tokeneater is a callback
 function to which the 5 fields described above are passed as 5 arguments,
 each time a new token is found."""

 from __future__ import generators

 # Module metadata: original author and later contributors.
 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
 __credits__ = (
     'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro')

 import string, re
 from token import *

 # Public API: every public name of the token module plus the names this
 # module adds on top of it.  generate_tokens is listed because it is the
 # primary entry point described in the module docstring.
 import token
 __all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
                                                     "generate_tokens", "NL"]
 del token

 # Two extra token types that only this module produces are numbered right
 # after the token module's own types; N_TOKENS is then moved past them.
 COMMENT, NL = N_TOKENS, N_TOKENS + 1
 tok_name.update({COMMENT: 'COMMENT', NL: 'NL'})
 N_TOKENS = N_TOKENS + 2

 def group(*choices):
     """Return a regex source string: a parenthesised alternation of choices."""
     return '(' + '|'.join(choices) + ')'

 def any(*choices):
     """Return group(*choices) followed by '*' (zero or more repetitions).

     The name is kept for compatibility with existing users of this module,
     even though it hides the builtin any() on Python versions that have one.
     """
     return group(*choices) + '*'

 def maybe(*choices):
     """Return group(*choices) followed by '?' (optional)."""
     return group(*choices) + '?'

 # Everything from here down to PseudoToken is regex *source text* (plain
 # strings); the pieces are combined with group()/any()/maybe() and only
 # compiled once the full patterns have been assembled.

 # Optional run of spaces, form feeds and tabs.
 Whitespace = r'[ \f\t]*'
 # A '#' and everything up to, but not including, the end of the line.
 Comment = r'#[^\r\n]*'
 # Leading whitespace, any number of backslash-newline continuations (each
 # followed by more whitespace), then an optional comment.
 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
 # An identifier.
 Name = r'[a-zA-Z_]\w*'

 # Integer literals; each form accepts an optional long suffix (l or L).
 # NOTE(review): the '*' in Hexnumber also lets a bare "0x" match.
 Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
 # Also matches a lone "0".
 Octnumber = r'0[0-7]*[lL]?'
 Decnumber = r'[1-9]\d*[lL]?'
 Intnumber = group(Hexnumber, Octnumber, Decnumber)
 # Floating point literals: with a decimal point and optional exponent, or
 # digits followed by a mandatory exponent.
 Exponent = r'[eE][-+]?\d+'
 Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
 Expfloat = r'\d+' + Exponent
 Floatnumber = group(Pointfloat, Expfloat)
 # Imaginary literals: digits or a float followed by j or J.
 Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
 # Alternatives are tried from left to right, so the imaginary and float
 # forms come before the integer forms that would match only their prefix.
 Number = group(Imagnumber, Floatnumber, Intnumber)

 # The four "tail end" patterns match the rest of a string literal up to and
 # including its closing quote(s).  They are applied either just after an
 # opening triple quote or at the start of a continuation line.  A backslash
 # always consumes the character that follows it.

 # Tail end of ' string.
 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
 # Tail end of " string.
 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
 # Tail end of ''' string.  A single quote is allowed inside as long as it
 # is not the first of three in a row.
 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
 # Tail end of """ string.  Same rule as above for double quotes.
 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
 # Opening of a triple-quoted string, with optional u/U and r/R prefixes.
 Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
 # Single-line ' or " string.
 String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

 # The re module tries alternatives from left to right and takes the first
 # one that matches (not the longest), so be sure to put the longest
 # operators first (e.g., if = came before ==, == would get recognized as
 # two instances of =).
 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                  r"//=?",
                  r"[+\-*/%&|^=<>]=?",
                  r"~")

 # Any single bracket character: ] [ ( ) { }
 Bracket = '[][(){}]'
 # A line ending, or one of the punctuation characters : ; . , `
 Special = group(r'\r?\n', r'[:;.,`]')
 Funny = group(Operator, Bracket, Special)

 # A complete token, optionally preceded by ignorable text (see Ignore).
 PlainToken = group(Number, Funny, String, Name)
 Token = Ignore + PlainToken

 # First (or only) line of ' or " string.
 # Each alternative ends either with the closing quote or with a
 # backslash-newline, which means the string continues on the next line.
 ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                 group("'", r'\\\r?\n'),
                 r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                 group('"', r'\\\r?\n'))
 # Things that are scanned like tokens but need special handling afterwards:
 # a backslash-newline, a comment, or the opening of a triple-quoted string.
 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
 # Leading whitespace followed by one pseudo-token; group 1 of a match is
 # the pseudo-token itself without that whitespace.
 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

 # Compile the assembled patterns once, at import time.
 tokenprog = re.compile(Token)
 pseudoprog = re.compile(PseudoToken)
 single3prog = re.compile(Single3)
 double3prog = re.compile(Double3)

 # endprogs maps the opening of a string to the compiled pattern that finds
 # its end.  The bare prefix letters map to None so that a lookup chain over
 # the first characters of a token falls through to the quote character.
 endprogs = {"'": re.compile(Single), '"': re.compile(Double)}
 for _prefix in ('', 'r', 'u', 'ur', 'R', 'U', 'uR', 'Ur', 'UR'):
     endprogs[_prefix + "'''"] = single3prog
     endprogs[_prefix + '"""'] = double3prog
 for _prefix in ('r', 'R', 'u', 'U'):
     endprogs[_prefix] = None
 del _prefix

 # Width of a tab stop when measuring the indentation of a line.
 tabsize = 8

 class TokenError(Exception):
     """Raised when the input ends inside a multi-line string or statement."""

 class StopTokenizing(Exception):
     """Caught and silently discarded by tokenize() to end tokenizing early."""

 def printtoken(type, token, start, end, line): # for testing
     """Print one token in the form "srow,scol-erow,ecol:<TAB>TYPE<TAB>repr".

     type  -- numeric token type, looked up in tok_name for display
     token -- the token text
     start -- (row, column) pair where the token begins
     end   -- (row, column) pair where the token ends
     line  -- the source line; accepted so the function can be used as a
              tokeneater callback, but not printed

     The two position arguments are unpacked in the body rather than in the
     parameter list, and print is written with parentheses, so that the
     definition is accepted by Python 2 and Python 3 alike.
     """
     srow, scol = start
     erow, ecol = end
     print("%d,%d-%d,%d:\t%s\t%s" %
           (srow, scol, erow, ecol, tok_name[type], repr(token)))

 def tokenize(readline, tokeneater=printtoken):
     """Tokenize the text supplied by readline, feeding tokens to tokeneater.

     readline is called repeatedly to fetch the next input line.  tokeneater
     is called once per token with the five token fields as arguments; it
     defaults to printtoken.  A StopTokenizing exception escaping from the
     loop ends tokenizing without an error.
     """
     try:
         tokenize_loop(readline, tokeneater)
     except StopTokenizing:
         return

 # backwards compatible interface
 def tokenize_loop(readline, tokeneater):
     """Call tokeneater(type, token, start, end, line) for every token.

     The tokens come from generate_tokens(readline); any exception raised by
     the generator or by tokeneater propagates to the caller.
     """
     for token_info in generate_tokens(readline):
         # Extended call syntax replaces the deprecated apply() builtin.
         tokeneater(*token_info)

 def generate_tokens(readline):
     """Generate the tokens of the source text delivered by readline.

     readline is called with no arguments and must return the next line of
     input each time, or an empty string at end of input.  Every item
     yielded is a 5-tuple

         (type, string, (srow, scol), (erow, ecol), line)

     in which rows count from 1 and columns from 0.  For a string literal
     that spans several lines, `line` holds all of those lines joined.

     Besides the ordinary token types this yields COMMENT, NL (a line break
     that does not end a statement) and ERRORTOKEN (a character that starts
     no token, or a single-quoted string whose continuation is missing).
     DEDENT tokens for still-open indentation levels and a final ENDMARKER
     are produced at end of input.

     Raises TokenError if the input ends inside a multi-line string or
     inside a bracketed or backslash-continued statement.
     """
     lnum = parenlev = continued = 0
     namechars, numchars = string.ascii_letters + '_', '0123456789'
     # contstr holds the text of a string literal still being collected;
     # needcont is set when every further line of it must end in a backslash
     # (single-quoted strings); contline collects the physical lines.
     contstr, needcont = '', 0
     contline = None
     indents = [0]

     while 1:                                   # loop over lines in stream
         line = readline()
         lnum = lnum + 1
         pos, linelen = 0, len(line)

         if contstr:                            # continued string
             if not line:
                 raise TokenError("EOF in multi-line string", strstart)
             endmatch = endprog.match(line)
             if endmatch:
                 pos = end = endmatch.end(0)
                 yield (STRING, contstr + line[:end],
                            strstart, (lnum, end), contline + line)
                 contstr, needcont = '', 0
                 contline = None
             elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                 yield (ERRORTOKEN, contstr + line,
                            strstart, (lnum, len(line)), contline)
                 # Clear needcont together with contstr: a stale flag would
                 # make a later multi-line triple-quoted string (which does
                 # not set it) be reported as an error on its second line.
                 contstr, needcont = '', 0
                 contline = None
                 continue
             else:
                 contstr = contstr + line
                 contline = contline + line
                 continue

         elif parenlev == 0 and not continued:  # new statement
             if not line: break
             column = 0
             while pos < linelen:               # measure leading whitespace
                 if line[pos] == ' ': column = column + 1
                 # Advance to the next tab stop; // keeps this an integer.
                 elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                 elif line[pos] == '\f': column = 0
                 else: break
                 pos = pos + 1
             # Only whitespace and no line ending: treated as end of input.
             if pos == linelen: break

             if line[pos] in '#\r\n':           # skip comments or blank lines
                 yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                            (lnum, pos), (lnum, len(line)), line)
                 continue

             if column > indents[-1]:           # count indents or dedents
                 indents.append(column)
                 yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
             while column < indents[-1]:
                 indents.pop()
                 yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

         else:                                  # continued statement
             if not line:
                 raise TokenError("EOF in multi-line statement", (lnum, 0))
             continued = 0

         while pos < linelen:
             pseudomatch = pseudoprog.match(line, pos)
             if pseudomatch:                                # scan for tokens
                 # Group 1 is the token without its leading whitespace.
                 start, end = pseudomatch.span(1)
                 spos, epos, pos = (lnum, start), (lnum, end), end
                 token, initial = line[start:end], line[start]

                 if initial in numchars or \
                    (initial == '.' and token != '.'):      # ordinary number
                     yield (NUMBER, token, spos, epos, line)
                 elif initial in '\r\n':
                     # NL is never zero, so the and/or idiom is safe here.
                     yield (parenlev > 0 and NL or NEWLINE,
                                token, spos, epos, line)
                 elif initial == '#':
                     yield (COMMENT, token, spos, epos, line)
                 elif token in ("'''", '"""',               # triple-quoted
                                "r'''", 'r"""', "R'''", 'R"""',
                                "u'''", 'u"""', "U'''", 'U"""',
                                "ur'''", 'ur"""', "Ur'''", 'Ur"""',
                                "uR'''", 'uR"""', "UR'''", 'UR"""'):
                     endprog = endprogs[token]
                     endmatch = endprog.match(line, pos)
                     if endmatch:                           # all on one line
                         pos = endmatch.end(0)
                         token = line[start:pos]
                         yield (STRING, token, spos, (lnum, pos), line)
                     else:
                         strstart = (lnum, start)           # multiple lines
                         contstr = line[start:]
                         contline = line
                         break
                 elif initial in ("'", '"') or \
                     token[:2] in ("r'", 'r"', "R'", 'R"',
                                   "u'", 'u"', "U'", 'U"') or \
                     token[:3] in ("ur'", 'ur"', "Ur'", 'Ur"',
                                   "uR'", 'uR"', "UR'", 'UR"' ):
                     if token[-1] == '\n':                  # continued string
                         strstart = (lnum, start)
                         # Prefix letters map to None in endprogs, so this
                         # picks the pattern of the first quote character.
                         endprog = (endprogs[initial] or endprogs[token[1]] or
                                    endprogs[token[2]])
                         contstr, needcont = line[start:], 1
                         contline = line
                         break
                     else:                                  # ordinary string
                         yield (STRING, token, spos, epos, line)
                 elif initial in namechars:                 # ordinary name
                     yield (NAME, token, spos, epos, line)
                 elif initial == '\\':                      # continued stmt
                     continued = 1
                 else:
                     if initial in '([{': parenlev = parenlev + 1
                     elif initial in ')]}': parenlev = parenlev - 1
                     yield (OP, token, spos, epos, line)
             else:
                 yield (ERRORTOKEN, line[pos],
                            (lnum, pos), (lnum, pos+1), line)
                 pos = pos + 1

     for indent in indents[1:]:                 # pop remaining indent levels
         yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
     yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

 if __name__ == '__main__':                     # testing
     # Print the tokens of the file named on the command line, or of
     # standard input when no file name is given.
     import sys
     if len(sys.argv) > 1:
         source = open(sys.argv[1])
         try:
             tokenize(source.readline)
         finally:
             # Close the file even if tokenizing fails part way through.
             source.close()
     else:
         tokenize(sys.stdin.readline)
	"""Tokenization help for Python programs.

	generate_tokens(readline) is a generator that breaks a stream of
	text into Python tokens. It accepts a readline-like method which is called
	repeatedly to get the next line of input (or "" for EOF). It generates
	5-tuples with these members:

	the token type (see token.py)
	the token (a string)
	the starting (row, column) indices of the token (a 2-tuple of ints)
	the ending (row, column) indices of the token (a 2-tuple of ints)
	the original line (string)

	It is designed to match the working of the Python tokenizer exactly, except
	that it produces COMMENT tokens for comments, NL tokens for line breaks that
	do not end a statement, and gives type OP for all operators.

	Older entry points
	tokenize_loop(readline, tokeneater)
	tokenize(readline, tokeneater=printtoken)
	are the same, except instead of generating tokens, tokeneater is a callback
	function to which the 5 fields described above are passed as 5 arguments,
	each time a new token is found."""

	from __future__ import generators

	# Module metadata: original author and later contributors.
	__author__ = 'Ka-Ping Yee <ping@lfw.org>'
	__credits__ = (
	    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro')

	import string, re
	from token import *

	# Public API: every public name of the token module plus the names this
	# module adds on top of it.  generate_tokens is listed because it is the
	# primary entry point described in the module docstring.
	import token
	__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
	                                                    "generate_tokens", "NL"]
	del token

	# Two extra token types that only this module produces are numbered right
	# after the token module's own types; N_TOKENS is then moved past them.
	COMMENT, NL = N_TOKENS, N_TOKENS + 1
	tok_name.update({COMMENT: 'COMMENT', NL: 'NL'})
	N_TOKENS = N_TOKENS + 2

	def group(*choices):
	    """Return a regex source string: a parenthesised alternation of choices.

	    The alternatives are joined with a plain '|'; an escaped '\\|' would
	    match a literal bar instead of separating alternatives.
	    """
	    return '(' + '|'.join(choices) + ')'

	def any(*choices):
	    """Return group(*choices) followed by '*' (zero or more repetitions).

	    The name is kept for compatibility with existing users of this module,
	    even though it hides the builtin any() on Python versions that have one.
	    """
	    return group(*choices) + '*'

	def maybe(*choices):
	    """Return group(*choices) followed by '?' (optional)."""
	    return group(*choices) + '?'

	# The names below are regex *source text* (plain strings); the pieces are
	# combined with group()/any()/maybe() before being compiled.

	# Optional run of spaces, form feeds and tabs.
	Whitespace = r'[ \f\t]*'
	# A '#' and everything up to, but not including, the end of the line.
	Comment = r'#[^\r\n]*'
	# Leading whitespace, any number of backslash-newline continuations (each
	# followed by more whitespace), then an optional comment.
	Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
	# An identifier.
	Name = r'[a-zA-Z_]\w*'

	# Integer literals; each form accepts an optional long suffix (l or L).
	# NOTE(review): the '*' in Hexnumber also lets a bare "0x" match.
	Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
	# Also matches a lone "0".
	Octnumber = r'0[0-7]*[lL]?'
	Decnumber = r'[1-9]\d*[lL]?'
	Intnumber = group(Hexnumber, Octnumber, Decnumber)
	# Floating point literals: with a decimal point and optional exponent, or
	# digits followed by a mandatory exponent.
	Exponent = r'[eE][-+]?\d+'
	Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
	Expfloat = r'\d+' + Exponent
	Floatnumber = group(Pointfloat, Expfloat)
	# Imaginary literals: digits or a float followed by j or J.
	Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
	# Alternatives are tried from left to right, so the imaginary and float
	# forms come before the integer forms that would match only their prefix.
	Number = group(Imagnumber, Floatnumber, Intnumber)

	# The four "tail end" patterns match the rest of a string literal up to and
	# including its closing quote(s).  A backslash always consumes the character
	# that follows it.  The '*' quantifiers on the character classes are
	# essential: without them only one ordinary character could appear between
	# escapes, and an empty remainder could not match at all.

	# Tail end of ' string.
	Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
	# Tail end of " string.
	Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
	# Tail end of ''' string.  A single quote is allowed inside as long as it
	# is not the first of three in a row.
	Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
	# Tail end of """ string.  Same rule as above for double quotes.
	Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
	# Opening of a triple-quoted string, with optional u/U and r/R prefixes.
	Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
	# Single-line ' or " string.  The character classes carry '*' so that any
	# number of ordinary characters may appear between escapes.
	String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
	               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

	# The re module tries alternatives from left to right and takes the first
	# one that matches (not the longest), so be sure to put the longest
	# operators first (e.g., if = came before ==, == would get recognized as
	# two instances of =).
	# The first alternative is ** with an optional = (power operator).
	Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
	                 r"//=?",
	                 r"[+\-*/%&|^=<>]=?",
	                 r"~")

	# Any single bracket character: ] [ ( ) { }
	Bracket = '[][(){}]'
	# A line ending, or one of the punctuation characters : ; . , `
	Special = group(r'\r?\n', r'[:;.,`]')
	Funny = group(Operator, Bracket, Special)

	# A complete token, optionally preceded by ignorable text (see Ignore).
	PlainToken = group(Number, Funny, String, Name)
	Token = Ignore + PlainToken

	# First (or only) line of ' or " string.
	# Each alternative ends either with the closing quote or with a
	# backslash-newline, which means the string continues on the next line.
	ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
	                group("'", r'\\\r?\n'),
	                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
	                group('"', r'\\\r?\n'))
	# Things that are scanned like tokens but need special handling afterwards:
	# a backslash-newline, a comment, or the opening of a triple-quoted string.
	PseudoExtras = group(r'\\\r?\n', Comment, Triple)
	# Leading whitespace followed by one pseudo-token; group 1 of a match is
	# the pseudo-token itself without that whitespace.
	PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

	# Compile the assembled patterns once, at import time.
	tokenprog = re.compile(Token)
	pseudoprog = re.compile(PseudoToken)
	single3prog = re.compile(Single3)
	double3prog = re.compile(Double3)

	# endprogs maps the opening of a string to the compiled pattern that finds
	# its end.  The bare prefix letters map to None so that a lookup chain over
	# the first characters of a token falls through to the quote character.
	endprogs = {"'": re.compile(Single), '"': re.compile(Double)}
	for _prefix in ('', 'r', 'u', 'ur', 'R', 'U', 'uR', 'Ur', 'UR'):
	    endprogs[_prefix + "'''"] = single3prog
	    endprogs[_prefix + '"""'] = double3prog
	for _prefix in ('r', 'R', 'u', 'U'):
	    endprogs[_prefix] = None
	del _prefix

	# Width of a tab stop when measuring the indentation of a line.
	tabsize = 8

	class TokenError(Exception):
	    """Raised when the input ends inside a multi-line string or statement."""

	class StopTokenizing(Exception):
	    """Caught and silently discarded by tokenize() to end tokenizing early."""

	def printtoken(type, token, start, end, line): # for testing
	    """Print one token in the form "srow,scol-erow,ecol:<TAB>TYPE<TAB>repr".

	    type  -- numeric token type, looked up in tok_name for display
	    token -- the token text
	    start -- (row, column) pair where the token begins
	    end   -- (row, column) pair where the token ends
	    line  -- the source line; accepted so the function can be used as a
	             tokeneater callback, but not printed

	    The two position arguments are unpacked in the body rather than in the
	    parameter list, and print is written with parentheses, so that the
	    definition is accepted by Python 2 and Python 3 alike.
	    """
	    srow, scol = start
	    erow, ecol = end
	    print("%d,%d-%d,%d:\t%s\t%s" %
	          (srow, scol, erow, ecol, tok_name[type], repr(token)))

	def tokenize(readline, tokeneater=printtoken):
	    """Tokenize the text supplied by readline, feeding tokens to tokeneater.

	    readline is called repeatedly to fetch the next input line.  tokeneater
	    is called once per token with the five token fields as arguments; it
	    defaults to printtoken.  A StopTokenizing exception escaping from the
	    loop ends tokenizing without an error.
	    """
	    try:
	        tokenize_loop(readline, tokeneater)
	    except StopTokenizing:
	        pass

	# backwards compatible interface
	def tokenize_loop(readline, tokeneater):
	    """Call tokeneater(type, token, start, end, line) for every token.

	    The tokens come from generate_tokens(readline); any exception raised by
	    the generator or by tokeneater propagates to the caller.
	    """
	    for token_info in generate_tokens(readline):
	        # Extended call syntax replaces the deprecated apply() builtin.
	        tokeneater(*token_info)

	def generate_tokens(readline):
	    """Generate the tokens of the source text delivered by readline.

	    readline is called with no arguments and must return the next line of
	    input each time, or an empty string at end of input.  Every item
	    yielded is a 5-tuple

	        (type, string, (srow, scol), (erow, ecol), line)

	    in which rows count from 1 and columns from 0.  For a string literal
	    that spans several lines, `line` holds all of those lines joined.

	    Besides the ordinary token types this yields COMMENT, NL (a line break
	    that does not end a statement) and ERRORTOKEN (a character that starts
	    no token, or a single-quoted string whose continuation is missing).
	    DEDENT tokens for still-open indentation levels and a final ENDMARKER
	    are produced at end of input.

	    Raises TokenError if the input ends inside a multi-line string or
	    inside a bracketed or backslash-continued statement.
	    """
	    lnum = parenlev = continued = 0
	    namechars, numchars = string.ascii_letters + '_', '0123456789'
	    # contstr holds the text of a string literal still being collected;
	    # needcont is set when every further line of it must end in a backslash
	    # (single-quoted strings); contline collects the physical lines.
	    contstr, needcont = '', 0
	    contline = None
	    indents = [0]

	    while 1:                                   # loop over lines in stream
	        line = readline()
	        lnum = lnum + 1
	        pos, linelen = 0, len(line)

	        if contstr:                            # continued string
	            if not line:
	                raise TokenError("EOF in multi-line string", strstart)
	            endmatch = endprog.match(line)
	            if endmatch:
	                pos = end = endmatch.end(0)
	                yield (STRING, contstr + line[:end],
	                           strstart, (lnum, end), contline + line)
	                contstr, needcont = '', 0
	                contline = None
	            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
	                yield (ERRORTOKEN, contstr + line,
	                           strstart, (lnum, len(line)), contline)
	                # Clear needcont together with contstr: a stale flag would
	                # make a later multi-line triple-quoted string (which does
	                # not set it) be reported as an error on its second line.
	                contstr, needcont = '', 0
	                contline = None
	                continue
	            else:
	                contstr = contstr + line
	                contline = contline + line
	                continue

	        elif parenlev == 0 and not continued:  # new statement
	            if not line: break
	            column = 0
	            while pos < linelen:               # measure leading whitespace
	                if line[pos] == ' ': column = column + 1
	                # Advance to the next tab stop; // keeps this an integer.
	                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
	                elif line[pos] == '\f': column = 0
	                else: break
	                pos = pos + 1
	            # Only whitespace and no line ending: treated as end of input.
	            if pos == linelen: break

	            if line[pos] in '#\r\n':           # skip comments or blank lines
	                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
	                           (lnum, pos), (lnum, len(line)), line)
	                continue

	            if column > indents[-1]:           # count indents or dedents
	                indents.append(column)
	                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
	            while column < indents[-1]:
	                indents.pop()
	                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

	        else:                                  # continued statement
	            if not line:
	                raise TokenError("EOF in multi-line statement", (lnum, 0))
	            continued = 0

	        while pos < linelen:
	            pseudomatch = pseudoprog.match(line, pos)
	            if pseudomatch:                                # scan for tokens
	                # Group 1 is the token without its leading whitespace.
	                start, end = pseudomatch.span(1)
	                spos, epos, pos = (lnum, start), (lnum, end), end
	                token, initial = line[start:end], line[start]

	                if initial in numchars or \
	                   (initial == '.' and token != '.'):      # ordinary number
	                    yield (NUMBER, token, spos, epos, line)
	                elif initial in '\r\n':
	                    # NL is never zero, so the and/or idiom is safe here.
	                    yield (parenlev > 0 and NL or NEWLINE,
	                               token, spos, epos, line)
	                elif initial == '#':
	                    yield (COMMENT, token, spos, epos, line)
	                elif token in ("'''", '"""',               # triple-quoted
	                               "r'''", 'r"""', "R'''", 'R"""',
	                               "u'''", 'u"""', "U'''", 'U"""',
	                               "ur'''", 'ur"""', "Ur'''", 'Ur"""',
	                               "uR'''", 'uR"""', "UR'''", 'UR"""'):
	                    endprog = endprogs[token]
	                    endmatch = endprog.match(line, pos)
	                    if endmatch:                           # all on one line
	                        pos = endmatch.end(0)
	                        token = line[start:pos]
	                        yield (STRING, token, spos, (lnum, pos), line)
	                    else:
	                        strstart = (lnum, start)           # multiple lines
	                        contstr = line[start:]
	                        contline = line
	                        break
	                elif initial in ("'", '"') or \
	                    token[:2] in ("r'", 'r"', "R'", 'R"',
	                                  "u'", 'u"', "U'", 'U"') or \
	                    token[:3] in ("ur'", 'ur"', "Ur'", 'Ur"',
	                                  "uR'", 'uR"', "UR'", 'UR"' ):
	                    if token[-1] == '\n':                  # continued string
	                        strstart = (lnum, start)
	                        # Prefix letters map to None in endprogs, so this
	                        # picks the pattern of the first quote character.
	                        endprog = (endprogs[initial] or endprogs[token[1]] or
	                                   endprogs[token[2]])
	                        contstr, needcont = line[start:], 1
	                        contline = line
	                        break
	                    else:                                  # ordinary string
	                        yield (STRING, token, spos, epos, line)
	                elif initial in namechars:                 # ordinary name
	                    yield (NAME, token, spos, epos, line)
	                elif initial == '\\':                      # continued stmt
	                    continued = 1
	                else:
	                    if initial in '([{': parenlev = parenlev + 1
	                    elif initial in ')]}': parenlev = parenlev - 1
	                    yield (OP, token, spos, epos, line)
	            else:
	                yield (ERRORTOKEN, line[pos],
	                           (lnum, pos), (lnum, pos+1), line)
	                pos = pos + 1

	    for indent in indents[1:]:                 # pop remaining indent levels
	        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
	    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

	if __name__ == '__main__':                     # testing
	    # Print the tokens of the file named on the command line, or of
	    # standard input when no file name is given.
	    import sys
	    if len(sys.argv) > 1:
	        source = open(sys.argv[1])
	        try:
	            tokenize(source.readline)
	        finally:
	            # Close the file even if tokenizing fails part way through.
	            source.close()
	    else:
	        tokenize(sys.stdin.readline)