"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
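
# A minimal usage sketch, assuming only the public API below (StringIO here
# stands in for any object with a file-like readline() method; the snippet
# and variable names are illustrative, not part of this module):
#
#     from StringIO import StringIO
#     for type, token, start, end, line in \
#             generate_tokens(StringIO("x = 1\n").readline):
#         print tok_name[type], repr(token), start, end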

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "generate_tokens", "NL", "untokenize"]
del x
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
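
# For example, group('a', 'b') yields '(a|b)', any('a', 'b') yields
# '(a|b)*', and maybe('a', 'b') yields '(a|b)?'; the patterns below are
# composed from these three helpers.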

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
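# For example, pseudoprog.match('x = 1\n', 0).span(1) is (0, 1) (the name
# 'x'); matching again at pos 1 gives (2, 3) (the '='), then (4, 5) (the
# number '1'). generate_tokens() below walks each line exactly this way.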
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
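
# A hedged sketch of a custom tokeneater callback ('myfile.py' and the
# function name are hypothetical, not part of this module's API):
#
#     def print_names(type, token, (srow, scol), (erow, ecol), line):
#         if type == NAME:
#             print srow, token
#
#     tokenize(open('myfile.py').readline, print_names)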

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)


def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.

    Round-trip invariant:
        # Output text will tokenize back to match the input.
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """

    startline = False
    prevstring = False
    indents = []
    toks = []
    toks_append = toks.append
    for tok in iterable:
        toknum, tokval = tok[:2]

        if toknum in (NAME, NUMBER):
            tokval += ' '

        # Insert a space between two consecutive strings
        if toknum == STRING:
            if prevstring:
                tokval = ' ' + tokval
            prevstring = True
        else:
            prevstring = False

        if toknum == INDENT:
            indents.append(tokval)
            continue
        elif toknum == DEDENT:
            indents.pop()
            continue
        elif toknum in (NEWLINE, COMMENT, NL):
            startline = True
        elif startline and indents:
            toks_append(indents[-1])
            startline = False
        toks_append(tokval)
    return ''.join(toks)
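
# A round-trip sketch of untokenize(), assuming the standard StringIO
# module. untokenize() preserves the token stream rather than the original
# whitespace, so the invariant compares (type, string) pairs, not raw text:
#
#     from StringIO import StringIO
#     source = "if x:\n    y = 1\n"
#     t1 = [tok[:2] for tok in generate_tokens(StringIO(source).readline)]
#     newcode = untokenize(t1)
#     t2 = [tok[:2] for tok in generate_tokens(StringIO(newcode).readline)]
#     assert t1 == t2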


def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (parenlev > 0 and NL or NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)