# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token
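
# An illustrative usage sketch of the generate_tokens() interface described
# in the module docstring (the helper name _example_generate_tokens is
# hypothetical and not part of this module's API).  It wraps a source string
# in a readline-like callable and prints the five fields of every token.
def _example_generate_tokens(source="x = 1 + 2\n"):
    from StringIO import StringIO
    readline = StringIO(source).readline        # readline-like method
    for tok_type, tok_string, start, end, line in generate_tokens(readline):
        # tok_name (from token.py) maps the numeric token type to its name.
        print tok_name[tok_type], repr(tok_string), start, end, repr(line)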

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")
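
# A small demonstration of the ordering rule noted in the comment above
# (the helper name _operator_order_demo is hypothetical, for illustration
# only).  Python's re alternation is leftmost-first, so the longer operator
# must appear before its prefix in the pattern.
def _operator_order_demo():
    shortest_first = re.compile(group('=', '=='))    # shorter alternative first
    longest_first = re.compile(group('==', '='))     # longest operators first
    assert shortest_first.match('==').group(0) == '='   # stops after one '='
    assert longest_first.match('==').group(0) == '=='   # matches the full operator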

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
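
# A brief sketch of the callback interface documented in tokenize()'s
# docstring (the helper name _example_tokenize_callback is hypothetical).
# The tokeneater callable receives the same five fields per token that
# generate_tokens() yields; here it just collects NAME tokens.
def _example_tokenize_callback(source="a = b + c\n"):
    from StringIO import StringIO
    names = []
    def collect_names(type, token, start, end, line):
        if type == NAME:
            names.append(token)
    tokenize(StringIO(source).readline, collect_names)
    return names        # ['a', 'b', 'c'] for the default source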

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
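
# A concrete version of the limited round-trip invariant shown in the
# untokenize() docstring (the helper name _example_roundtrip is
# hypothetical).  Two-field tokens lose exact spacing, but the regenerated
# source tokenizes back to the same (type, string) pairs.
def _example_roundtrip(source="x = (1 +\n     2)\n"):
    from StringIO import StringIO
    t1 = [tok[:2] for tok in generate_tokens(StringIO(source).readline)]
    newcode = untokenize(t1)
    t2 = [tok[:2] for tok in generate_tokens(StringIO(newcode).readline)]
    assert t1 == t2
    return newcode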

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)