# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
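
# A minimal usage sketch, kept as a comment so importing this module is
# unaffected (assumes Python 2, where a StringIO object provides readline):
#
#     from StringIO import StringIO
#     from lib2to3.pgen2.tokenize import generate_tokens
#     from lib2to3.pgen2.token import tok_name
#     for tok_type, tok_str, start, end, line in generate_tokens(
#             StringIO("x = 1\n").readline):
#         print tok_name[tok_type], repr(tok_str), start, end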

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
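# For example, group('a', 'b') == '(a|b)', any('a', 'b') == '(a|b)*', and
# maybe('a', 'b') == '(a|b)?'; the token regexes below are built from these.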

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
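# PseudoToken is the pattern the tokenizer scans with: optional leading
# whitespace, then one alternative per token class captured as group 1.
# Illustrative sketch: re.match(PseudoToken, "   spam = 42").span(1) gives
# (3, 7), i.e. the NAME "spam" just after the indentation.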

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
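
# An illustrative tokeneater callback, analogous to printtoken above (a
# sketch only; "example.py" is a placeholder path, and any callable taking
# the same five arguments will do):
#
#     def report_names(type, token, start, end, line):
#         if type == NAME:
#             print "NAME %s at row %d, col %d" % (token, start[0], start[1])
#
#     tokenize(open("example.py").readline, report_names)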

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)')
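# For example, cookie_re.match("# -*- coding: iso-8859-1 -*-\n").group(1)
# yields "iso-8859-1", per the PEP 263 declaration forms.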

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
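
# A usage sketch, comment only ("setup.py" is a placeholder path); readline
# should yield bytes, so opening the file in binary mode is the safe choice:
#
#     fp = open("setup.py", "rb")
#     encoding, lines = detect_encoding(fp.readline)
#     # encoding is e.g. 'utf-8'; lines holds the (at most two) lines read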

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
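
# A round-trip sketch using only (type, value) pairs, which exercises the
# Untokenizer.compat() path mentioned above ("myfile.py" is a placeholder):
#
#     pairs = [tok[:2] for tok in generate_tokens(open("myfile.py").readline)]
#     source = untokenize(pairs)   # valid source text, though spacing may differ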

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
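    # Tokenizer state: lnum is the current line number, parenlev the current
    # bracket nesting depth, continued flags a backslash continuation,
    # contstr/contline accumulate a still-open multi-line string, and indents
    # is the stack of indentation columns used to emit INDENT/DEDENT tokens.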
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)