# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
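
# Usage sketch (added commentary, not part of the original module; assumes a
# hypothetical file "example.py"):
#
#     with open("example.py") as f:
#         for tok_type, tok_str, start, end, logical_line in generate_tokens(f.readline):
#             print(tok_name[tok_type], repr(tok_str), start, end)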

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
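# For illustration (added commentary): group('a', 'b') builds the pattern
# '(a|b)', any('a', 'b') builds '(a|b)*', and maybe('a', 'b') builds '(a|b)?'.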

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
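# Added commentary: the tokenizer loop below relies on group 1 of pseudoprog
# spanning exactly the token text.  For example,
# pseudoprog.match("   spam = 1", 0).span(1) would give (3, 7), i.e. the name
# "spam" after the leading whitespace.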
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

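# Added commentary: the two dicts built below are used only for fast
# membership tests on string-prefix tokens; their values are never read.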
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
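
# Example (added commentary, not part of the original module): a custom
# tokeneater callback for tokenize() receives the same five fields that
# generate_tokens() yields, e.g.
#
#     def name_printer(type, token, start, end, line):
#         if type == NAME:
#             print(token)
#
#     tokenize(open("example.py").readline, name_printer)  # hypothetical file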

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError.  Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
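
# Example (added commentary, not part of the original module): detect_encoding()
# expects a readline that yields bytes, e.g. a file opened in binary mode for a
# hypothetical path "example.py":
#
#     with open("example.py", "rb") as f:
#         encoding, consumed_lines = detect_encoding(f.readline)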

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)
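
# Added note: run as a script (e.g. "python tokenize.py somefile.py"), this
# prints one line per token via printtoken(), roughly of the form
# "1,0-1,5:  NAME    'print'".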