# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
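
# A minimal usage sketch (illustrative only; `source` here is a hypothetical
# string of Python code, not something this module defines):
#
#     import io
#     for tok_type, tok_str, start, end, line in generate_tokens(
#             io.StringIO(source).readline):
#         print(tok_name[tok_type], repr(tok_str), start, end)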

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
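# For example, group('a', 'b') yields '(a|b)', any(r'\d') yields r'(\d)*',
# and maybe(r'\d') yields r'(\d)?'.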

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

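# A hedged sketch of the callback-style interface above (the callback name and
# filename are made up for illustration):
#
#     def print_names(type, token, start, end, line):
#         if type == NAME:
#             print(token)
#
#     # tokenize(open("example.py").readline, print_names)
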
class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, raise a SyntaxError.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            else:
                # Allow it to be properly encoded and decoded.
                encoding = 'utf-8-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        if not first:
            return 'utf-8', []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return 'utf-8', [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return 'utf-8', [first, second]
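
# Sketch of driving detect_encoding() (assumes a hypothetical file opened in
# binary mode, since readline must return bytes here):
#
#     with open("example.py", "rb") as f:
#         encoding, lines_read = detect_encoding(f.readline)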

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
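
# Per the round-trip invariant documented above, untokenizing a full 5-tuple
# stream reproduces the source (sketch; `source` is a hypothetical string of
# Python code):
#
#     import io
#     toks = list(generate_tokens(io.StringIO(source).readline))
#     round_tripped = untokenize(toks)   # expected to equal `source`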

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                     token[:2] in single_quoted or \
                     token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)