# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
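
# A minimal usage sketch, assuming io.StringIO as the source of lines:
#
#     from io import StringIO
#     for typ, tok, start, end, line in generate_tokens(
#             StringIO("x = 1\n").readline):
#         print(tok_name[typ], repr(tok), start, end)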

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
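# For example, group('a', 'b') == '(a|b)', any('x') == '(x)*', and
# maybe('x') == '(x)?'.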

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
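# These patterns also cover Python 2 literals (which lib2to3 must accept),
# e.g. '0xffL', '017', '0b101', '1e10', '3.14j'.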

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8
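# Indentation is measured with 8-column tab stops, matching the C tokenizer.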

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object.  It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
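
# For example, dumping every token of a file (the path is hypothetical)
# through the default printtoken callback:
#
#     with open("example.py") as f:
#         tokenize(f.readline)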

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")
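# The pattern matches PEP 263 encoding declarations, e.g.:
#     # -*- coding: utf-8 -*-
#     # vim: set fileencoding=latin-1 :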

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, raise a SyntaxError.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found and codec.name != 'utf-8':
            # This behaviour mimics the Python interpreter
            raise SyntaxError('encoding problem: utf-8')
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        if not first:
            return 'utf-8', []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return 'utf-8', [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return 'utf-8', [first, second]
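
# A minimal usage sketch (the file name is hypothetical); note the stream
# must be opened in binary mode:
#
#     with open("example.py", "rb") as fp:
#         encoding, consumed_lines = detect_encoding(fp.readline)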
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(True)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)