# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
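
# Illustrative note (not part of the original file): tokenizing the single
# line "x = 1\n" with generate_tokens() yields 5-tuples such as
#     (NAME,    'x',  (1, 0), (1, 1), 'x = 1\n')
#     (OP,      '=',  (1, 2), (1, 3), 'x = 1\n')
#     (NUMBER,  '1',  (1, 4), (1, 5), 'x = 1\n')
#     (NEWLINE, '\n', (1, 5), (1, 6), 'x = 1\n')
# followed by a final ENDMARKER token.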

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
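
# For example (illustrative comment only): group('a', 'b') -> '(a|b)',
# any('x') -> '(x)*' and maybe('y') -> '(y)?'.  These helpers only build
# regular-expression source strings; the results are compiled further below.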

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
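
# A minimal usage sketch (not part of the original file); the io.StringIO
# source is a made-up example:
#     import io
#     tokenize(io.StringIO("y = 2\n").readline)
# prints one line per token via printtoken, in the "%d,%d-%d,%d:\t%s\t%s"
# format shown above.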

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, raise a SyntaxError.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = matches[0]
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found and codec.name != 'utf-8':
            # This behaviour mimics the Python interpreter
            raise SyntaxError('encoding problem: utf-8')
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        if not first:
            return 'utf-8', []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return 'utf-8', [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return 'utf-8', [first, second]
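
# A minimal sketch (not part of the original file) of detect_encoding() on an
# in-memory byte stream; the source bytes and the helper name are made up.
def _example_detect_encoding():
    import io
    source = b"# -*- coding: latin-1 -*-\nx = 1\n"
    encoding, lines_read = detect_encoding(io.BytesIO(source).readline)
    # encoding is the cookie string 'latin-1'; lines_read holds the one
    # (byte) line that was consumed while looking for the cookie.
    return encoding, lines_read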

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
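
# A hedged sketch (not part of the original file) of the limited-input
# round-trip invariant documented above; the source string is made up.
def _example_untokenize_roundtrip():
    import io
    source = "if x:\n    y = 1\n"
    t1 = [tok[:2] for tok in generate_tokens(io.StringIO(source).readline)]
    newcode = untokenize(t1)
    readline = iter(newcode.splitlines(1)).__next__
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    assert t1 == t2
    return newcode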

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
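
# A small sketch (not part of the original file) showing the COMMENT and NL
# tokens mentioned in the module docstring; the source string is made up.
def _example_comment_tokens():
    import io
    source = "# a comment\nx = 1\n"
    # Expected (token name, token string) pairs:
    #   COMMENT '# a comment', NL '\n', NAME 'x', OP '=', NUMBER '1',
    #   NEWLINE '\n', ENDMARKER ''
    return [(tok_name[tok[0]], tok[1])
            for tok in generate_tokens(io.StringIO(source).readline)]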

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)