# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token
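
# A minimal usage sketch of the generator interface defined below; the
# filename "example.py" is only illustrative and not part of this module:
#
#     f = open("example.py")
#     for tok_type, tok_string, start, end, line in generate_tokens(f.readline):
#         print tok_name[tok_type], repr(tok_string), start, end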

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
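# For example (illustrative values only): group('0', '1') -> '(0|1)',
# any('0', '1') -> '(0|1)*', and maybe('0', '1') -> '(0|1)?'.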

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
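
# A minimal sketch of the callback-style interface above; the callback name
# "show_token" and the filename "example.py" are illustrative only:
#
#     def show_token(tok_type, tok_string, start, end, line):
#         print tok_name[tok_type], repr(tok_string)
#
#     tokenize(open("example.py").readline, show_token)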

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument,
    readline, in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, a SyntaxError is raised as well.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            else:
                # Allow it to be properly encoded and decoded.
                encoding = 'utf-8-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
    if not first:
        return 'utf-8', []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return 'utf-8', [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return 'utf-8', [first, second]
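
# A minimal sketch of detect_encoding() usage; the filename "setup.py" is
# illustrative, and the file must be opened in binary mode so that readline()
# returns bytes:
#
#     fp = open("setup.py", "rb")
#     encoding, first_lines = detect_encoding(fp.readline)
#     fp.close()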

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)