# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except that instead of generating tokens, tokeneater is a
callback function to which the 5 fields described above are passed as 5
arguments each time a new token is found."""
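
# As an illustration, the single source line "1 + 2\n" tokenizes to roughly
# this stream of 5-tuples:
#
#     (NUMBER,    '1',  (1, 0), (1, 1), '1 + 2\n')
#     (OP,        '+',  (1, 2), (1, 3), '1 + 2\n')
#     (NUMBER,    '2',  (1, 4), (1, 5), '1 + 2\n')
#     (NEWLINE,   '\n', (1, 5), (1, 6), '1 + 2\n')
#     (ENDMARKER, '',   (2, 0), (2, 0), '')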

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
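
# These helpers only build regular-expression source strings; for instance,
# group('a', 'b') returns '(a|b)', any(r'\d') returns r'(\d)*', and
# maybe(r'\d') returns r'(\d)?'.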

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object.  It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
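
# A minimal usage sketch (the helper name below is hypothetical, not part of
# this module's API): count tokens by type by passing tokenize() a custom
# tokeneater callback instead of the default printtoken().
def _example_count_tokens(readline):
    counts = {}
    def eater(type, token, start, end, line):
        counts[tok_name[type]] = counts.get(tok_name[type], 0) + 1
    tokenize(readline, eater)
    return counts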

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")
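# For example, a source line such as "# -*- coding: latin-1 -*-" matches with
# the group "latin-1".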

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument,
    readline, in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, a SyntaxError will also be raised.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = matches[0]
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found and codec.name != 'utf-8':
            # This behaviour mimics the Python interpreter
            raise SyntaxError('encoding problem: utf-8')
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        if not first:
            return 'utf-8', []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return 'utf-8', [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return 'utf-8', [first, second]
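
# A minimal sketch of calling detect_encoding(); the helper name and file path
# are hypothetical.  The file must be opened in binary mode so that readline
# returns bytes rather than decoded text.
def _example_detect_encoding(path='example.py'):
    fp = open(path, 'rb')
    try:
        encoding, first_lines = detect_encoding(fp.readline)
    finally:
        fp.close()
    return encoding, first_lines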

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
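
# A hedged round-trip sketch mirroring the docstring above; the helper name is
# hypothetical and f is assumed to be an open text-mode source file.  Passing
# full 5-tuples lets untokenize() reproduce the original spacing.
def _example_roundtrip(f):
    tokens = list(generate_tokens(f.readline))   # full 5-tuples
    source = untokenize(tokens)
    readline = iter(source.splitlines(1)).next
    return [t[:2] for t in generate_tokens(readline)] == \
           [t[:2] for t in tokens]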

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable object that terminates with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
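
# A minimal sketch of driving generate_tokens() from an in-memory string; the
# helper name and the sample source text are arbitrary.  splitlines(1) keeps
# the line endings, and the exhausted iterator's StopIteration marks EOF.
def _example_tokenize_string(source="x = 1\n"):
    readline = iter(source.splitlines(1)).next
    return [(tok_name[typ], val)
            for typ, val, _, _, _ in generate_tokens(readline)]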

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)