Blame - Lib/lib2to3/pgen2/tokenize.py - platform/external/python/cpython3

blob: 33cfc33b7059f0685d5469cff555f8fbccd033d7 [file] [log] [blame]

Martin v. Löwis	ef04c44	2008-03-19 05:04:44 +0000	[diff] [blame]	1	# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
				2	# All rights reserved.
				3
				4	"""Tokenization help for Python programs.
				5
				6	generate_tokens(readline) is a generator that breaks a stream of
				7	text into Python tokens. It accepts a readline-like method which is called
				8	repeatedly to get the next line of input (or "" for EOF). It generates
				9	5-tuples with these members:
				10
				11	the token type (see token.py)
				12	the token (a string)
				13	the starting (row, column) indices of the token (a 2-tuple of ints)
				14	the ending (row, column) indices of the token (a 2-tuple of ints)
				15	the original line (string)
				16
				17	It is designed to match the working of the Python tokenizer exactly, except
				18	that it produces COMMENT tokens for comments and gives type OP for all
				19	operators
				20
				21	Older entry points
				22	tokenize_loop(readline, tokeneater)
				23	tokenize(readline, tokeneater=printtoken)
				24	are the same, except instead of generating tokens, tokeneater is a callback
				25	function to which the 5 fields described above are passed as 5 arguments,
				26	each time a new token is found."""
				27
				28	__author__ = 'Ka-Ping Yee <ping@lfw.org>'
				29	__credits__ = \
				30	'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
				31
				32	import string, re
				33	from lib2to3.pgen2.token import *
				34
				35	from . import token
				36	__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
				37	"generate_tokens", "untokenize"]
				38	del token
				39
				40	def group(*choices): return '(' + '\|'.join(choices) + ')'
				41	def any(choices): return group(choices) + '*'
				42	def maybe(choices): return group(choices) + '?'
				43
				44	Whitespace = r'[ \f\t]*'
				45	Comment = r'#[^\r\n]*'
				46	Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
				47	Name = r'[a-zA-Z_]\w*'
				48
				49	Binnumber = r'0[bB][01]*'
				50	Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
				51	Octnumber = r'0[oO]?[0-7]*[lL]?'
				52	Decnumber = r'[1-9]\d*[lL]?'
				53	Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
				54	Exponent = r'[eE][-+]?\d+'
				55	Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
				56	Expfloat = r'\d+' + Exponent
				57	Floatnumber = group(Pointfloat, Expfloat)
				58	Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
				59	Number = group(Imagnumber, Floatnumber, Intnumber)
				60
				61	# Tail end of ' string.
				62	Single = r"[^'\\](?:\\.[^'\\])*'"
				63	# Tail end of " string.
				64	Double = r'[^"\\](?:\\.[^"\\])*"'
				65	# Tail end of ''' string.
				66	Single3 = r"[^'\\](?:(?:\\.\|'(?!''))[^'\\])*'''"
				67	# Tail end of """ string.
				68	Double3 = r'[^"\\](?:(?:\\.\|"(?!""))[^"\\])*"""'
				69	Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
				70	# Single-line ' or " string.
				71	String = group(r"[uU]?[rR]?'[^\n'\\](?:\\.[^\n'\\])*'",
				72	r'[uU]?[rR]?"[^\n"\\](?:\\.[^\n"\\])*"')
				73
				74	# Because of leftmost-then-longest match semantics, be sure to put the
				75	# longest operators first (e.g., if = came before ==, == would get
				76	# recognized as two instances of =).
				77	Operator = group(r"\\=?", r">>=?", r"<<=?", r"<>", r"!=",
				78	r"//=?", r"->",
				79	r"[+\-*/%&\|^=<>]=?",
				80	r"~")
				81
				82	Bracket = '[][(){}]'
				83	Special = group(r'\r?\n', r'[:;.,`@]')
				84	Funny = group(Operator, Bracket, Special)
				85
				86	PlainToken = group(Number, Funny, String, Name)
				87	Token = Ignore + PlainToken
				88
				89	# First (or only) line of ' or " string.
				90	ContStr = group(r"[uUbB]?[rR]?'[^\n'\\](?:\\.[^\n'\\])*" +
				91	group("'", r'\\\r?\n'),
				92	r'[uUbB]?[rR]?"[^\n"\\](?:\\.[^\n"\\])*' +
				93	group('"', r'\\\r?\n'))
				94	PseudoExtras = group(r'\\\r?\n', Comment, Triple)
				95	PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
				96
# Compiled forms of the top-level patterns.
tokenprog = re.compile(Token)
pseudoprog = re.compile(PseudoToken)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

# endprogs maps a string opener to the compiled pattern that matches the rest
# of that string.  A bare prefix letter maps to None so that a lookup chain
# such as ``endprogs[a] or endprogs[b]`` can skip over prefix characters.
_prefixes = ('', 'r', 'R', 'u', 'U', 'b', 'B',
             'ur', 'uR', 'Ur', 'UR', 'br', 'bR', 'Br', 'BR')
endprogs = {"'": re.compile(Single), '"': re.compile(Double)}
endprogs.update({prefix + "'''": single3prog for prefix in _prefixes})
endprogs.update({prefix + '"""': double3prog for prefix in _prefixes})
endprogs.update(dict.fromkeys('rRuUbB'))
del _prefixes
				118
				119	triple_quoted = {}
				120	for t in ("'''", '"""',
				121	"r'''", 'r"""', "R'''", 'R"""',
				122	"u'''", 'u"""', "U'''", 'U"""',
				123	"b'''", 'b"""', "B'''", 'B"""',
				124	"ur'''", 'ur"""', "Ur'''", 'Ur"""',
				125	"uR'''", 'uR"""', "UR'''", 'UR"""',
				126	"br'''", 'br"""', "Br'''", 'Br"""',
				127	"bR'''", 'bR"""', "BR'''", 'BR"""',):
				128	triple_quoted[t] = t
				129	single_quoted = {}
				130	for t in ("'", '"',
				131	"r'", 'r"', "R'", 'R"',
				132	"u'", 'u"', "U'", 'U"',
				133	"b'", 'b"', "B'", 'B"',
				134	"ur'", 'ur"', "Ur'", 'Ur"',
				135	"uR'", 'uR"', "UR'", 'UR"',
				136	"br'", 'br"', "Br'", 'Br"',
				137	"bR'", 'bR"', "BR'", 'BR"', ):
				138	single_quoted[t] = t
				139
				140	tabsize = 8
				141
				142	class TokenError(Exception): pass
				143
				144	class StopTokenizing(Exception): pass
				145
def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
    """Print one token as ``srow,scol-erow,ecol:<TAB>TYPENAME<TAB>repr(token)``.

    The third and fourth arguments are the (row, column) pairs where the
    token starts and ends; *line* is accepted but not printed.
    """
    start_row, start_col = xxx_todo_changeme
    end_row, end_col = xxx_todo_changeme1
    fields = (start_row, start_col, end_row, end_col,
              tok_name[type], repr(token))
    print("%d,%d-%d,%d:\t%s\t%s" % fields)
Martin v. Löwis	ef04c44	2008-03-19 05:04:44 +0000	[diff] [blame]	151
				152	def tokenize(readline, tokeneater=printtoken):
				153	"""
				154	The tokenize() function accepts two parameters: one representing the
				155	input stream, and one providing an output mechanism for tokenize().
				156
				157	The first parameter, readline, must be a callable object which provides
				158	the same interface as the readline() method of built-in file objects.
				159	Each call to the function should return one line of input as a string.
				160
				161	The second parameter, tokeneater, must also be a callable object. It is
				162	called once for each token, with five arguments, corresponding to the
				163	tuples generated by generate_tokens().
				164	"""
				165	try:
				166	tokenize_loop(readline, tokeneater)
				167	except StopTokenizing:
				168	pass
				169
				170	# backwards compatible interface
				171	def tokenize_loop(readline, tokeneater):
				172	for token_info in generate_tokens(readline):
				173	tokeneater(*token_info)
				174
				175	class Untokenizer:
				176
				177	def __init__(self):
				178	self.tokens = []
				179	self.prev_row = 1
				180	self.prev_col = 0
				181
				182	def add_whitespace(self, start):
				183	row, col = start
				184	assert row <= self.prev_row
				185	col_offset = col - self.prev_col
				186	if col_offset:
				187	self.tokens.append(" " * col_offset)
				188
				189	def untokenize(self, iterable):
				190	for t in iterable:
				191	if len(t) == 2:
				192	self.compat(t, iterable)
				193	break
				194	tok_type, token, start, end, line = t
				195	self.add_whitespace(start)
				196	self.tokens.append(token)
				197	self.prev_row, self.prev_col = end
				198	if tok_type in (NEWLINE, NL):
				199	self.prev_row += 1
				200	self.prev_col = 0
				201	return "".join(self.tokens)
				202
				203	def compat(self, token, iterable):
				204	startline = False
				205	indents = []
				206	toks_append = self.tokens.append
				207	toknum, tokval = token
				208	if toknum in (NAME, NUMBER):
				209	tokval += ' '
				210	if toknum in (NEWLINE, NL):
				211	startline = True
				212	for tok in iterable:
				213	toknum, tokval = tok[:2]
				214
				215	if toknum in (NAME, NUMBER):
				216	tokval += ' '
				217
				218	if toknum == INDENT:
				219	indents.append(tokval)
				220	continue
				221	elif toknum == DEDENT:
				222	indents.pop()
				223	continue
				224	elif toknum in (NEWLINE, NL):
				225	startline = True
				226	elif startline and indents:
				227	toks_append(indents[-1])
				228	startline = False
				229	toks_append(tokval)
				230
				231	def untokenize(iterable):
				232	"""Transform tokens back into Python source code.
				233
				234	Each element returned by the iterable must be a token sequence
				235	with at least two elements, a token number and token value. If
				236	only two tokens are passed, the resulting output is poor.
				237
				238	Round-trip invariant for full input:
				239	Untokenized source will match input source exactly
				240
				241	Round-trip invariant for limited intput:
				242	# Output text will tokenize the back to the input
				243	t1 = [tok[:2] for tok in generate_tokens(f.readline)]
				244	newcode = untokenize(t1)
				245	readline = iter(newcode.splitlines(1)).next
				246	t2 = [tok[:2] for tokin generate_tokens(readline)]
				247	assert t1 == t2
				248	"""
				249	ut = Untokenizer()
				250	return ut.untokenize(iterable)
				251
				252	def generate_tokens(readline):
				253	"""
				254	The generate_tokens() generator requires one argment, readline, which
				255	must be a callable object which provides the same interface as the
				256	readline() method of built-in file objects. Each call to the function
				257	should return one line of input as a string. Alternately, readline
				258	can be a callable function terminating with StopIteration:
				259	readline = open(myfile).next # Example of alternate readline
				260
				261	The generator produces 5-tuples with these members: the token type; the
				262	token string; a 2-tuple (srow, scol) of ints specifying the row and
				263	column where the token begins in the source; a 2-tuple (erow, ecol) of
				264	ints specifying the row and column where the token ends in the source;
				265	and the line on which the token was found. The line passed is the
				266	logical line; continuation lines are included.
				267	"""
				268	lnum = parenlev = continued = 0
				269	namechars, numchars = string.ascii_letters + '_', '0123456789'
				270	contstr, needcont = '', 0
				271	contline = None
				272	indents = [0]
				273
				274	while 1: # loop over lines in stream
				275	try:
				276	line = readline()
				277	except StopIteration:
				278	line = ''
				279	lnum = lnum + 1
				280	pos, max = 0, len(line)
				281
				282	if contstr: # continued string
				283	if not line:
Martin v. Löwis	8a5f8ca	2008-03-19 05:33:36 +0000	[diff] [blame^]	284	raise TokenError("EOF in multi-line string", strstart)
Martin v. Löwis	ef04c44	2008-03-19 05:04:44 +0000	[diff] [blame]	285	endmatch = endprog.match(line)
				286	if endmatch:
				287	pos = end = endmatch.end(0)
				288	yield (STRING, contstr + line[:end],
				289	strstart, (lnum, end), contline + line)
				290	contstr, needcont = '', 0
				291	contline = None
				292	elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
				293	yield (ERRORTOKEN, contstr + line,
				294	strstart, (lnum, len(line)), contline)
				295	contstr = ''
				296	contline = None
				297	continue
				298	else:
				299	contstr = contstr + line
				300	contline = contline + line
				301	continue
				302
				303	elif parenlev == 0 and not continued: # new statement
				304	if not line: break
				305	column = 0
				306	while pos < max: # measure leading whitespace
				307	if line[pos] == ' ': column = column + 1
				308	elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
				309	elif line[pos] == '\f': column = 0
				310	else: break
				311	pos = pos + 1
				312	if pos == max: break
				313
				314	if line[pos] in '#\r\n': # skip comments or blank lines
				315	if line[pos] == '#':
				316	comment_token = line[pos:].rstrip('\r\n')
				317	nl_pos = pos + len(comment_token)
				318	yield (COMMENT, comment_token,
				319	(lnum, pos), (lnum, pos + len(comment_token)), line)
				320	yield (NL, line[nl_pos:],
				321	(lnum, nl_pos), (lnum, len(line)), line)
				322	else:
				323	yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
				324	(lnum, pos), (lnum, len(line)), line)
				325	continue
				326
				327	if column > indents[-1]: # count indents or dedents
				328	indents.append(column)
				329	yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
				330	while column < indents[-1]:
				331	if column not in indents:
				332	raise IndentationError(
				333	"unindent does not match any outer indentation level",
				334	("<tokenize>", lnum, pos, line))
				335	indents = indents[:-1]
				336	yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
				337
				338	else: # continued statement
				339	if not line:
Martin v. Löwis	8a5f8ca	2008-03-19 05:33:36 +0000	[diff] [blame^]	340	raise TokenError("EOF in multi-line statement", (lnum, 0))
Martin v. Löwis	ef04c44	2008-03-19 05:04:44 +0000	[diff] [blame]	341	continued = 0
				342
				343	while pos < max:
				344	pseudomatch = pseudoprog.match(line, pos)
				345	if pseudomatch: # scan for tokens
				346	start, end = pseudomatch.span(1)
				347	spos, epos, pos = (lnum, start), (lnum, end), end
				348	token, initial = line[start:end], line[start]
				349
				350	if initial in numchars or \
				351	(initial == '.' and token != '.'): # ordinary number
				352	yield (NUMBER, token, spos, epos, line)
				353	elif initial in '\r\n':
				354	newline = NEWLINE
				355	if parenlev > 0:
				356	newline = NL
				357	yield (newline, token, spos, epos, line)
				358	elif initial == '#':
				359	assert not token.endswith("\n")
				360	yield (COMMENT, token, spos, epos, line)
				361	elif token in triple_quoted:
				362	endprog = endprogs[token]
				363	endmatch = endprog.match(line, pos)
				364	if endmatch: # all on one line
				365	pos = endmatch.end(0)
				366	token = line[start:pos]
				367	yield (STRING, token, spos, (lnum, pos), line)
				368	else:
				369	strstart = (lnum, start) # multiple lines
				370	contstr = line[start:]
				371	contline = line
				372	break
				373	elif initial in single_quoted or \
				374	token[:2] in single_quoted or \
				375	token[:3] in single_quoted:
				376	if token[-1] == '\n': # continued string
				377	strstart = (lnum, start)
				378	endprog = (endprogs[initial] or endprogs[token[1]] or
				379	endprogs[token[2]])
				380	contstr, needcont = line[start:], 1
				381	contline = line
				382	break
				383	else: # ordinary string
				384	yield (STRING, token, spos, epos, line)
				385	elif initial in namechars: # ordinary name
				386	yield (NAME, token, spos, epos, line)
				387	elif initial == '\\': # continued stmt
				388	# This yield is new; needed for better idempotency:
				389	yield (NL, token, spos, (lnum, pos), line)
				390	continued = 1
				391	else:
				392	if initial in '([{': parenlev = parenlev + 1
				393	elif initial in ')]}': parenlev = parenlev - 1
				394	yield (OP, token, spos, epos, line)
				395	else:
				396	yield (ERRORTOKEN, line[pos],
				397	(lnum, pos), (lnum, pos+1), line)
				398	pos = pos + 1
				399
				400	for indent in indents[1:]: # pop remaining indent levels
				401	yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
				402	yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
				403
				404	if __name__ == '__main__': # testing
				405	import sys
				406	if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
				407	else: tokenize(sys.stdin.readline)