# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
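
# A minimal usage sketch (illustrative only; io.StringIO as the input
# source is an assumption of the demo, not a requirement of the API):
#
#     import io
#     for tok_type, tok_str, start, end, line in \
#             generate_tokens(io.StringIO("x = 1\n").readline):
#         print(tok_name[tok_type], repr(tok_str), start, end)
#
# prints NAME 'x' (1, 0) (1, 1), OP '=' (1, 2) (1, 3), NUMBER '1' (1, 4)
# (1, 5), and so on through NEWLINE and ENDMARKER.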

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
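
# For example (illustrative): group('a', 'b') yields '(a|b)', so
# any('a', 'b') yields '(a|b)*' and maybe('a', 'b') yields '(a|b)?'.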

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
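
# Illustrative matches (not exhaustive): Number accepts '0x1f', '0b101',
# '42L', '3.14e-2', and '2j'.  Imagnumber is listed first so that '3.14j'
# is matched whole rather than stopping at the float '3.14'.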

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because alternatives are tried from left to right (first match wins), be
# sure to put the longest operators first (e.g., if = came before ==, ==
# would get recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

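# Illustrative call (io.StringIO as the source is an assumption of the
# demo): dump every token of a small program via the default printtoken:
#
#     import io
#     tokenize(io.StringIO("if x:\n    y = 2\n").readline)
#
# prints one "srow,scol-erow,ecol:  TYPE  'token'" line per token.
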
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
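
# For example (illustrative): cookie_re matches '# -*- coding: utf-8 -*-'
# and captures 'utf-8'; blank_re matches byte lines containing only
# whitespace, a comment, or a line ending.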

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

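# Illustrative use (io.BytesIO as the source is an assumption of the demo):
#
#     import io
#     buf = io.BytesIO(b'# -*- coding: iso-8859-1 -*-\nx = 1\n')
#     encoding, lines = detect_encoding(buf.readline)
#     # encoding == 'iso-8859-1'; lines holds the raw byte line(s) consumed.
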
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

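# A concrete version of the limited-input invariant above (io.StringIO as
# the source is an assumption of the demo):
#
#     import io
#     source = "x = 1\n"
#     t1 = [tok[:2] for tok in generate_tokens(io.StringIO(source).readline)]
#     t2 = [tok[:2] for tok in
#           generate_tokens(io.StringIO(untokenize(t1)).readline)]
#     assert t1 == t2
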
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield (ASYNC if token == 'async' else AWAIT,
                                   token, spos, epos, line)
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token == 'def':
                        if (stashed
                                and stashed[0] == NAME
                                and stashed[1] == 'async'):

                            async_def = True
                            async_def_indent = indents[-1]

                            yield (ASYNC, stashed[1],
                                   stashed[2], stashed[3],
                                   stashed[4])
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)