Blame - Lib/lib2to3/pgen2/tokenize.py - platform/external/python/cpython3

blob: 896b0fa0ad47da9d2e704048fe89df15fb4a96b6 [file] [log] [blame]

Martin v. Löwis	ef04c44	2008-03-19 05:04:44 +0000	[diff] [blame]	1	# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
				2	# All rights reserved.
				3
				4	"""Tokenization help for Python programs.
				5
				6	generate_tokens(readline) is a generator that breaks a stream of
				7	text into Python tokens. It accepts a readline-like method which is called
				8	repeatedly to get the next line of input (or "" for EOF). It generates
				9	5-tuples with these members:
				10
				11	the token type (see token.py)
				12	the token (a string)
				13	the starting (row, column) indices of the token (a 2-tuple of ints)
				14	the ending (row, column) indices of the token (a 2-tuple of ints)
				15	the original line (string)
				16
				17	It is designed to match the working of the Python tokenizer exactly, except
				18	that it produces COMMENT tokens for comments and gives type OP for all
				19	operators
				20
				21	Older entry points
				22	tokenize_loop(readline, tokeneater)
				23	tokenize(readline, tokeneater=printtoken)
				24	are the same, except instead of generating tokens, tokeneater is a callback
				25	function to which the 5 fields described above are passed as 5 arguments,
				26	each time a new token is found."""
				27
				28	__author__ = 'Ka-Ping Yee <ping@lfw.org>'
				29	__credits__ = \
				30	'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
				31
				32	import string, re
Benjamin Peterson	d481e3d	2009-05-09 19:42:23 +0000	[diff] [blame]	33	from codecs import BOM_UTF8, lookup
Martin v. Löwis	ef04c44	2008-03-19 05:04:44 +0000	[diff] [blame]	34	from lib2to3.pgen2.token import *
				35
				36	from . import token
				37	__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
				38	"generate_tokens", "untokenize"]
				39	del token
				40
try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str
				47
def group(*choices):
    """Return a regex group that matches any one of *choices*: ``(a|b|...)``."""
    return '(' + '|'.join(choices) + ')'


# NOTE: this shadows the builtin any() for the rest of the module; the name
# is kept because other code in this file calls it.
def any(*choices):
    """Return a regex matching zero or more repetitions of any of *choices*."""
    return group(*choices) + '*'


def maybe(*choices):
    """Return a regex matching at most one occurrence of any of *choices*."""
    return group(*choices) + '?'


Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
# Leading whitespace, any number of backslash-continued line ends, and an
# optional trailing comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
# Alternation is first-match, so the more specific forms come first.
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# Group 1 of this pattern is the token itself (leading whitespace excluded).
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
				104
tokenprog, pseudoprog, single3prog, double3prog = [
    re.compile(pattern)
    for pattern in (Token, PseudoToken, Single3, Double3)]

# Maps the opening of a string literal to the compiled pattern that locates
# the end of that string.  Bare prefix letters map to None so that
# generate_tokens() can write
#     endprogs[initial] or endprogs[token[1]] or endprogs[token[2]]
# and fall through the prefix characters to the actual quote character.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}
				126
				127	triple_quoted = {}
				128	for t in ("'''", '"""',
				129	"r'''", 'r"""', "R'''", 'R"""',
				130	"u'''", 'u"""', "U'''", 'U"""',
				131	"b'''", 'b"""', "B'''", 'B"""',
				132	"ur'''", 'ur"""', "Ur'''", 'Ur"""',
				133	"uR'''", 'uR"""', "UR'''", 'UR"""',
				134	"br'''", 'br"""', "Br'''", 'Br"""',
				135	"bR'''", 'bR"""', "BR'''", 'BR"""',):
				136	triple_quoted[t] = t
				137	single_quoted = {}
				138	for t in ("'", '"',
				139	"r'", 'r"', "R'", 'R"',
				140	"u'", 'u"', "U'", 'U"',
				141	"b'", 'b"', "B'", 'B"',
				142	"ur'", 'ur"', "Ur'", 'Ur"',
				143	"uR'", 'uR"', "UR'", 'UR"',
				144	"br'", 'br"', "Br'", 'Br"',
				145	"bR'", 'bR"', "BR'", 'BR"', ):
				146	single_quoted[t] = t
				147
				148	tabsize = 8
				149
				150	class TokenError(Exception): pass
				151
				152	class StopTokenizing(Exception): pass
				153
def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
    """Print one token as ``srow,scol-erow,ecol:<TAB>TYPE-NAME<TAB>repr(token)``.

    The parameters are the five members of a token tuple:
    *type* is the numeric token type (looked up in ``tok_name``), *token* the
    token string, *xxx_todo_changeme* the (row, col) start position,
    *xxx_todo_changeme1* the (row, col) end position and *line* the source
    line (accepted for signature compatibility; not printed).
    """
    srow, scol = xxx_todo_changeme
    erow, ecol = xxx_todo_changeme1
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))
Martin v. Löwis	ef04c44	2008-03-19 05:04:44 +0000	[diff] [blame]	159
				160	def tokenize(readline, tokeneater=printtoken):
				161	"""
				162	The tokenize() function accepts two parameters: one representing the
				163	input stream, and one providing an output mechanism for tokenize().
				164
				165	The first parameter, readline, must be a callable object which provides
				166	the same interface as the readline() method of built-in file objects.
				167	Each call to the function should return one line of input as a string.
				168
				169	The second parameter, tokeneater, must also be a callable object. It is
				170	called once for each token, with five arguments, corresponding to the
				171	tuples generated by generate_tokens().
				172	"""
				173	try:
				174	tokenize_loop(readline, tokeneater)
				175	except StopTokenizing:
				176	pass
				177
				178	# backwards compatible interface
				179	def tokenize_loop(readline, tokeneater):
				180	for token_info in generate_tokens(readline):
				181	tokeneater(*token_info)
				182
				183	class Untokenizer:
				184
				185	def __init__(self):
				186	self.tokens = []
				187	self.prev_row = 1
				188	self.prev_col = 0
				189
				190	def add_whitespace(self, start):
				191	row, col = start
				192	assert row <= self.prev_row
				193	col_offset = col - self.prev_col
				194	if col_offset:
				195	self.tokens.append(" " * col_offset)
				196
				197	def untokenize(self, iterable):
				198	for t in iterable:
				199	if len(t) == 2:
				200	self.compat(t, iterable)
				201	break
				202	tok_type, token, start, end, line = t
				203	self.add_whitespace(start)
				204	self.tokens.append(token)
				205	self.prev_row, self.prev_col = end
				206	if tok_type in (NEWLINE, NL):
				207	self.prev_row += 1
				208	self.prev_col = 0
				209	return "".join(self.tokens)
				210
				211	def compat(self, token, iterable):
				212	startline = False
				213	indents = []
				214	toks_append = self.tokens.append
				215	toknum, tokval = token
				216	if toknum in (NAME, NUMBER):
				217	tokval += ' '
				218	if toknum in (NEWLINE, NL):
				219	startline = True
				220	for tok in iterable:
				221	toknum, tokval = tok[:2]
				222
Yury Selivanov	7544508	2015-05-11 22:57:16 -0400	[diff] [blame]	223	if toknum in (NAME, NUMBER, ASYNC, AWAIT):
Martin v. Löwis	ef04c44	2008-03-19 05:04:44 +0000	[diff] [blame]	224	tokval += ' '
				225
				226	if toknum == INDENT:
				227	indents.append(tokval)
				228	continue
				229	elif toknum == DEDENT:
				230	indents.pop()
				231	continue
				232	elif toknum in (NEWLINE, NL):
				233	startline = True
				234	elif startline and indents:
				235	toks_append(indents[-1])
				236	startline = False
				237	toks_append(tokval)
				238
# Matches a comment line carrying an encoding declaration (PEP 263 style);
# group 1 is the declared encoding name.
cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
# Matches a line that is empty, whitespace-only, or a comment (bytes input).
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
Benjamin Peterson	d481e3d	2009-05-09 19:42:23 +0000	[diff] [blame]	241
Benjamin Peterson	d9af52b	2009-11-02 18:16:28 +0000	[diff] [blame]	242	def _get_normal_name(orig_enc):
				243	"""Imitates get_normal_name in tokenizer.c."""
				244	# Only care about the first 12 characters.
				245	enc = orig_enc[:12].lower().replace("_", "-")
				246	if enc == "utf-8" or enc.startswith("utf-8-"):
				247	return "utf-8"
				248	if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
				249	enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
				250	return "iso-8859-1"
				251	return orig_enc
				252
def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    default = 'utf-8'

    def read_or_stop():
        # readline may signal end of input by raising StopIteration; treat
        # that the same as an empty read.
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        # Return the encoding declared on *line*, or None when there is none.
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[len(BOM_UTF8):]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    # A cookie on the second line only counts when the first line is blank
    # or a comment.
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
Benjamin Peterson	d481e3d	2009-05-09 19:42:23 +0000	[diff] [blame]	325
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two-element tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
				346
				347	def generate_tokens(readline):
				348	"""
Ezio Melotti	4bcc796	2013-11-25 05:14:51 +0200	[diff] [blame]	349	The generate_tokens() generator requires one argument, readline, which
Martin v. Löwis	ef04c44	2008-03-19 05:04:44 +0000	[diff] [blame]	350	must be a callable object which provides the same interface as the
				351	readline() method of built-in file objects. Each call to the function
				352	should return one line of input as a string. Alternately, readline
				353	can be a callable function terminating with StopIteration:
				354	readline = open(myfile).next # Example of alternate readline
				355
				356	The generator produces 5-tuples with these members: the token type; the
				357	token string; a 2-tuple (srow, scol) of ints specifying the row and
				358	column where the token begins in the source; a 2-tuple (erow, ecol) of
				359	ints specifying the row and column where the token ends in the source;
				360	and the line on which the token was found. The line passed is the
				361	logical line; continuation lines are included.
				362	"""
				363	lnum = parenlev = continued = 0
				364	namechars, numchars = string.ascii_letters + '_', '0123456789'
				365	contstr, needcont = '', 0
				366	contline = None
				367	indents = [0]
				368
Yury Selivanov	7544508	2015-05-11 22:57:16 -0400	[diff] [blame]	369	# 'stashed' and 'ctx' are used for async/await parsing
				370	stashed = None
				371	ctx = [('sync', 0)]
Yury Selivanov	8fb307c	2015-07-22 13:33:45 +0300	[diff] [blame]	372	in_async = 0
Yury Selivanov	7544508	2015-05-11 22:57:16 -0400	[diff] [blame]	373
Martin v. Löwis	ef04c44	2008-03-19 05:04:44 +0000	[diff] [blame]	374	while 1: # loop over lines in stream
				375	try:
				376	line = readline()
				377	except StopIteration:
				378	line = ''
				379	lnum = lnum + 1
				380	pos, max = 0, len(line)
				381
				382	if contstr: # continued string
				383	if not line:
Martin v. Löwis	8a5f8ca	2008-03-19 05:33:36 +0000	[diff] [blame]	384	raise TokenError("EOF in multi-line string", strstart)
Martin v. Löwis	ef04c44	2008-03-19 05:04:44 +0000	[diff] [blame]	385	endmatch = endprog.match(line)
				386	if endmatch:
				387	pos = end = endmatch.end(0)
				388	yield (STRING, contstr + line[:end],
				389	strstart, (lnum, end), contline + line)
				390	contstr, needcont = '', 0
				391	contline = None
				392	elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
				393	yield (ERRORTOKEN, contstr + line,
				394	strstart, (lnum, len(line)), contline)
				395	contstr = ''
				396	contline = None
				397	continue
				398	else:
				399	contstr = contstr + line
				400	contline = contline + line
				401	continue
				402
				403	elif parenlev == 0 and not continued: # new statement
				404	if not line: break
				405	column = 0
				406	while pos < max: # measure leading whitespace
				407	if line[pos] == ' ': column = column + 1
Benjamin Peterson	d9af52b	2009-11-02 18:16:28 +0000	[diff] [blame]	408	elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
Martin v. Löwis	ef04c44	2008-03-19 05:04:44 +0000	[diff] [blame]	409	elif line[pos] == '\f': column = 0
				410	else: break
				411	pos = pos + 1
				412	if pos == max: break
				413
Yury Selivanov	7544508	2015-05-11 22:57:16 -0400	[diff] [blame]	414	if stashed:
				415	yield stashed
				416	stashed = None
				417
Martin v. Löwis	ef04c44	2008-03-19 05:04:44 +0000	[diff] [blame]	418	if line[pos] in '#\r\n': # skip comments or blank lines
				419	if line[pos] == '#':
				420	comment_token = line[pos:].rstrip('\r\n')
				421	nl_pos = pos + len(comment_token)
				422	yield (COMMENT, comment_token,
				423	(lnum, pos), (lnum, pos + len(comment_token)), line)
				424	yield (NL, line[nl_pos:],
				425	(lnum, nl_pos), (lnum, len(line)), line)
				426	else:
				427	yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
				428	(lnum, pos), (lnum, len(line)), line)
				429	continue
				430
				431	if column > indents[-1]: # count indents or dedents
				432	indents.append(column)
				433	yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
				434	while column < indents[-1]:
				435	if column not in indents:
				436	raise IndentationError(
				437	"unindent does not match any outer indentation level",
				438	("<tokenize>", lnum, pos, line))
				439	indents = indents[:-1]
Yury Selivanov	8fb307c	2015-07-22 13:33:45 +0300	[diff] [blame]	440
				441	cur_indent = indents[-1]
				442	while len(ctx) > 1 and ctx[-1][1] >= cur_indent:
				443	if ctx[-1][0] == 'async':
				444	in_async -= 1
				445	assert in_async >= 0
				446	ctx.pop()
				447
Martin v. Löwis	ef04c44	2008-03-19 05:04:44 +0000	[diff] [blame]	448	yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
				449
				450	else: # continued statement
				451	if not line:
Martin v. Löwis	8a5f8ca	2008-03-19 05:33:36 +0000	[diff] [blame]	452	raise TokenError("EOF in multi-line statement", (lnum, 0))
Martin v. Löwis	ef04c44	2008-03-19 05:04:44 +0000	[diff] [blame]	453	continued = 0
				454
				455	while pos < max:
				456	pseudomatch = pseudoprog.match(line, pos)
				457	if pseudomatch: # scan for tokens
				458	start, end = pseudomatch.span(1)
				459	spos, epos, pos = (lnum, start), (lnum, end), end
				460	token, initial = line[start:end], line[start]
				461
				462	if initial in numchars or \
				463	(initial == '.' and token != '.'): # ordinary number
				464	yield (NUMBER, token, spos, epos, line)
				465	elif initial in '\r\n':
				466	newline = NEWLINE
				467	if parenlev > 0:
				468	newline = NL
Yury Selivanov	7544508	2015-05-11 22:57:16 -0400	[diff] [blame]	469	if stashed:
				470	yield stashed
				471	stashed = None
Martin v. Löwis	ef04c44	2008-03-19 05:04:44 +0000	[diff] [blame]	472	yield (newline, token, spos, epos, line)
				473	elif initial == '#':
				474	assert not token.endswith("\n")
Yury Selivanov	7544508	2015-05-11 22:57:16 -0400	[diff] [blame]	475	if stashed:
				476	yield stashed
				477	stashed = None
Martin v. Löwis	ef04c44	2008-03-19 05:04:44 +0000	[diff] [blame]	478	yield (COMMENT, token, spos, epos, line)
				479	elif token in triple_quoted:
				480	endprog = endprogs[token]
				481	endmatch = endprog.match(line, pos)
				482	if endmatch: # all on one line
				483	pos = endmatch.end(0)
				484	token = line[start:pos]
Yury Selivanov	7544508	2015-05-11 22:57:16 -0400	[diff] [blame]	485	if stashed:
				486	yield stashed
				487	stashed = None
Martin v. Löwis	ef04c44	2008-03-19 05:04:44 +0000	[diff] [blame]	488	yield (STRING, token, spos, (lnum, pos), line)
				489	else:
				490	strstart = (lnum, start) # multiple lines
				491	contstr = line[start:]
				492	contline = line
				493	break
				494	elif initial in single_quoted or \
				495	token[:2] in single_quoted or \
				496	token[:3] in single_quoted:
				497	if token[-1] == '\n': # continued string
				498	strstart = (lnum, start)
				499	endprog = (endprogs[initial] or endprogs[token[1]] or
				500	endprogs[token[2]])
				501	contstr, needcont = line[start:], 1
				502	contline = line
				503	break
				504	else: # ordinary string
Yury Selivanov	7544508	2015-05-11 22:57:16 -0400	[diff] [blame]	505	if stashed:
				506	yield stashed
				507	stashed = None
Martin v. Löwis	ef04c44	2008-03-19 05:04:44 +0000	[diff] [blame]	508	yield (STRING, token, spos, epos, line)
				509	elif initial in namechars: # ordinary name
Yury Selivanov	7544508	2015-05-11 22:57:16 -0400	[diff] [blame]	510	if token in ('async', 'await'):
Yury Selivanov	8fb307c	2015-07-22 13:33:45 +0300	[diff] [blame]	511	if in_async:
Yury Selivanov	7544508	2015-05-11 22:57:16 -0400	[diff] [blame]	512	yield (ASYNC if token == 'async' else AWAIT,
				513	token, spos, epos, line)
				514	continue
				515
				516	tok = (NAME, token, spos, epos, line)
				517	if token == 'async' and not stashed:
				518	stashed = tok
				519	continue
				520
				521	if token == 'def':
				522	if (stashed
				523	and stashed[0] == NAME
				524	and stashed[1] == 'async'):
				525
				526	ctx.append(('async', indents[-1]))
Yury Selivanov	8fb307c	2015-07-22 13:33:45 +0300	[diff] [blame]	527	in_async += 1
Yury Selivanov	7544508	2015-05-11 22:57:16 -0400	[diff] [blame]	528
				529	yield (ASYNC, stashed[1],
				530	stashed[2], stashed[3],
				531	stashed[4])
				532	stashed = None
				533	else:
				534	ctx.append(('sync', indents[-1]))
				535
				536	if stashed:
				537	yield stashed
				538	stashed = None
				539
				540	yield tok
Martin v. Löwis	ef04c44	2008-03-19 05:04:44 +0000	[diff] [blame]	541	elif initial == '\\': # continued stmt
				542	# This yield is new; needed for better idempotency:
Yury Selivanov	7544508	2015-05-11 22:57:16 -0400	[diff] [blame]	543	if stashed:
				544	yield stashed
				545	stashed = None
Martin v. Löwis	ef04c44	2008-03-19 05:04:44 +0000	[diff] [blame]	546	yield (NL, token, spos, (lnum, pos), line)
				547	continued = 1
				548	else:
				549	if initial in '([{': parenlev = parenlev + 1
				550	elif initial in ')]}': parenlev = parenlev - 1
Yury Selivanov	7544508	2015-05-11 22:57:16 -0400	[diff] [blame]	551	if stashed:
				552	yield stashed
				553	stashed = None
Martin v. Löwis	ef04c44	2008-03-19 05:04:44 +0000	[diff] [blame]	554	yield (OP, token, spos, epos, line)
				555	else:
				556	yield (ERRORTOKEN, line[pos],
				557	(lnum, pos), (lnum, pos+1), line)
				558	pos = pos + 1
				559
Yury Selivanov	7544508	2015-05-11 22:57:16 -0400	[diff] [blame]	560	if stashed:
				561	yield stashed
				562	stashed = None
				563
Martin v. Löwis	ef04c44	2008-03-19 05:04:44 +0000	[diff] [blame]	564	for indent in indents[1:]: # pop remaining indent levels
				565	yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
				566	yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
				567
				568	if __name__ == '__main__': # testing
				569	import sys
				570	if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
				571	else: tokenize(sys.stdin.readline)