# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
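
# A minimal usage sketch (the snippet below is illustrative): the generator
# can be driven by any readline-compatible callable, e.g. io.StringIO.
#
#     import io
#     for tok in generate_tokens(io.StringIO("x = 1\n").readline):
#         print(tok)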

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
def _combinations(*l):
    return set(
        x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()
    )

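# For illustration, the helpers above compose alternation patterns:
#   group('a', 'b')  -> '(a|b)'
#   any(r'\d')       -> '(\d)*'
#   maybe('x', 'y')  -> '(x|y)?'
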
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

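# Illustrative strings accepted by the patterns above (underscore digit
# separators are allowed; the trailing 'l'/'L' supports 2to3's Python 2 input):
#   re.fullmatch(Hexnumber, '0x_FF')          # matches
#   re.fullmatch(Decnumber, '1_000L')         # matches
#   re.fullmatch(Imagnumber, '1_000.5e-3j')   # matches
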
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))

_strprefixes = (
    _combinations('r', 'R', 'f', 'F') |
    _combinations('r', 'R', 'b', 'B') |
    {'u', 'U', 'ur', 'uR', 'Ur', 'UR'}
)

endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            **{f"{prefix}'''": single3prog for prefix in _strprefixes},
            **{f'{prefix}"""': double3prog for prefix in _strprefixes},
            **{prefix: None for prefix in _strprefixes}}

triple_quoted = (
    {"'''", '"""'} |
    {f"{prefix}'''" for prefix in _strprefixes} |
    {f'{prefix}"""' for prefix in _strprefixes}
)
single_quoted = (
    {"'", '"'} |
    {f"{prefix}'" for prefix in _strprefixes} |
    {f'{prefix}"' for prefix in _strprefixes}
)

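# Illustrative membership in the prefix-expanded sets above:
#   "rb'''" in triple_quoted   # True
#   "BR'" in single_quoted     # True
#   "fb'''" in triple_quoted   # False: 'f' and 'b' never combine
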
tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

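# A minimal usage sketch for the callback-style entry point; the tokeneater
# shown below is purely illustrative:
#
#     import io
#     def show(type, token, start, end, line):
#         print(tok_name[type], repr(token))
#     tokenize(io.StringIO("pass\n").readline, show)
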
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

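# For illustration:
#   cookie_re.match("# -*- coding: utf-8 -*-").group(1)   # 'utf-8'
#   bool(blank_re.match(b"   # a comment\n"))             # True
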
def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

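# For illustration:
#   _get_normal_name("UTF_8")    # 'utf-8'
#   _get_normal_name("Latin-1")  # 'iso-8859-1'
#   _get_normal_name("cp1252")   # 'cp1252' (passed through unchanged)
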
def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present but
    disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, a SyntaxError will be raised as well. Note that if a
    UTF-8 BOM is found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

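# A minimal usage sketch: detect_encoding() works on any readline over bytes,
# for example io.BytesIO(...).readline.
#
#     import io
#     enc, lines = detect_encoding(
#         io.BytesIO(b"# -*- coding: latin-1 -*-\n").readline)
#     # enc == 'iso-8859-1'; lines holds the raw line(s) already consumed
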
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

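# A minimal round-trip sketch using full 5-tuples (exact whitespace is only
# preserved in this full-tuple mode):
#
#     import io
#     source = "x = 1\n"
#     toks = list(generate_tokens(io.StringIO(source).readline))
#     assert untokenize(toks) == source
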
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield (ASYNC if token == 'async' else AWAIT,
                                   token, spos, epos, line)
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token == 'def':
                        if (stashed
                            and stashed[0] == NAME
                            and stashed[1] == 'async'):

                            async_def = True
                            async_def_indent = indents[-1]

                            yield (ASYNC, stashed[1],
                                   stashed[2], stashed[3],
                                   stashed[4])
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

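# For illustration, the source "if x:\n    pass\n" yields, in order:
# NAME 'if', NAME 'x', OP ':', NEWLINE, INDENT, NAME 'pass', NEWLINE,
# DEDENT, ENDMARKER (keywords come through as plain NAME tokens).
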
if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)