# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
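
# A minimal usage sketch (assuming io.StringIO as the line source):
#
#     from io import StringIO
#     for tok in generate_tokens(StringIO("x = 1\n").readline):
#         print(tok)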

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
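# For example, group('a', 'b') == '(a|b)', any('a', 'b') == '(a|b)*', and
# maybe('a', 'b') == '(a|b)?'.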

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
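# For example, Number matches '0b1010', '0x_FF', '1_000L', '3.14e-10', and
# '2j'; the optional [lL] suffix and underscore separators keep Python 2
# long literals and PEP 515 digit grouping tokenizable.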

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbB]|[rR][bB]|[bBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
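# Map an opening quote, with any legal string prefix, to the regex matching
# the rest of the string; bare prefix keys map to None and are resolved
# against the actual quote character during tokenization.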
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "rb'''": single3prog, 'rb"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            "rB'''": single3prog, 'rB"""': double3prog,
            "Rb'''": single3prog, 'Rb"""': double3prog,
            "RB'''": single3prog, 'RB"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',
          "rb'''", 'rb"""', "Rb'''", 'Rb"""',
          "rB'''", 'rB"""', "RB'''", 'RB"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"',
          "rb'", 'rb"', "Rb'", 'Rb"',
          "rB'", 'rB"', "RB'", 'RB"',):
    single_quoted[t] = t

tabsize = 8
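# Tabs advance the column to the next multiple of tabsize when leading
# whitespace is measured (see generate_tokens() below).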

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
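
# Example sketch (assuming io.StringIO as the input stream): collect token
# strings with a custom tokeneater callback.
#
#     from io import StringIO
#     toks = []
#     tokenize(StringIO("x = 1\n").readline,
#              lambda type, token, start, end, line: toks.append(token))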

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
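# For example, _get_normal_name("UTF_8") returns "utf-8" and
# _get_normal_name("Latin-1") returns "iso-8859-1"; unrecognized names are
# returned unchanged.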

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present but
    disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, a SyntaxError will also be raised. Note that if a UTF-8
    BOM is found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
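
# Example sketch (assuming io.BytesIO as the byte stream): reading a
# PEP 263 encoding cookie.
#
#     from io import BytesIO
#     detect_encoding(BytesIO(b'# -*- coding: latin-1 -*-\n').readline)
#     # -> ('iso-8859-1', [b'# -*- coding: latin-1 -*-\n'])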

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                    # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
525 elif initial in namechars: # ordinary name
Yury Selivanov75445082015-05-11 22:57:16 -0400526 if token in ('async', 'await'):
Yury Selivanov96ec9342015-07-23 15:01:58 +0300527 if async_def:
Yury Selivanov75445082015-05-11 22:57:16 -0400528 yield (ASYNC if token == 'async' else AWAIT,
529 token, spos, epos, line)
530 continue
531
532 tok = (NAME, token, spos, epos, line)
533 if token == 'async' and not stashed:
534 stashed = tok
535 continue
536
537 if token == 'def':
538 if (stashed
539 and stashed[0] == NAME
540 and stashed[1] == 'async'):
541
Yury Selivanov96ec9342015-07-23 15:01:58 +0300542 async_def = True
543 async_def_indent = indents[-1]
Yury Selivanov75445082015-05-11 22:57:16 -0400544
545 yield (ASYNC, stashed[1],
546 stashed[2], stashed[3],
547 stashed[4])
548 stashed = None
Yury Selivanov75445082015-05-11 22:57:16 -0400549
550 if stashed:
551 yield stashed
552 stashed = None
553
554 yield tok
Martin v. Löwisef04c442008-03-19 05:04:44 +0000555 elif initial == '\\': # continued stmt
556 # This yield is new; needed for better idempotency:
Yury Selivanov75445082015-05-11 22:57:16 -0400557 if stashed:
558 yield stashed
559 stashed = None
Martin v. Löwisef04c442008-03-19 05:04:44 +0000560 yield (NL, token, spos, (lnum, pos), line)
561 continued = 1
562 else:
563 if initial in '([{': parenlev = parenlev + 1
564 elif initial in ')]}': parenlev = parenlev - 1
Yury Selivanov75445082015-05-11 22:57:16 -0400565 if stashed:
566 yield stashed
567 stashed = None
Martin v. Löwisef04c442008-03-19 05:04:44 +0000568 yield (OP, token, spos, epos, line)
569 else:
570 yield (ERRORTOKEN, line[pos],
571 (lnum, pos), (lnum, pos+1), line)
572 pos = pos + 1
573
    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)