# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
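
# A minimal usage sketch (illustrative; not part of the original module):
# pass any readline-style callable to generate_tokens() and iterate over
# the 5-tuples it yields.
#
#   import io
#   for tok in generate_tokens(io.StringIO("x = 1\n").readline):
#       print(tok)   # NAME, OP, NUMBER, NEWLINE, ENDMARKER 5-tuples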

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
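
# Illustrative expansions of the helpers above:
#   group('a', 'b') -> '(a|b)'
#   any('a', 'b')   -> '(a|b)*'
#   maybe('a', 'b') -> '(a|b)?'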

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
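
# Illustrative matches (not exhaustive): '0b1010' (Binnumber),
# '0xDEAD_BEEF' (Hexnumber), '0o777' (Octnumber), '1_000' (Decnumber),
# '3.14e-10' (Floatnumber), '2j' (Imagnumber).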

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][bB]|[bBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
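
# For instance (illustrative): pseudoprog.match("  x = 1\n").span(1) is (2, 3);
# the leading Whitespace is consumed and group 1 spans the first token, "x".
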
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "f'''": single3prog, 'f"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "rb'''": single3prog, 'rb"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "F'''": single3prog, 'F"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            "rB'''": single3prog, 'rB"""': double3prog,
            "Rb'''": single3prog, 'Rb"""': double3prog,
            "RB'''": single3prog, 'RB"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'f': None, 'F': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "f'''", 'f"""', "F'''", 'F"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',
          "rb'''", 'rb"""', "Rb'''", 'Rb"""',
          "rB'''", 'rB"""', "RB'''", 'RB"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "f'", 'f"', "F'", 'F"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"',
          "rb'", 'rb"', "Rb'", 'Rb"',
          "rB'", 'rB"', "RB'", 'RB"',):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
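
# A hedged usage sketch (the callable below is illustrative): any function
# with printtoken's five-argument signature can serve as the tokeneater.
#
#   import io
#   names = []
#   def collect_names(type, token, start, end, line):
#       if type == NAME:
#           names.append(token)
#   tokenize(io.StringIO("x = y\n").readline, collect_names)
#   # names == ['x', 'y']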

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
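
# For example: _get_normal_name("UTF_8") == "utf-8" and
# _get_normal_name("Latin-1") == "iso-8859-1"; unrecognized names pass through.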

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, raise a SyntaxError. Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

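# A minimal usage sketch (illustrative; "example.py" is a placeholder path):
#
#   with open("example.py", "rb") as f:
#       encoding, consumed_lines = detect_encoding(f.readline)
#   # 'encoding' is e.g. 'utf-8'; 'consumed_lines' holds the raw bytes lines
#   # read while sniffing, so they can be replayed to a tokenizer.
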
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                    # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:               # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':      # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                      # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:     # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield (ASYNC if token == 'async' else AWAIT,
                                   token, spos, epos, line)
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token == 'def':
                        if (stashed
                                and stashed[0] == NAME
                                and stashed[1] == 'async'):

                            async_def = True
                            async_def_indent = indents[-1]

                            yield (ASYNC, stashed[1],
                                   stashed[2], stashed[3],
                                   stashed[4])
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':          # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)