# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

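# A minimal usage sketch (illustrative only; generate_tokens and tok_name
# are defined/imported further down in this module):
#
#     import io
#     for tok_type, tok_str, start, end, line in generate_tokens(
#             io.StringIO("x = 1\n").readline):
#         print(tok_name[tok_type], repr(tok_str), start, end)
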
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

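# For instance (a sketch of what these helpers emit, not executed here):
#
#     group('a', 'b')  ->  '(a|b)'
#     any('a', 'b')    ->  '(a|b)*'
#     maybe('a', 'b')  ->  '(a|b)?'
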
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

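# Examples of literals these patterns accept (a comment-only sketch covering
# PEP 515 underscore separators and the legacy 2.x 'L' suffix):
#
#     0b1010_1010      matches Binnumber
#     0x_FF, 0o777L    match Hexnumber / Octnumber
#     1_000.5e3, .5j   match Floatnumber / Imagnumber
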
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][bB]|[bBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

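# How the main loop below uses PseudoToken (a sketch): the Whitespace part
# carries no capture group, so group 1 is the token itself with leading
# whitespace already skipped, and span(1) gives the token's bounds.
#
#     m = pseudoprog.match("    x = 1", 0)
#     m.span(1)   ->  (4, 5)      # the NAME token 'x'
#     m.group(1)  ->  'x'
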
tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "f'''": single3prog, 'f"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "rb'''": single3prog, 'rb"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "F'''": single3prog, 'F"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            "rB'''": single3prog, 'rB"""': double3prog,
            "Rb'''": single3prog, 'Rb"""': double3prog,
            "RB'''": single3prog, 'RB"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'f': None, 'F': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "f'''", 'f"""', "F'''", 'F"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',
          "rb'''", 'rb"""', "Rb'''", 'Rb"""',
          "rB'''", 'rB"""', "RB'''", 'RB"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "f'", 'f"', "F'", 'F"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"',
          "rb'", 'rb"', "Rb'", 'Rb"',
          "rB'", 'rB"', "RB'", 'RB"',):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object.  It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

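# A sketch of the callback-style entry point (the eater function here is
# hypothetical, shown for illustration; printtoken above is the default):
#
#     import io
#     def eater(tok_type, tok_str, start, end, line):
#         print(tok_name[tok_type], repr(tok_str))
#     tokenize(io.StringIO("a + b\n").readline, eater)
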
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

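# For example (a comment-only sketch of the normalization):
#
#     _get_normal_name("UTF_8")         ->  'utf-8'
#     _get_normal_name("Latin-1-Unix")  ->  'iso-8859-1'
#     _get_normal_name("cp1252")        ->  'cp1252'   # passed through
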
def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument,
    readline, in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

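# A usage sketch (illustrative): detect_encoding() works on raw bytes, so
# wrap encoded source in io.BytesIO and hand over its readline method.
#
#     import io
#     src = b"# -*- coding: latin-1 -*-\nx = 1\n"
#     encoding, lines = detect_encoding(io.BytesIO(src).readline)
#     encoding  ->  'iso-8859-1'
#     lines     ->  [b'# -*- coding: latin-1 -*-\n']
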
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

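# A full-fidelity sketch: with complete 5-tuples the round trip reproduces
# the source exactly; the 2-tuple path (Untokenizer.compat) only
# approximates spacing.
#
#     import io
#     source = "x = 1\n"
#     toks = list(generate_tokens(io.StringIO(source).readline))
#     assert untokenize(toks) == source
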
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)