# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
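
# A minimal usage sketch (the "example.py" filename is only illustrative);
# each item yielded by generate_tokens() is the 5-tuple described above:
#
#     with open("example.py") as f:
#         for tok_type, tok_str, start, end, logical_line in \
#                 generate_tokens(f.readline):
#             print(tok_name[tok_type], repr(tok_str), start, end)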

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
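
# For illustration, these helpers simply build alternation patterns:
#     group('a', 'b') -> '(a|b)'
#     any('a', 'b')   -> '(a|b)*'
#     maybe('a', 'b') -> '(a|b)?'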

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

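# pseudoprog (compiled just below) is the pattern generate_tokens() uses to
# find the next token: it skips leading whitespace and captures the token
# candidate (number, operator, name, string opener, comment, or line
# continuation) in group 1.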
tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:
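    """Helper used by untokenize() to rebuild source text from token tuples."""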

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
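
# For example, _get_normal_name("UTF_8") returns "utf-8" and
# _get_normal_name("Latin-1") returns "iso-8859-1"; unrecognized names are
# returned unchanged.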

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, a SyntaxError is raised. Note that if a utf-8 bom is
    found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

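# A minimal sketch of calling detect_encoding() (the "example.py" filename is
# illustrative); readline must return bytes, so open the file in binary mode:
#
#     with open("example.py", "rb") as f:
#         encoding, header_lines = detect_encoding(f.readline)
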
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)