# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
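
# Usage sketch (added commentary, not part of the original module; assumes a
# hypothetical file "example.py"):
#
#     with open("example.py") as f:
#         for tok_type, tok_str, start, end, logical_line in generate_tokens(f.readline):
#             print(tok_name[tok_type], repr(tok_str), start, end)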

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
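# For illustration (added commentary): group('a', 'b') builds the pattern
# '(a|b)', any('a', 'b') builds '(a|b)*', and maybe('a', 'b') builds '(a|b)?'.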

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
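# Added commentary: the tokenizer loop below relies on group 1 of pseudoprog
# spanning exactly the token text.  For example,
# pseudoprog.match("   spam = 1", 0).span(1) would give (3, 7), i.e. the name
# "spam" after the leading whitespace.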
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

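# Added commentary: the two dicts built below are used only for fast
# membership tests on string-prefix tokens; their values are never read.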
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
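
# Example (added commentary, not part of the original module): a custom
# tokeneater callback for tokenize() receives the same five fields that
# generate_tokens() yields, e.g.
#
#     def name_printer(type, token, start, end, line):
#         if type == NAME:
#             print(token)
#
#     tokenize(open("example.py").readline, name_printer)  # hypothetical file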

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError.  Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
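
# Example (added commentary, not part of the original module): detect_encoding()
# expects a readline that yields bytes, e.g. a file opened in binary mode for a
# hypothetical path "example.py":
#
#     with open("example.py", "rb") as f:
#         encoding, consumed_lines = detect_encoding(f.readline)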

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)
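
# Added note: run as a script (e.g. "python tokenize.py somefile.py"), this
# prints one line per token via printtoken(), roughly of the form
# "1,0-1,5:  NAME    'print'".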