# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
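
# A minimal usage sketch, assuming io.StringIO as the source of lines:
#
#     from io import StringIO
#     for typ, tok, start, end, line in generate_tokens(
#             StringIO("x = 1\n").readline):
#         print(tok_name[typ], repr(tok), start, end)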

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
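# For example, group('a', 'b') == '(a|b)', any('x') == '(x)*', and
# maybe('x') == '(x)?'.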

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
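# These patterns also cover Python 2 literals (which lib2to3 must accept),
# e.g. '0xffL', '017', '0b101', '1e10', '3.14j'.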

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8
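# Indentation is measured with 8-column tab stops, matching the C tokenizer.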

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object.  It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
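
# For example, dumping every token of a file (the path is hypothetical)
# through the default printtoken callback:
#
#     with open("example.py") as f:
#         tokenize(f.readline)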

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")
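# The pattern matches PEP 263 encoding declarations, e.g.:
#     # -*- coding: utf-8 -*-
#     # vim: set fileencoding=latin-1 :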

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, raise a SyntaxError.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found and codec.name != 'utf-8':
            # This behaviour mimics the Python interpreter
            raise SyntaxError('encoding problem: utf-8')
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        if not first:
            return 'utf-8', []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return 'utf-8', [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return 'utf-8', [first, second]
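
# A minimal usage sketch (the file name is hypothetical); note the stream
# must be opened in binary mode:
#
#     with open("example.py", "rb") as fp:
#         encoding, consumed_lines = detect_encoding(fp.readline)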
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(True)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)