# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token
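
# A minimal usage sketch of the generator interface defined below; the
# filename "example.py" is only illustrative and not part of this module:
#
#     f = open("example.py")
#     for tok_type, tok_string, start, end, line in generate_tokens(f.readline):
#         print tok_name[tok_type], repr(tok_string), start, end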

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
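# For example (illustrative values only): group('0', '1') -> '(0|1)',
# any('0', '1') -> '(0|1)*', and maybe('0', '1') -> '(0|1)?'.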

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
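
# A minimal sketch of the callback-style interface above; the callback name
# "show_token" and the filename "example.py" are illustrative only:
#
#     def show_token(tok_type, tok_string, start, end, line):
#         print tok_name[tok_type], repr(tok_string)
#
#     tokenize(open("example.py").readline, show_token)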

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument,
    readline, in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, a SyntaxError is raised as well.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            else:
                # Allow it to be properly encoded and decoded.
                encoding = 'utf-8-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
    if not first:
        return 'utf-8', []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return 'utf-8', [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return 'utf-8', [first, second]
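
# A minimal sketch of detect_encoding() usage; the filename "setup.py" is
# illustrative, and the file must be opened in binary mode so that readline()
# returns bytes:
#
#     fp = open("setup.py", "rb")
#     encoding, first_lines = detect_encoding(fp.readline)
#     fp.close()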

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)