# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
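
# A minimal usage sketch, kept as a comment so importing this module is
# unaffected (assumes Python 2, where a StringIO object provides readline):
#
#     from StringIO import StringIO
#     from lib2to3.pgen2.tokenize import generate_tokens
#     from lib2to3.pgen2.token import tok_name
#     for tok_type, tok_str, start, end, line in generate_tokens(
#             StringIO("x = 1\n").readline):
#         print tok_name[tok_type], repr(tok_str), start, end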

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
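# For example, group('a', 'b') == '(a|b)', any('a', 'b') == '(a|b)*', and
# maybe('a', 'b') == '(a|b)?'; the token regexes below are built from these.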

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
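# PseudoToken is the pattern the tokenizer scans with: optional leading
# whitespace, then one alternative per token class captured as group 1.
# Illustrative sketch: re.match(PseudoToken, "   spam = 42").span(1) gives
# (3, 7), i.e. the NAME "spam" just after the indentation.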

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
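
# An illustrative tokeneater callback, analogous to printtoken above (a
# sketch only; "example.py" is a placeholder path, and any callable taking
# the same five arguments will do):
#
#     def report_names(type, token, start, end, line):
#         if type == NAME:
#             print "NAME %s at row %d, col %d" % (token, start[0], start[1])
#
#     tokenize(open("example.py").readline, report_names)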

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)')
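# For example, cookie_re.match("# -*- coding: iso-8859-1 -*-\n").group(1)
# yields "iso-8859-1", per the PEP 263 declaration forms.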

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
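
# A usage sketch, comment only ("setup.py" is a placeholder path); readline
# should yield bytes, so opening the file in binary mode is the safe choice:
#
#     fp = open("setup.py", "rb")
#     encoding, lines = detect_encoding(fp.readline)
#     # encoding is e.g. 'utf-8'; lines holds the (at most two) lines read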

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
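
# A round-trip sketch using only (type, value) pairs, which exercises the
# Untokenizer.compat() path mentioned above ("myfile.py" is a placeholder):
#
#     pairs = [tok[:2] for tok in generate_tokens(open("myfile.py").readline)]
#     source = untokenize(pairs)   # valid source text, though spacing may differ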

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
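    # Tokenizer state: lnum is the current line number, parenlev the current
    # bracket nesting depth, continued flags a backslash continuation,
    # contstr/contline accumulate a still-open multi-line string, and indents
    # is the stack of indentation columns used to emit INDENT/DEDENT tokens.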
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)