# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
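
# Illustrative note (not part of the original file): tokenizing the single
# line "x = 1\n" with generate_tokens() yields 5-tuples such as
#     (NAME,    'x',  (1, 0), (1, 1), 'x = 1\n')
#     (OP,      '=',  (1, 2), (1, 3), 'x = 1\n')
#     (NUMBER,  '1',  (1, 4), (1, 5), 'x = 1\n')
#     (NEWLINE, '\n', (1, 5), (1, 6), 'x = 1\n')
# followed by a final ENDMARKER token.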

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
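
# For example (illustrative comment only): group('a', 'b') -> '(a|b)',
# any('x') -> '(x)*' and maybe('y') -> '(y)?'.  These helpers only build
# regular-expression source strings; the results are compiled further below.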

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
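
# A minimal usage sketch (not part of the original file); the io.StringIO
# source is a made-up example:
#     import io
#     tokenize(io.StringIO("y = 2\n").readline)
# prints one line per token via printtoken, in the "%d,%d-%d,%d:\t%s\t%s"
# format shown above.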

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, raise a SyntaxError.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = matches[0]
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found and codec.name != 'utf-8':
            # This behaviour mimics the Python interpreter
            raise SyntaxError('encoding problem: utf-8')
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        if not first:
            return 'utf-8', []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return 'utf-8', [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return 'utf-8', [first, second]
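
# A minimal sketch (not part of the original file) of detect_encoding() on an
# in-memory byte stream; the source bytes and the helper name are made up.
def _example_detect_encoding():
    import io
    source = b"# -*- coding: latin-1 -*-\nx = 1\n"
    encoding, lines_read = detect_encoding(io.BytesIO(source).readline)
    # encoding is the cookie string 'latin-1'; lines_read holds the one
    # (byte) line that was consumed while looking for the cookie.
    return encoding, lines_read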

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
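
# A hedged sketch (not part of the original file) of the limited-input
# round-trip invariant documented above; the source string is made up.
def _example_untokenize_roundtrip():
    import io
    source = "if x:\n    y = 1\n"
    t1 = [tok[:2] for tok in generate_tokens(io.StringIO(source).readline)]
    newcode = untokenize(t1)
    readline = iter(newcode.splitlines(1)).__next__
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    assert t1 == t2
    return newcode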

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
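
# A small sketch (not part of the original file) showing the COMMENT and NL
# tokens mentioned in the module docstring; the source string is made up.
def _example_comment_tokens():
    import io
    source = "# a comment\nx = 1\n"
    # Expected (token name, token string) pairs:
    #   COMMENT '# a comment', NL '\n', NAME 'x', OP '=', NUMBER '1',
    #   NEWLINE '\n', ENDMARKER ''
    return [(tok_name[tok[0]], tok[1])
            for tok in generate_tokens(io.StringIO(source).readline)]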

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)