# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except that instead of generating tokens, tokeneater is a
callback function to which the 5 fields described above are passed as 5
arguments each time a new token is found."""
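
# As an illustration, the single source line "1 + 2\n" tokenizes to roughly
# this stream of 5-tuples:
#
#     (NUMBER,    '1',  (1, 0), (1, 1), '1 + 2\n')
#     (OP,        '+',  (1, 2), (1, 3), '1 + 2\n')
#     (NUMBER,    '2',  (1, 4), (1, 5), '1 + 2\n')
#     (NEWLINE,   '\n', (1, 5), (1, 6), '1 + 2\n')
#     (ENDMARKER, '',   (2, 0), (2, 0), '')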

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
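
# These helpers only build regular-expression source strings; for instance,
# group('a', 'b') returns '(a|b)', any(r'\d') returns r'(\d)*', and
# maybe(r'\d') returns r'(\d)?'.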

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object.  It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
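
# A minimal usage sketch (the helper name below is hypothetical, not part of
# this module's API): count tokens by type by passing tokenize() a custom
# tokeneater callback instead of the default printtoken().
def _example_count_tokens(readline):
    counts = {}
    def eater(type, token, start, end, line):
        counts[tok_name[type]] = counts.get(tok_name[type], 0) + 1
    tokenize(readline, eater)
    return counts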

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")
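# For example, a source line such as "# -*- coding: latin-1 -*-" matches with
# the group "latin-1".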

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument,
    readline, in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, a SyntaxError will also be raised.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = matches[0]
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found and codec.name != 'utf-8':
            # This behaviour mimics the Python interpreter
            raise SyntaxError('encoding problem: utf-8')
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        if not first:
            return 'utf-8', []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return 'utf-8', [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return 'utf-8', [first, second]
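
# A minimal sketch of calling detect_encoding(); the helper name and file path
# are hypothetical.  The file must be opened in binary mode so that readline
# returns bytes rather than decoded text.
def _example_detect_encoding(path='example.py'):
    fp = open(path, 'rb')
    try:
        encoding, first_lines = detect_encoding(fp.readline)
    finally:
        fp.close()
    return encoding, first_lines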

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
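
# A hedged round-trip sketch mirroring the docstring above; the helper name is
# hypothetical and f is assumed to be an open text-mode source file.  Passing
# full 5-tuples lets untokenize() reproduce the original spacing.
def _example_roundtrip(f):
    tokens = list(generate_tokens(f.readline))   # full 5-tuples
    source = untokenize(tokens)
    readline = iter(source.splitlines(1)).next
    return [t[:2] for t in generate_tokens(readline)] == \
           [t[:2] for t in tokens]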

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable object that terminates with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
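
# A minimal sketch of driving generate_tokens() from an in-memory string; the
# helper name and the sample source text are arbitrary.  splitlines(1) keeps
# the line endings, and the exhausted iterator's StopIteration marks EOF.
def _example_tokenize_string(source="x = 1\n"):
    readline = iter(source.splitlines(1)).next
    return [(tok_name[typ], val)
            for typ, val, _, _, _ in generate_tokens(readline)]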

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)