# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
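
# A minimal usage sketch (illustrative only; `source` here is a hypothetical
# string of Python code, not something this module defines):
#
#     import io
#     for tok_type, tok_str, start, end, line in generate_tokens(
#             io.StringIO(source).readline):
#         print(tok_name[tok_type], repr(tok_str), start, end)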

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
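# For example, group('a', 'b') yields '(a|b)', any(r'\d') yields r'(\d)*',
# and maybe(r'\d') yields r'(\d)?'.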

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

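# A hedged sketch of the callback-style interface above (the callback name and
# filename are made up for illustration):
#
#     def print_names(type, token, start, end, line):
#         if type == NAME:
#             print(token)
#
#     # tokenize(open("example.py").readline, print_names)
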
class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, raise a SyntaxError.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            else:
                # Allow it to be properly encoded and decoded.
                encoding = 'utf-8-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        if not first:
            return 'utf-8', []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return 'utf-8', [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return 'utf-8', [first, second]
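
# Sketch of driving detect_encoding() (assumes a hypothetical file opened in
# binary mode, since readline must return bytes here):
#
#     with open("example.py", "rb") as f:
#         encoding, lines_read = detect_encoding(f.readline)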

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
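
# Per the round-trip invariant documented above, untokenizing a full 5-tuple
# stream reproduces the source (sketch; `source` is a hypothetical string of
# Python code):
#
#     import io
#     toks = list(generate_tokens(io.StringIO(source).readline))
#     round_tripped = untokenize(toks)   # expected to equal `source`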

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                     token[:2] in single_quoted or \
                     token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)