# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token
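
# An illustrative usage sketch of the generate_tokens() interface described
# in the module docstring (the helper name _example_generate_tokens is
# hypothetical and not part of this module's API).  It wraps a source string
# in a readline-like callable and prints the five fields of every token.
def _example_generate_tokens(source="x = 1 + 2\n"):
    from StringIO import StringIO
    readline = StringIO(source).readline        # readline-like method
    for tok_type, tok_string, start, end, line in generate_tokens(readline):
        # tok_name (from token.py) maps the numeric token type to its name.
        print tok_name[tok_type], repr(tok_string), start, end, repr(line)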

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")
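
# A small demonstration of the ordering rule noted in the comment above
# (the helper name _operator_order_demo is hypothetical, for illustration
# only).  Python's re alternation is leftmost-first, so the longer operator
# must appear before its prefix in the pattern.
def _operator_order_demo():
    shortest_first = re.compile(group('=', '=='))    # shorter alternative first
    longest_first = re.compile(group('==', '='))     # longest operators first
    assert shortest_first.match('==').group(0) == '='   # stops after one '='
    assert longest_first.match('==').group(0) == '=='   # matches the full operator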

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
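
# A brief sketch of the callback interface documented in tokenize()'s
# docstring (the helper name _example_tokenize_callback is hypothetical).
# The tokeneater callable receives the same five fields per token that
# generate_tokens() yields; here it just collects NAME tokens.
def _example_tokenize_callback(source="a = b + c\n"):
    from StringIO import StringIO
    names = []
    def collect_names(type, token, start, end, line):
        if type == NAME:
            names.append(token)
    tokenize(StringIO(source).readline, collect_names)
    return names        # ['a', 'b', 'c'] for the default source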

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
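
# A concrete version of the limited round-trip invariant shown in the
# untokenize() docstring (the helper name _example_roundtrip is
# hypothetical).  Two-field tokens lose exact spacing, but the regenerated
# source tokenizes back to the same (type, string) pairs.
def _example_roundtrip(source="x = (1 +\n     2)\n"):
    from StringIO import StringIO
    t1 = [tok[:2] for tok in generate_tokens(StringIO(source).readline)]
    newcode = untokenize(t1)
    t2 = [tok[:2] for tok in generate_tokens(StringIO(newcode).readline)]
    assert t1 == t2
    return newcode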

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)