# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
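# A minimal usage sketch (not part of the original module): driving
# generate_tokens() with the readline method of an open file and printing
# each 5-tuple it yields.  The file name "example.py" is only illustrative.
#
#     from lib2to3.pgen2 import tokenize as pgen2_tokenize
#     with open("example.py") as f:
#         for tok_type, tok_str, start, end, line in \
#                 pgen2_tokenize.generate_tokens(f.readline):
#             print(tok_type, repr(tok_str), start, end)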

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")
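# A small illustrative check of the ordering requirement above (a sketch,
# not part of the original file; it assumes only the re module and the
# group() helper defined earlier):
#
#     >>> import re
#     >>> re.match(group(r"\*\*=?", r"[+\-*/%&|^=<>]=?"), "**").group()
#     '**'
#     >>> re.match(group(r"[+\-*/%&|^=<>]=?", r"\*\*=?"), "**").group()
#     '*'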

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
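# Illustrative sketch (not part of the original file) of how the scanner in
# generate_tokens() below uses pseudoprog: leading whitespace is consumed by
# the match itself, and group 1 spans the token text.
#
#     >>> m = pseudoprog.match("  x = 1\n", 0)
#     >>> m.span(1), m.group(1)
#     ((2, 3), 'x')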
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
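# A brief sketch (not part of the original module) of the callback-style
# entry point: any callable taking the five token fields can serve as
# tokeneater.  The name collect_names and the file name "example.py" are
# only illustrative.
#
#     names = []
#     def collect_names(type, token, start, end, line):
#         if type == NAME:
#             names.append(token)
#
#     with open("example.py") as f:
#         tokenize(f.readline, collect_names)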

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
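# A short usage sketch (not part of the original module), mirroring the
# docstring above: with full 5-tuples, untokenize() reproduces the source
# text exactly.
#
#     source = "x = 1\ny = 2\n"
#     readline = iter(source.splitlines(True)).__next__
#     tokens = list(generate_tokens(readline))
#     assert untokenize(tokens) == source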

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternatively, readline
    can be a callable that raises StopIteration when the input is exhausted:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)