"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens. It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF). It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators. Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
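
A short sketch of typical use (tokenizing an in-memory bytes stream):

    import io
    from tokenize import tokenize
    for tok in tokenize(io.BytesIO(b"x = 1\n").readline):
        print(tok)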
"""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
import builtins
import re
import sys
from token import *
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
cookie_re = re.compile(r'coding[:=]\s*([-\w.]+)')

import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3
EXACT_TOKEN_TYPES = {
    '(': LPAR,
    ')': RPAR,
    '[': LSQB,
    ']': RSQB,
    ':': COLON,
    ',': COMMA,
    ';': SEMI,
    '+': PLUS,
    '-': MINUS,
    '*': STAR,
    '/': SLASH,
    '|': VBAR,
    '&': AMPER,
    '<': LESS,
    '>': GREATER,
    '=': EQUAL,
    '.': DOT,
    '%': PERCENT,
    '{': LBRACE,
    '}': RBRACE,
    '==': EQEQUAL,
    '!=': NOTEQUAL,
    '<=': LESSEQUAL,
    '>=': GREATEREQUAL,
    '~': TILDE,
    '^': CIRCUMFLEX,
    '<<': LEFTSHIFT,
    '>>': RIGHTSHIFT,
    '**': DOUBLESTAR,
    '+=': PLUSEQUAL,
    '-=': MINEQUAL,
    '*=': STAREQUAL,
    '/=': SLASHEQUAL,
    '%=': PERCENTEQUAL,
    '&=': AMPEREQUAL,
    '|=': VBAREQUAL,
    '^=': CIRCUMFLEXEQUAL,
    '<<=': LEFTSHIFTEQUAL,
    '>>=': RIGHTSHIFTEQUAL,
    '**=': DOUBLESTAREQUAL,
    '//': DOUBLESLASH,
    '//=': DOUBLESLASHEQUAL,
    '@': AT
}

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type
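
    # Illustrative sketch: ".exact_type" refines a generic OP token to its
    # specific operator type, while other tokens report their own type, e.g.
    #     TokenInfo(OP, '+=', (1, 2), (1, 4), 'x += 1\n').exact_type == PLUSEQUAL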

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
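# For example, group('a', 'b') == '(a|b)', any('x') == '(x)*' and
# maybe('x') == '(x)?'; these combinators build the token regexps below.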

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
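# Illustrative matches: Intnumber accepts '42', '0xFF', '0o17' and '0b101';
# Floatnumber accepts '3.14', '.5' and '1e-9'; Imagnumber accepts '2j' and '3.5J'.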

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

def _compile(expr):
    return re.compile(expr, re.UNICODE)

endpats = {"'": Single, '"': Double,
           "'''": Single3, '"""': Double3,
           "r'''": Single3, 'r"""': Double3,
           "b'''": Single3, 'b"""': Double3,
           "br'''": Single3, 'br"""': Double3,
           "R'''": Single3, 'R"""': Double3,
           "B'''": Single3, 'B"""': Double3,
           "bR'''": Single3, 'bR"""': Double3,
           "Br'''": Single3, 'Br"""': Double3,
           "BR'''": Single3, 'BR"""': Double3,
           'r': None, 'R': None, 'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        assert row >= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token

        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
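# For example (an illustrative sketch): _get_normal_name("Latin_1") returns
# "iso-8859-1" and _get_normal_name("UTF-8") returns "utf-8".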

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP-0263. If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, a SyntaxError is raised. Note that if a UTF-8 BOM is
    found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
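
    Example (a sketch over an in-memory stream):

        import io
        enc, lines = detect_encoding(io.BytesIO(b'# coding: latin-1\n').readline)
        # enc == 'iso-8859-1'; lines == [b'# coding: latin-1\n']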
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]


def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
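
    Usage sketch (assumes a file "example.py" exists on disk):

        with open("example.py") as f:
            source = f.read()   # a str, decoded per the detected encoding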
    """
    buffer = builtins.open(filename, 'rb')
    encoding, lines = detect_encoding(buffer.readline)
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, line_buffering=True)
    text.mode = 'r'
    return text


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object that provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as bytes. Alternatively, readline
    can be a callable that signals the end of input by raising StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
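
    Example (a sketch over an in-memory source):

        from io import BytesIO
        for tok in tokenize(BytesIO(b"x = 1\n").readline):
            print(tok)
        # yields ENCODING, NAME, OP, NUMBER, NEWLINE and ENDMARKER in order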
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                                    (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
                                    (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0
515
516 while pos < max:
Antoine Pitrou10a99b02011-10-11 15:45:56 +0200517 pseudomatch = _compile(PseudoToken).match(line, pos)
Guido van Rossum3b631771997-10-27 20:44:15 +0000518 if pseudomatch: # scan for tokens
519 start, end = pseudomatch.span(1)
Guido van Rossumde655271997-04-09 17:15:54 +0000520 spos, epos, pos = (lnum, start), (lnum, end), end
Guido van Rossum1aec3231997-04-08 14:24:39 +0000521 token, initial = line[start:end], line[start]
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000522
Georg Brandldde00282007-03-18 19:01:53 +0000523 if (initial in numchars or # ordinary number
524 (initial == '.' and token != '.' and token != '...')):
Raymond Hettingera48db392009-04-29 00:34:27 +0000525 yield TokenInfo(NUMBER, token, spos, epos, line)
Guido van Rossum1aec3231997-04-08 14:24:39 +0000526 elif initial in '\r\n':
Raymond Hettingera48db392009-04-29 00:34:27 +0000527 yield TokenInfo(NL if parenlev > 0 else NEWLINE,
Thomas Wouters89f507f2006-12-13 04:49:30 +0000528 token, spos, epos, line)
Guido van Rossum1aec3231997-04-08 14:24:39 +0000529 elif initial == '#':
Thomas Wouters89f507f2006-12-13 04:49:30 +0000530 assert not token.endswith("\n")
Raymond Hettingera48db392009-04-29 00:34:27 +0000531 yield TokenInfo(COMMENT, token, spos, epos, line)
Guido van Rossum9d6897a2002-08-24 06:54:19 +0000532 elif token in triple_quoted:
Antoine Pitrou10a99b02011-10-11 15:45:56 +0200533 endprog = _compile(endpats[token])
Guido van Rossum3b631771997-10-27 20:44:15 +0000534 endmatch = endprog.match(line, pos)
535 if endmatch: # all on one line
536 pos = endmatch.end(0)
Guido van Rossum1aec3231997-04-08 14:24:39 +0000537 token = line[start:pos]
Raymond Hettingera48db392009-04-29 00:34:27 +0000538 yield TokenInfo(STRING, token, spos, (lnum, pos), line)
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000539 else:
Guido van Rossum1aec3231997-04-08 14:24:39 +0000540 strstart = (lnum, start) # multiple lines
541 contstr = line[start:]
Guido van Rossuma90c78b1998-04-03 16:05:38 +0000542 contline = line
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000543 break
Guido van Rossum9d6897a2002-08-24 06:54:19 +0000544 elif initial in single_quoted or \
545 token[:2] in single_quoted or \
546 token[:3] in single_quoted:
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000547 if token[-1] == '\n': # continued string
Guido van Rossum1aec3231997-04-08 14:24:39 +0000548 strstart = (lnum, start)
Antoine Pitrou10a99b02011-10-11 15:45:56 +0200549 endprog = _compile(endpats[initial] or
550 endpats[token[1]] or
551 endpats[token[2]])
Guido van Rossumde655271997-04-09 17:15:54 +0000552 contstr, needcont = line[start:], 1
Guido van Rossuma90c78b1998-04-03 16:05:38 +0000553 contline = line
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000554 break
555 else: # ordinary string
Raymond Hettingera48db392009-04-29 00:34:27 +0000556 yield TokenInfo(STRING, token, spos, epos, line)
Benjamin Peterson33856de2010-08-30 14:41:20 +0000557 elif initial.isidentifier(): # ordinary name
Raymond Hettingera48db392009-04-29 00:34:27 +0000558 yield TokenInfo(NAME, token, spos, epos, line)
Guido van Rossum3b631771997-10-27 20:44:15 +0000559 elif initial == '\\': # continued stmt
560 continued = 1
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000561 else:
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000562 if initial in '([{':
563 parenlev += 1
564 elif initial in ')]}':
565 parenlev -= 1
Raymond Hettingera48db392009-04-29 00:34:27 +0000566 yield TokenInfo(OP, token, spos, epos, line)
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000567 else:
Raymond Hettingera48db392009-04-29 00:34:27 +0000568 yield TokenInfo(ERRORTOKEN, line[pos],
Guido van Rossumde655271997-04-09 17:15:54 +0000569 (lnum, pos), (lnum, pos+1), line)
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000570 pos += 1
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000571
572 for indent in indents[1:]: # pop remaining indent levels
Raymond Hettingera48db392009-04-29 00:34:27 +0000573 yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
574 yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards compatible, API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)
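# For example (a sketch, assuming "import io"):
#     generate_tokens(io.StringIO("x = 1\n").readline)
# yields the same stream as tokenize() except that it consumes str lines and
# emits no leading ENCODING token.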

def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
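    # Example invocations (a sketch): "python -m tokenize example.py", or
    # "python -m tokenize -e example.py" to print exact operator token types.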
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with builtins.open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except IOError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()