"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
from builtins import open as _builtin_open
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
from itertools import chain
import itertools as _itertools
import re
import sys
from token import *

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

import token
__all__ = token.__all__ + ["tokenize", "detect_encoding",
                           "untokenize", "TokenInfo"]
del token

EXACT_TOKEN_TYPES = {
    '(': LPAR,
    ')': RPAR,
    '[': LSQB,
    ']': RSQB,
    ':': COLON,
    ',': COMMA,
    ';': SEMI,
    '+': PLUS,
    '-': MINUS,
    '*': STAR,
    '/': SLASH,
    '|': VBAR,
    '&': AMPER,
    '<': LESS,
    '>': GREATER,
    '=': EQUAL,
    '.': DOT,
    '%': PERCENT,
    '{': LBRACE,
    '}': RBRACE,
    '==': EQEQUAL,
    '!=': NOTEQUAL,
    '<=': LESSEQUAL,
    '>=': GREATEREQUAL,
    '~': TILDE,
    '^': CIRCUMFLEX,
    '<<': LEFTSHIFT,
    '>>': RIGHTSHIFT,
    '**': DOUBLESTAR,
    '+=': PLUSEQUAL,
    '-=': MINEQUAL,
    '*=': STAREQUAL,
    '/=': SLASHEQUAL,
    '%=': PERCENTEQUAL,
    '&=': AMPEREQUAL,
    '|=': VBAREQUAL,
    '^=': CIRCUMFLEXEQUAL,
    '<<=': LEFTSHIFTEQUAL,
    '>>=': RIGHTSHIFTEQUAL,
    '**=': DOUBLESTAREQUAL,
    '//': DOUBLESLASH,
    '//=': DOUBLESLASHEQUAL,
    '...': ELLIPSIS,
    '->': RARROW,
    '@': AT,
    '@=': ATEQUAL,
}

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type
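
# Illustrative sketch of exact_type (comment only, nothing executes here):
# for OP tokens it refines the generic OP code into the specific operator
# constant from EXACT_TOKEN_TYPES.
#
#     tok = TokenInfo(OP, '+', (1, 2), (1, 3), 'a + b\n')
#     tok.type        == OP    # True
#     tok.exact_type  == PLUS  # True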
106
Eric S. Raymondb08b2d32001-02-09 11:10:16 +0000107def group(*choices): return '(' + '|'.join(choices) + ')'
Guido van Rossum68468eb2003-02-27 20:14:51 +0000108def any(*choices): return group(*choices) + '*'
109def maybe(*choices): return group(*choices) + '?'
Guido van Rossum4d8e8591992-01-01 19:34:47 +0000110
Antoine Pitroufd036452008-08-19 17:56:33 +0000111# Note: we use unicode matching for names ("\w") but ascii matching for
112# number literals.
Guido van Rossum3b631771997-10-27 20:44:15 +0000113Whitespace = r'[ \f\t]*'
114Comment = r'#[^\r\n]*'
115Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Benjamin Peterson33856de2010-08-30 14:41:20 +0000116Name = r'\w+'
Guido van Rossum4d8e8591992-01-01 19:34:47 +0000117
Brett Cannona721aba2016-09-09 14:57:09 -0700118Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
119Binnumber = r'0[bB](?:_?[01])+'
120Octnumber = r'0[oO](?:_?[0-7])+'
121Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Guido van Rossumcd16bf62007-06-13 18:07:49 +0000122Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Brett Cannona721aba2016-09-09 14:57:09 -0700123Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
124Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
125 r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
126Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Guido van Rossum1aec3231997-04-08 14:24:39 +0000127Floatnumber = group(Pointfloat, Expfloat)
Brett Cannona721aba2016-09-09 14:57:09 -0700128Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
Guido van Rossum1aec3231997-04-08 14:24:39 +0000129Number = group(Imagnumber, Floatnumber, Intnumber)
Guido van Rossum4d8e8591992-01-01 19:34:47 +0000130
# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    # The valid string prefixes. Only contain the lower case versions,
    #  and don't contain any permutations (include 'fr', but not
    #  'rf'). The various permutations will be generated.
    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = {''}
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            #  character
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result
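
# Sketch of the result (order is arbitrary since a set is returned); for
# example, sorted(_all_string_prefixes()) starts with:
#     ['', 'B', 'BR', 'Br', 'F', 'FR', 'Fr', 'R', ...]
# i.e. every case permutation of every valid prefix, plus ''.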

def _compile(expr):
    return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
#  StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# For a given string prefix plus quotes, endpats maps it to a regex
#  to match the remainder of that string. _prefix can be empty, for
#  a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3

# A set of all of the single and triple quoted string prefixes,
#  including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)
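
# Illustrative contents of the tables built above:
#     endpats["'"] is Single          # unprefixed single-quoted string
#     endpats['rb"""'] is Double3     # raw-bytes triple-quoted string
#     "b'" in single_quoted           # True
#     'f"""' in triple_quoted         # True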

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out
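
# Round-trip sketch, leaning on the invariants documented above (the
# filename "example.py" is hypothetical):
#
#     from io import BytesIO
#     with _builtin_open("example.py", "rb") as f:
#         toks = list(tokenize(f.readline))
#     source = untokenize(toks)                   # bytes
#     assert list(tokenize(BytesIO(source).readline)) == toks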


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263.  If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                                                             encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
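
# Usage sketch (the cookie line is illustrative):
#
#     from io import BytesIO
#     buf = BytesIO(b'# -*- coding: latin-1 -*-\nx = 1\n')
#     detect_encoding(buf.readline)
#     # -> ('iso-8859-1', [b'# -*- coding: latin-1 -*-\n'])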


def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = _builtin_open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(buffer.readline)
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except:
        buffer.close()
        raise

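# Usage sketch ("example.py" is hypothetical); note this is tokenize.open,
# not the builtin:
#
#     with open("example.py") as f:
#         print(f.encoding)    # e.g. 'utf-8', as detected above
#         print(f.read())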

def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    last_line = b''
    line = b''
    while True:                                # loop over lines in stream
        try:
            # We capture the value of the line variable here because
            # readline uses the empty string '' to signal end of input,
            # hence `line` itself will always be overwritten at the end
            # of this loop.
            last_line = line
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
                    pos += len(comment_token)

                yield TokenInfo(NL, line[pos:],
                                (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or                 # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    if parenlev > 0:
                        yield TokenInfo(NL, token, spos, epos, line)
                    else:
                        yield TokenInfo(NEWLINE, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)

                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break

                # Check up to the first 3 chars of the token to see if
                #  they're in the single_quoted set. If so, they start
                #  a string.
                # We're using the first 3, because we're looking for
                #  "rb'" (for example) at the start of the token. If
                #  we switch to longer prefixes, this needs to be
                #  adjusted.
                # Note that initial == token[:1].
                # Also note that single quote checking must come after
                #  triple quote checking (above).
                elif (initial in single_quoted or
                      token[:2] in single_quoted or
                      token[:3] in single_quoted):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Again, using the first 3 chars of the
                        #  token. This is looking for the matching end
                        #  regex for the correct type of quote
                        #  character. So it's really looking for
                        #  endpats["'"] or endpats['"'], by trying to
                        #  skip string prefix characters, if any.
                        endprog = _compile(endpats.get(initial) or
                                           endpats.get(token[1]) or
                                           endpats.get(token[2]))
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)

                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                                (lnum, pos), (lnum, pos+1), line)
                pos += 1

    # Add an implicit NEWLINE if the input doesn't end in one
    if last_line and last_line[-1] not in '\r\n':
        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards compatible, API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)
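
# Usage sketch: unlike tokenize(), readline here must return str, and no
# ENCODING token is emitted (the source string is illustrative):
#
#     from io import StringIO
#     for tok in generate_tokens(StringIO("x = 1\n").readline):
#         print(tok)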

def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with _builtin_open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()
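
# Command-line sketch, matching the options defined in main():
#
#     python -m tokenize example.py       # tokenize a file
#     python -m tokenize -e example.py    # use exact operator token names
#     python -m tokenize < example.py     # tokenize stdin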