"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
from builtins import open as _builtin_open
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
from itertools import chain
import itertools as _itertools
import re
import sys
from token import *

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

import token
__all__ = token.__all__ + ["tokenize", "detect_encoding",
                           "untokenize", "TokenInfo"]
del token

EXACT_TOKEN_TYPES = {
    '(': LPAR,
    ')': RPAR,
    '[': LSQB,
    ']': RSQB,
    ':': COLON,
    ',': COMMA,
    ';': SEMI,
    '+': PLUS,
    '-': MINUS,
    '*': STAR,
    '/': SLASH,
    '|': VBAR,
    '&': AMPER,
    '<': LESS,
    '>': GREATER,
    '=': EQUAL,
    '.': DOT,
    '%': PERCENT,
    '{': LBRACE,
    '}': RBRACE,
    '==': EQEQUAL,
    '!=': NOTEQUAL,
    '<=': LESSEQUAL,
    '>=': GREATEREQUAL,
    '~': TILDE,
    '^': CIRCUMFLEX,
    '<<': LEFTSHIFT,
    '>>': RIGHTSHIFT,
    '**': DOUBLESTAR,
    '+=': PLUSEQUAL,
    '-=': MINEQUAL,
    '*=': STAREQUAL,
    '/=': SLASHEQUAL,
    '%=': PERCENTEQUAL,
    '&=': AMPEREQUAL,
    '|=': VBAREQUAL,
    '^=': CIRCUMFLEXEQUAL,
    '<<=': LEFTSHIFTEQUAL,
    '>>=': RIGHTSHIFTEQUAL,
    '**=': DOUBLESTAREQUAL,
    '//': DOUBLESLASH,
    '//=': DOUBLESLASHEQUAL,
    '...': ELLIPSIS,
    '->': RARROW,
    '@': AT,
    '@=': ATEQUAL,
}

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type

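# Illustrative sketch, not part of the module's API: exact_type refines a
# generic OP token into its specific operator type, while .type is
# returned unchanged for everything else.
#
#     >>> tok = TokenInfo(OP, '+', (1, 2), (1, 3), 'x + 1\n')
#     >>> tok.type == OP, tok.exact_type == PLUS
#     (True, True)
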
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    # The valid string prefixes.  Only the lower case versions are
    # listed here, and only one ordering of each pair ('br' and 'fr',
    # but not 'rb' or 'rf'); the remaining case and order permutations
    # are generated below.
    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = {''}
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # generate every combination of upper and lower case for
            # each character of this permutation
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result
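
# A hedged illustration of what the helper above produces: a single
# two-letter prefix expands to every case and order variant, e.g. 'fr':
#
#     >>> sorted(p for p in _all_string_prefixes() if set(p.lower()) == {'f', 'r'})
#     ['FR', 'Fr', 'RF', 'Rf', 'fR', 'fr', 'rF', 'rf']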

def _compile(expr):
    return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
# StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# For a given string prefix plus quotes, endpats maps it to a regex
# to match the remainder of that string.  _prefix can be empty, for
# a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3

# A set of all of the single and triple quoted string prefixes,
# including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)
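
# Hedged examples of the two sets built above:
#
#     >>> "'" in single_quoted and 'Rb"' in single_quoted
#     True
#     >>> "rb'''" in triple_quoted
#     True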

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out
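
# Concrete round-trip sketch (hedged; uses only the public API above):
#
#     >>> from io import BytesIO
#     >>> toks = list(tokenize(BytesIO(b"1 + 2\n").readline))
#     >>> untokenize(toks)
#     b'1 + 2\n'
#
# With full 5-tuples the source is reproduced exactly; with 2-tuples the
# compat() path is used and only token-level equivalence is guaranteed.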


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.
    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8.  Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                                                             encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
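
# Usage sketch (hedged; BytesIO stands in for a real binary file object):
#
#     >>> from io import BytesIO
#     >>> detect_encoding(BytesIO(b"# -*- coding: latin-1 -*-\n").readline)
#     ('iso-8859-1', [b'# -*- coding: latin-1 -*-\n'])
#
# The cookie name is normalized by _get_normal_name(), so 'latin-1' comes
# back as its canonical 'iso-8859-1' spelling.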


def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = _builtin_open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(buffer.readline)
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except:
        buffer.close()
        raise
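
# Sketch of typical use ('example.py' is a hypothetical file name); note
# that this is tokenize.open, not builtins.open:
#
#     >>> with open('example.py') as f:
#     ...     source = f.read()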


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
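
# Minimal usage sketch (hedged; assumes only the generator defined above):
#
#     >>> from io import BytesIO
#     >>> for tok in tokenize(BytesIO(b"x = 1\n").readline):
#     ...     print(tok)
#
# The first token is always ENCODING (here 'utf-8'), followed by NAME 'x',
# OP '=', NUMBER '1', NEWLINE and ENDMARKER tokens.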


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
                    pos += len(comment_token)

                yield TokenInfo(NL, line[pos:],
                                (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                    # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or     # ordinary number
                        (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    if stashed:
                        yield stashed
                        stashed = None
                    if parenlev > 0:
                        yield TokenInfo(NL, token, spos, epos, line)
                    else:
                        yield TokenInfo(NEWLINE, token, spos, epos, line)
                        if async_def:
                            async_def_nl = True

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield TokenInfo(COMMENT, token, spos, epos, line)

                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:               # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break

                # Check up to the first 3 chars of the token to see if
                # they're in the single_quoted set.  If so, they start
                # a string.
                # We're using the first 3, because we're looking for
                # "rb'" (for example) at the start of the token.  If
                # we switch to longer prefixes, this needs to be
                # adjusted.
                # Note that initial == token[:1].
                # Also note that single quote checking must come after
                # triple quote checking (above).
                elif (initial in single_quoted or
                      token[:2] in single_quoted or
                      token[:3] in single_quoted):
                    if token[-1] == '\n':      # continued string
                        strstart = (lnum, start)
                        # Again, using the first 3 chars of the
                        # token.  This is looking for the matching end
                        # regex for the correct type of quote
                        # character.  So it's really looking for
                        # endpats["'"] or endpats['"'], by trying to
                        # skip string prefix characters, if any.
                        endprog = _compile(endpats.get(initial) or
                                           endpats.get(token[1]) or
                                           endpats.get(token[2]))
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                      # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)

                elif initial.isidentifier():   # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield TokenInfo(
                                ASYNC if token == 'async' else AWAIT,
                                token, spos, epos, line)
                            continue

                    tok = TokenInfo(NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token == 'def':
                        if (stashed
                                and stashed.type == NAME
                                and stashed.string == 'async'):

                            async_def = True
                            async_def_indent = indents[-1]

                            yield TokenInfo(ASYNC, stashed.string,
                                            stashed.start, stashed.end,
                                            stashed.line)
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':          # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                                (lnum, pos), (lnum, pos+1), line)
                pos += 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards compatible API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)
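
# Sketch (hedged): unlike tokenize(), generate_tokens() works on a
# readline that returns str, and it emits no ENCODING token:
#
#     >>> import io
#     >>> [t.string for t in generate_tokens(io.StringIO("x = 1\n").readline)]
#     ['x', '=', '1', '\n', '']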

def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with _builtin_open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()
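
# Command-line sketch ('example.py' is a placeholder file name): running
# "python -m tokenize example.py" prints one token per line in the format
# srow,scol-erow,ecol: TYPE 'string'; adding -e/--exact reports OP tokens
# by their exact type (PLUS, LPAR, ...).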