"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
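
For example (a short sketch; example.py stands in for any Python source
file on disk):

    import tokenize
    with open('example.py', 'rb') as f:
        for tok in tokenize.tokenize(f.readline):
            print(tok)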
21"""
Guido van Rossumb51eaa11997-03-07 00:21:55 +000022
Ka-Ping Yee244c5932001-03-01 13:56:40 +000023__author__ = 'Ka-Ping Yee <ping@lfw.org>'
Trent Nelson428de652008-03-18 22:41:35 +000024__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
25 'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
26 'Michael Foord')
Serhiy Storchakacf4a2f22015-03-11 17:18:03 +020027from builtins import open as _builtin_open
Benjamin Peterson433f32c2008-12-12 01:25:05 +000028from codecs import lookup, BOM_UTF8
Raymond Hettinger3fb79c72010-09-09 07:15:18 +000029import collections
Victor Stinner58c07522010-11-09 01:08:59 +000030from io import TextIOWrapper
Terry Jan Reedy5b8d2c32014-02-17 23:12:16 -050031from itertools import chain
Eric V. Smith1c8222c2015-10-26 04:37:55 -040032import itertools as _itertools
Terry Jan Reedy5b8d2c32014-02-17 23:12:16 -050033import re
34import sys
35from token import *
36
Serhiy Storchakae431d3c2016-03-20 23:36:29 +020037cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
Serhiy Storchaka768c16c2014-01-09 18:36:09 +020038blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
Guido van Rossum4d8e8591992-01-01 19:34:47 +000039
Skip Montanaro40fc1602001-03-01 04:27:19 +000040import token
Albert-Jan Nijburgfc354f02017-05-31 15:00:21 +010041__all__ = token.__all__ + ["tokenize", "detect_encoding",
42 "untokenize", "TokenInfo"]
Skip Montanaro40fc1602001-03-01 04:27:19 +000043del token
44
Meador Inge00c7f852012-01-19 00:44:45 -060045EXACT_TOKEN_TYPES = {
46 '(': LPAR,
47 ')': RPAR,
48 '[': LSQB,
49 ']': RSQB,
50 ':': COLON,
51 ',': COMMA,
52 ';': SEMI,
53 '+': PLUS,
54 '-': MINUS,
55 '*': STAR,
56 '/': SLASH,
57 '|': VBAR,
58 '&': AMPER,
59 '<': LESS,
60 '>': GREATER,
61 '=': EQUAL,
62 '.': DOT,
63 '%': PERCENT,
64 '{': LBRACE,
65 '}': RBRACE,
66 '==': EQEQUAL,
67 '!=': NOTEQUAL,
68 '<=': LESSEQUAL,
69 '>=': GREATEREQUAL,
70 '~': TILDE,
71 '^': CIRCUMFLEX,
72 '<<': LEFTSHIFT,
73 '>>': RIGHTSHIFT,
74 '**': DOUBLESTAR,
75 '+=': PLUSEQUAL,
76 '-=': MINEQUAL,
77 '*=': STAREQUAL,
78 '/=': SLASHEQUAL,
79 '%=': PERCENTEQUAL,
80 '&=': AMPEREQUAL,
81 '|=': VBAREQUAL,
Jim Fasarakis-Hilliardd4914e92017-03-14 22:16:15 +020082 '^=': CIRCUMFLEXEQUAL,
Meador Inge00c7f852012-01-19 00:44:45 -060083 '<<=': LEFTSHIFTEQUAL,
84 '>>=': RIGHTSHIFTEQUAL,
85 '**=': DOUBLESTAREQUAL,
86 '//': DOUBLESLASH,
87 '//=': DOUBLESLASHEQUAL,
Jim Fasarakis-Hilliardd4914e92017-03-14 22:16:15 +020088 '...': ELLIPSIS,
89 '->': RARROW,
Benjamin Petersond51374e2014-04-09 23:55:56 -040090 '@': AT,
91 '@=': ATEQUAL,
Meador Inge00c7f852012-01-19 00:44:45 -060092}
Guido van Rossum1aec3231997-04-08 14:24:39 +000093
Raymond Hettinger3fb79c72010-09-09 07:15:18 +000094class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
Raymond Hettingeraa17a7f2009-04-29 14:21:25 +000095 def __repr__(self):
Raymond Hettingera0e79402010-09-09 08:29:05 +000096 annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
97 return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
98 self._replace(type=annotated_type))
Raymond Hettingeraa17a7f2009-04-29 14:21:25 +000099
Meador Inge00c7f852012-01-19 00:44:45 -0600100 @property
101 def exact_type(self):
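        # All operators are tokenized with the generic type OP; this
        # property recovers the specific operator type, e.g. '+' -> PLUS
        # and '**=' -> DOUBLESTAREQUAL.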
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    # The valid string prefixes. Only contain the lower case versions,
    # and don't contain any permutations (include 'fr', but not
    # 'rf'). The various permutations will be generated.
    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = {''}
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            # character
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result
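
# For example, the single entry 'fr' above expands to all eight of 'fr',
# 'fR', 'Fr', 'FR', 'rf', 'rF', 'Rf' and 'RF', since every case variant of
# every permutation is added.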

def _compile(expr):
    return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
# StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# For a given string prefix plus quotes, endpats maps it to a regex
# to match the remainder of that string. _prefix can be empty, for
# a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3
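
# For example, endpats["'"] is Single, endpats['"""'] is Double3, and
# endpats["fR'''"] is Single3.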

# A set of all of the single and triple quoted string prefixes,
# including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)
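
# For example, "'", '"' and 'rb"' all land in single_quoted, while "'''"
# and 'RB"""' land in triple_quoted.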

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
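        # Fallback for streams of bare (type, string) pairs: without
        # start/end positions, spacing has to be reconstructed heuristically
        # (a space after names and numbers, indentation replayed from
        # INDENT/DEDENT tokens).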
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263.  If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
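
    For example (a short sketch; example.py stands in for any source file):

        with open('example.py', 'rb') as f:
            encoding, first_lines = detect_encoding(f.readline)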
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                                                             encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]


def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
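
    For example (a short sketch; example.py stands in for any source file):

        with tokenize.open('example.py') as f:
            source = f.read()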
    """
    buffer = _builtin_open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(buffer.readline)
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except:
        buffer.close()
        raise


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
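
    For example (a self-contained sketch using an in-memory stream):

        from io import BytesIO
        toks = list(tokenize(BytesIO(b"1 + 2\n").readline))
        # toks[0] is the ENCODING token; the rest cover "1 + 2".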
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)


def _tokenize(readline, encoding):
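    # Tokenizer state: lnum is the current line number, parenlev the bracket
    # nesting depth, continued marks a backslash continuation, contstr and
    # contline accumulate a string literal that spans lines, and indents is
    # the stack of indentation columns.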
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    yield TokenInfo(COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    pos += len(comment_token)

                yield TokenInfo(NL, line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or                 # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    if parenlev > 0:
                        yield TokenInfo(NL, token, spos, epos, line)
                    else:
                        yield TokenInfo(NEWLINE, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)

                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break

                # Check up to the first 3 chars of the token to see if
                #  they're in the single_quoted set. If so, they start
                #  a string.
                # We're using the first 3, because we're looking for
                #  "rb'" (for example) at the start of the token. If
                #  we switch to longer prefixes, this needs to be
                #  adjusted.
                # Note that initial == token[:1].
                # Also note that single quote checking must come after
                #  triple quote checking (above).
                elif (initial in single_quoted or
                      token[:2] in single_quoted or
                      token[:3] in single_quoted):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Again, using the first 3 chars of the
                        #  token. This is looking for the matching end
                        #  regex for the correct type of quote
                        #  character. So it's really looking for
                        #  endpats["'"] or endpats['"'], by trying to
                        #  skip string prefix characters, if any.
                        endprog = _compile(endpats.get(initial) or
                                           endpats.get(token[1]) or
                                           endpats.get(token[2]))
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)

                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards-compatible API for all the places in the standard
# library that expect to be able to use tokenize with strings.
def generate_tokens(readline):
    return _tokenize(readline, None)
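
# For example (a sketch), generate_tokens accepts a readline that returns
# str rather than bytes:
#
#     from io import StringIO
#     tokens = list(generate_tokens(StringIO("x = 1\n").readline))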

def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with _builtin_open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()