"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
from builtins import open as _builtin_open
from codecs import lookup, BOM_UTF8
import collections
import functools
from io import TextIOWrapper
import itertools as _itertools
import re
import sys
from token import *
from token import EXACT_TOKEN_TYPES

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

import token
__all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
                           "untokenize", "TokenInfo"]
del token

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type

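# A sketch of how exact_type refines OP tokens (editorial addition): for an
# OP token the string is looked up in EXACT_TOKEN_TYPES, so '+' reports the
# specific PLUS type; non-OP tokens fall through to their generic type.
#
#     tok = TokenInfo(OP, '+', (1, 2), (1, 3), 'x + 1\n')
#     tok.type          # OP
#     tok.exact_type    # PLUS
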
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
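
# Sketch of what these tiny combinators produce (editorial addition):
#
#     group('a', 'bc')   # '(a|bc)'   -- exactly one of the alternatives
#     any('a', 'bc')     # '(a|bc)*'  -- zero or more
#     maybe('a', 'bc')   # '(a|bc)?'  -- zero or one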

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
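
# A few illustrative checks (editorial addition) showing that the assembled
# Number pattern accepts PEP 515 underscore grouping; these use only names
# defined above.
#
#     re.fullmatch(Number, '0x_FF')        # hex integer -> match
#     re.fullmatch(Number, '1_000.5e-3')   # float with exponent -> match
#     re.fullmatch(Number, '3+4j')         # an expression, not a literal -> None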
Guido van Rossum4d8e8591992-01-01 19:34:47 +000082
Eric V. Smith1c8222c2015-10-26 04:37:55 -040083# Return the empty string, plus all of the valid string prefixes.
84def _all_string_prefixes():
85 # The valid string prefixes. Only contain the lower case versions,
penguindustin96466302019-05-06 14:57:17 -040086 # and don't contain any permutations (include 'fr', but not
Eric V. Smith1c8222c2015-10-26 04:37:55 -040087 # 'rf'). The various permutations will be generated.
88 _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
89 # if we add binary f-strings, add: ['fb', 'fbr']
Jon Dufresne39726282017-05-18 07:35:54 -070090 result = {''}
Eric V. Smith1c8222c2015-10-26 04:37:55 -040091 for prefix in _valid_string_prefixes:
92 for t in _itertools.permutations(prefix):
93 # create a list with upper and lower versions of each
94 # character
95 for u in _itertools.product(*[(c, c.upper()) for c in t]):
96 result.add(''.join(u))
97 return result
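
# Sketch (editorial addition): the result contains every case permutation of
# each prefix, plus the empty string, e.g. 'fr' contributes 'fr', 'fR', 'Fr',
# 'FR', 'rf', 'rF', 'Rf' and 'RF'.
#
#     prefixes = _all_string_prefixes()
#     '' in prefixes and 'Rb' in prefixes and 'fR' in prefixes   # all True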

@functools.lru_cache
def _compile(expr):
    return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
# StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Sorting in reverse order puts the long operators before their prefixes.
# Otherwise if = came before ==, == would get recognized as two instances
# of =.
Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
Funny = group(r'\r?\n', Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# For a given string prefix plus quotes, endpats maps it to a regex
#  to match the remainder of that string. _prefix can be empty, for
#  a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3

# A set of all of the single and triple quoted string prefixes,
#  including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)
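
# Sketch (editorial addition): endpats and both sets key on
# "prefix + opening quote", so a raw bytes literal is covered by entries
# like these.
#
#     endpats["rb'"] is Single     # tail pattern for rb'...'
#     "rb'" in single_quoted       # True
#     'RB"""' in triple_quoted     # True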

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in _itertools.chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out
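
# A hedged round-trip sketch (editorial addition) using the limited
# (2-tuple) mode described in the docstring above:
#
#     from io import BytesIO
#     src = b"x = (1 +\n     2)\n"
#     pairs = [tok[:2] for tok in tokenize(BytesIO(src).readline)]
#     new = untokenize(pairs)        # bytes, encoded per the ENCODING token
#     [t[:2] for t in tokenize(BytesIO(new).readline)] == pairs   # True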


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
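
# Sketch (editorial addition): normalization maps common spellings onto the
# canonical names the C tokenizer recognizes.
#
#     _get_normal_name('UTF_8')     # 'utf-8'
#     _get_normal_name('Latin-1')   # 'iso-8859-1'
#     _get_normal_name('euc-jp')    # 'euc-jp' (returned unchanged)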

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263.  If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                                                             encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
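
# A hedged usage sketch (editorial addition): detecting a cookie on an
# in-memory source.
#
#     from io import BytesIO
#     buf = BytesIO(b'# -*- coding: latin-1 -*-\nprint("hi")\n')
#     detect_encoding(buf.readline)
#     # -> ('iso-8859-1', [b'# -*- coding: latin-1 -*-\n'])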


def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = _builtin_open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(buffer.readline)
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except:
        buffer.close()
        raise
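
# Sketch (editorial addition): tokenize.open() hands back a text stream
# already decoded with the detected encoding ('some_module.py' is a
# placeholder path).
#
#     import tokenize
#     with tokenize.open('some_module.py') as f:
#         f.encoding      # e.g. 'utf-8', or whatever the cookie declared
#         source = f.read()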


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    physical line.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    encoding, consumed = detect_encoding(readline)
    empty = _itertools.repeat(b"")
    rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
    return _tokenize(rl_gen.__next__, encoding)
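
# A hedged sketch (editorial addition) of driving tokenize() from a real
# file, mirroring the docstring above ('some_module.py' is a placeholder):
#
#     with _builtin_open('some_module.py', 'rb') as f:
#         for tok in tokenize(f.readline):
#             print(tok.start, tok_name[tok.exact_type], repr(tok.string))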


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    last_line = b''
    line = b''
    while True:                                # loop over lines in stream
        try:
            # We capture the value of the line variable here because
            # readline uses the empty string '' to signal end of input,
            # hence `line` itself will always be overwritten at the end
            # of this loop.
            last_line = line
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    yield TokenInfo(COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    pos += len(comment_token)

                yield TokenInfo(NL, line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or                 # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    if parenlev > 0:
                        yield TokenInfo(NL, token, spos, epos, line)
                    else:
                        yield TokenInfo(NEWLINE, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)

                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break

                # Check up to the first 3 chars of the token to see if
                #  they're in the single_quoted set. If so, they start
                #  a string.
                # We're using the first 3, because we're looking for
                #  "rb'" (for example) at the start of the token. If
                #  we switch to longer prefixes, this needs to be
                #  adjusted.
                # Note that initial == token[:1].
                # Also note that single quote checking must come after
                #  triple quote checking (above).
                elif (initial in single_quoted or
                      token[:2] in single_quoted or
                      token[:3] in single_quoted):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Again, using the first 3 chars of the
                        #  token. This is looking for the matching end
                        #  regex for the correct type of quote
                        #  character. So it's really looking for
                        #  endpats["'"] or endpats['"'], by trying to
                        #  skip string prefix characters, if any.
                        endprog = _compile(endpats.get(initial) or
                                           endpats.get(token[1]) or
                                           endpats.get(token[2]))
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)

                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    # Add an implicit NEWLINE if the input doesn't end in one
    if last_line and last_line[-1] not in '\r\n':
        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


def generate_tokens(readline):
    """Tokenize a source reading Python code as unicode strings.

    This has the same API as tokenize(), except that it expects the *readline*
    callable to return str objects instead of bytes.
    """
    return _tokenize(readline, None)
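
# Sketch (editorial addition): the str-based twin of tokenize(); there is no
# leading ENCODING token because the input is already decoded.
#
#     import io
#     g = generate_tokens(io.StringIO('x = 1\n').readline)
#     next(g).string    # 'x' -- a NAME token, not ENCODING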

def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        sys.stderr.write(message)
        sys.stderr.write('\n')

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with _builtin_open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()
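
# Hedged command-line sketch (editorial addition; 'example.py' is a
# placeholder, and the exact columns come from the %-20s%-15s%-15r format
# above):
#
#     $ python -m tokenize -e example.py
#     0,0-0,0:            ENCODING       'utf-8'
#     1,0-1,1:            NAME           'x'
#     1,2-1,3:            EQUAL          '='
#     ...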