"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
from builtins import open as _builtin_open
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
import itertools as _itertools
import re
import sys
from token import *
from token import EXACT_TOKEN_TYPES

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

import token
__all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
                           "untokenize", "TokenInfo"]
del token

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type
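
# For example, in "1 + 2" the '+' token has .type == OP but .exact_type
# == PLUS (a REPL sketch; the exact types live in token.py):
#
#     >>> from io import BytesIO
#     >>> from tokenize import tokenize, tok_name
#     >>> toks = list(tokenize(BytesIO(b"1 + 2\n").readline))
#     >>> [tok_name[t.exact_type] for t in toks if t.type == OP]
#     ['PLUS']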

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
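
# E.g. group('a', 'b') == '(a|b)', any('a', 'b') == '(a|b)*', and
# maybe('a', 'b') == '(a|b)?'.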

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    # The valid string prefixes.  Only the lower case versions are listed,
    # without permutations ('fr' is included, but not 'rf'); all case and
    # order permutations are generated below.
    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = {''}
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # generate every combination of upper and lower case for
            # each character in this ordering
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result
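
# For instance, the single entry 'fr' expands to all eight case/order
# variants: 'fr', 'fR', 'Fr', 'FR', 'rf', 'rF', 'Rf', 'RF'.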

def _compile(expr):
    return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
# StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Sorting in reverse order puts the long operators before their prefixes.
# Otherwise if = came before ==, == would get recognized as two instances
# of =.
Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
Funny = group(r'\r?\n', Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# For a given string prefix plus quotes, endpats maps it to a regex
# to match the remainder of that string.  _prefix can be empty, for
# a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3

# A set of all of the single and triple quoted string prefixes,
# including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)
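
# For example, single_quoted contains "'", '"', "r'", 'Rb"', and so on,
# while triple_quoted contains "'''", '"""', "f'''", 'BR"""', etc.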

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in _itertools.chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
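
# E.g. _get_normal_name('UTF_8') returns 'utf-8', and
# _get_normal_name('Latin-1') returns 'iso-8859-1'.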

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError is raised.  If the encoding cookie is an
    invalid charset, a SyntaxError is also raised.  Note that if a UTF-8 BOM
    is found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8.  Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                                                             encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
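
# A short sketch of detect_encoding() on an in-memory source (io.BytesIO
# stands in for a real file):
#
#     >>> from io import BytesIO
#     >>> src = b'# -*- coding: latin-1 -*-\nx = 1\n'
#     >>> detect_encoding(BytesIO(src).readline)
#     ('iso-8859-1', [b'# -*- coding: latin-1 -*-\n'])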


def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = _builtin_open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(buffer.readline)
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except:
        buffer.close()
        raise


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    physical line.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    encoding, consumed = detect_encoding(readline)
    empty = _itertools.repeat(b"")
    rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
    return _tokenize(rl_gen.__next__, encoding)


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    last_line = b''
    line = b''
    while True:                                # loop over lines in stream
        try:
            # We capture the value of the line variable here because
            # readline uses the empty string '' to signal end of input,
            # hence `line` itself will always be overwritten at the end
            # of this loop.
            last_line = line
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
                    pos += len(comment_token)

                yield TokenInfo(NL, line[pos:],
                                (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or                 # ordinary number
                        (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    if parenlev > 0:
                        yield TokenInfo(NL, token, spos, epos, line)
                    else:
                        yield TokenInfo(NEWLINE, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)

                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break

                # Check up to the first 3 chars of the token to see if
                # they're in the single_quoted set.  If so, they start
                # a string.
                # We're using the first 3, because we're looking for
                # "rb'" (for example) at the start of the token.  If
                # we switch to longer prefixes, this needs to be
                # adjusted.
                # Note that initial == token[:1].
                # Also note that single quote checking must come after
                # triple quote checking (above).
                elif (initial in single_quoted or
                      token[:2] in single_quoted or
                      token[:3] in single_quoted):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Again, using the first 3 chars of the
                        # token.  This is looking for the matching end
                        # regex for the correct type of quote
                        # character.  So it's really looking for
                        # endpats["'"] or endpats['"'], by trying to
                        # skip string prefix characters, if any.
                        endprog = _compile(endpats.get(initial) or
                                           endpats.get(token[1]) or
                                           endpats.get(token[2]))
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)

                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                                (lnum, pos), (lnum, pos+1), line)
                pos += 1

    # Add an implicit NEWLINE if the input doesn't end in one
    if last_line and last_line[-1] not in '\r\n':
        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)),
                        (lnum - 1, len(last_line) + 1), '')
    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


def generate_tokens(readline):
    """Tokenize a source reading Python code as unicode strings.

    This has the same API as tokenize(), except that it expects the *readline*
    callable to return str objects instead of bytes.
    """
    return _tokenize(readline, None)

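
# A sketch of generate_tokens(), which accepts a str-producing readline
# (io.StringIO stands in for a real text file):
#
#     import io
#     for tok in generate_tokens(io.StringIO("x = 1\n").readline):
#         print(tok)

# Command-line use: `python -m tokenize file.py` prints one token per line;
# pass -e/--exact to label operators with their exact type (PLUS, EQUAL, ...).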
def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        sys.stderr.write(message)
        sys.stderr.write('\n')

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with _builtin_open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()