"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
import builtins
import re
import sys
from token import *
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
# Use a raw string so the \s and \w escapes reach the regex engine intact.
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

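# A quick illustration of the annotated repr above (the numeric type values
# come from token.py and can differ between Python versions; NAME is assumed
# to be 1 here):
#
#     >>> TokenInfo(NAME, 'spam', (1, 0), (1, 4), 'spam = 1\n')
#     TokenInfo(type=1 (NAME), string='spam', start=(1, 0), end=(1, 4), line='spam = 1\n')
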
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

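# A sketch of what these helpers produce:
#
#     group('a', 'b')  ->  '(a|b)'
#     any(r'\d')       ->  '(\d)*'
#     maybe('x')       ->  '(x)?'
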
# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

def _compile(expr):
    return re.compile(expr, re.UNICODE)

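# A rough illustration (hypothetical input) of how _tokenize() below uses
# this: PseudoToken skips leading whitespace and captures one token in
# group 1.
#
#     >>> m = _compile(PseudoToken).match("  x = 1", 0)
#     >>> m.span(1), m.group(1)
#     ((2, 3), 'x')
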
endpats = {"'": Single, '"': Double,
           "'''": Single3, '"""': Double3,
           "r'''": Single3, 'r"""': Double3,
           "b'''": Single3, 'b"""': Double3,
           "br'''": Single3, 'br"""': Double3,
           "R'''": Single3, 'R"""': Double3,
           "B'''": Single3, 'B"""': Double3,
           "bR'''": Single3, 'bR"""': Double3,
           "Br'''": Single3, 'Br"""': Double3,
           "BR'''": Single3, 'BR"""': Double3,
           'r': None, 'R': None, 'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        # Tokens arrive in source order, so a token can never start on an
        # earlier row than the previous token.
        assert row >= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)
182
183 def compat(self, token, iterable):
184 startline = False
185 indents = []
186 toks_append = self.tokens.append
187 toknum, tokval = token
Trent Nelson428de652008-03-18 22:41:35 +0000188
Thomas Wouters89f507f2006-12-13 04:49:30 +0000189 if toknum in (NAME, NUMBER):
190 tokval += ' '
191 if toknum in (NEWLINE, NL):
192 startline = True
Christian Heimesba4af492008-03-28 00:55:15 +0000193 prevstring = False
Thomas Wouters89f507f2006-12-13 04:49:30 +0000194 for tok in iterable:
195 toknum, tokval = tok[:2]
Trent Nelson428de652008-03-18 22:41:35 +0000196 if toknum == ENCODING:
197 self.encoding = tokval
198 continue
Thomas Wouters89f507f2006-12-13 04:49:30 +0000199
200 if toknum in (NAME, NUMBER):
201 tokval += ' '
202
Christian Heimesba4af492008-03-28 00:55:15 +0000203 # Insert a space between two consecutive strings
204 if toknum == STRING:
205 if prevstring:
206 tokval = ' ' + tokval
207 prevstring = True
208 else:
209 prevstring = False
210
Thomas Wouters89f507f2006-12-13 04:49:30 +0000211 if toknum == INDENT:
212 indents.append(tokval)
213 continue
214 elif toknum == DEDENT:
215 indents.pop()
216 continue
217 elif toknum in (NEWLINE, NL):
218 startline = True
219 elif startline and indents:
220 toks_append(indents[-1])
221 startline = False
222 toks_append(tokval)
Raymond Hettinger68c04532005-06-10 11:05:19 +0000223
Trent Nelson428de652008-03-18 22:41:35 +0000224
def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out

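# A self-contained sketch of the limited round-trip invariant documented
# above (hypothetical source text):
#
#     from io import BytesIO
#     source = b"x = 1\nprint(x)\n"
#     t1 = [tok[:2] for tok in tokenize(BytesIO(source).readline)]
#     t2 = [tok[:2] for tok in tokenize(BytesIO(untokenize(t1)).readline)]
#     assert t1 == t2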


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument,
    readline, in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

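# A short sketch of detect_encoding() on an in-memory stream (hypothetical
# source with a PEP 263 cookie):
#
#     from io import BytesIO
#     buf = BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n")
#     encoding, lines = detect_encoding(buf.readline)
#     # encoding == 'iso-8859-1'; lines holds the single line read so far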

def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = builtins.open(filename, 'rb')
    encoding, lines = detect_encoding(buffer.readline)
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, line_buffering=True)
    text.mode = 'r'
    return text


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternately, readline
    can be a callable object terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)

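# A minimal end-to-end sketch (hypothetical file name):
#
#     with builtins.open("example.py", "rb") as f:
#         for tok in tokenize(f.readline):
#             print(tok)
#
# The first TokenInfo is always ENCODING; the file's tokens follow, ending
# with ENDMARKER.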

def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                                    (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    # line[pos] is '\r' or '\n' here, so this is a blank line
                    yield TokenInfo(NL, line[pos:],
                                    (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                    # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if (initial in numchars or     # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                                    token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:               # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)       # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':      # continued string
                        strstart = (lnum, start)
                        endprog = _compile(endpats[initial] or
                                           endpats[token[1]] or
                                           endpats[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                      # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)
                elif initial.isidentifier():   # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':          # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                                (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards compatible API for all the places in the
# standard library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)

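# Sketch: unlike tokenize(), generate_tokens() accepts str lines rather than
# bytes (hypothetical input):
#
#     from io import StringIO
#     for tok in generate_tokens(StringIO("x = 1\n").readline):
#         print(tok)
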
def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with builtins.open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token.type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except IOError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()