"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
import re
import sys
from token import *
from codecs import lookup, BOM_UTF8
# Raw string so the \s and \w escapes reach the regex engine verbatim.
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

import token
__all__ = [x for x in dir(token) if not x.startswith("_")]
__all__.extend(["COMMENT", "tokenize", "detect_encoding", "NL", "untokenize",
                "ENCODING", "TokenInfo"])
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3

class TokenInfo(tuple):
    'TokenInfo(type, string, start, end, line)'

    __slots__ = ()

    _fields = ('type', 'string', 'start', 'end', 'line')

    def __new__(cls, type, string, start, end, line):
        return tuple.__new__(cls, (type, string, start, end, line))

    @classmethod
    def _make(cls, iterable, new=tuple.__new__, len=len):
        'Make a new TokenInfo object from a sequence or iterable'
        result = new(cls, iterable)
        if len(result) != 5:
            raise TypeError('Expected 5 arguments, got %d' % len(result))
        return result

    def __repr__(self):
        return 'TokenInfo(type=%r, string=%r, start=%r, end=%r, line=%r)' % self

    def _asdict(self):
        'Return a new dict which maps field names to their values'
        return dict(zip(self._fields, self))

    def _replace(self, **kwds):
        'Return a new TokenInfo object replacing specified fields with new values'
        result = self._make(map(kwds.pop, ('type', 'string', 'start', 'end', 'line'), self))
        if kwds:
            raise ValueError('Got unexpected field names: %r' % kwds.keys())
        return result

    def __getnewargs__(self):
        return tuple(self)

    type = property(lambda t: t[0])
    string = property(lambda t: t[1])
    start = property(lambda t: t[2])
    end = property(lambda t: t[3])
    line = property(lambda t: t[4])
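
# Editorial note: TokenInfo hand-expands a collections.namedtuple, so fields
# are reachable by index or by attribute.  A quick sketch:
#
#     tok = TokenInfo(NUMBER, '42', (1, 0), (1, 2), '42\n')
#     assert tok.string == tok[1] == '42'
#     assert tok._replace(string='7').string == '7'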

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
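
# Editorial note: these helpers only assemble pattern text; for example
# group('0[xX]', '0[bB]') yields '(0[xX]|0[bB])', any() appends '*', and
# maybe() appends '?', so the Ignore pattern below expands to
# r'[ \f\t]*(\\\r?\n[ \f\t]*)*(#[^\r\n]*)?'.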

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
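
# Editorial note: pseudoprog (compiled just below) is applied with .match()
# at each scan position; the leading Whitespace sits outside group 1, which
# is why _tokenize() takes pseudomatch.span(1) as the token boundaries.
#
#     m = pseudoprog.match("    x = 1", 0)
#     assert m.span(1) == (4, 5)    # the NAME 'x'; indentation skipped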

def _compile(expr):
    return re.compile(expr, re.UNICODE)

tokenprog, pseudoprog, single3prog, double3prog = map(
    _compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": _compile(Single), '"': _compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

del _compile

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
198 assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token

        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two-element sequences are passed, exact whitespace is lost
    and the output is merely token-equivalent to the input.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out
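
# Editorial round-trip sketch: full 5-tuples reproduce the source exactly,
# while bare (type, string) pairs fall back to the lossier compat() path.
#
#     from io import BytesIO
#     src = b"x = 1\n"
#     toks = list(tokenize(BytesIO(src).readline))
#     assert untokenize(toks) == src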


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if (enc in ("latin-1", "iso-8859-1", "iso-latin-1") or
            enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-"))):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP-0263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, a SyntaxError will also be raised.  Note that if a UTF-8
    BOM is found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
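
# Editorial usage sketch: detect_encoding() reads at most two lines and also
# returns whatever it consumed, so a caller can replay those lines.
#
#     from io import BytesIO
#     buf = BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n")
#     encoding, consumed = detect_encoding(buf.readline)
#     # encoding == 'iso-8859-1'; consumed == [b'# -*- coding: latin-1 -*-\n']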


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternately, readline may be
    a callable that raises StopIteration when the input is exhausted:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    # detect_encoding() may already have buffered a line or two; replay those
    # first, then the live stream, then an endless tail of b"" so __next__
    # never raises StopIteration.
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                                    (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    # line[pos] is '\r' or '\n' here; the '#' case is handled above.
                    yield TokenInfo(NL, line[pos:],
                                    (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                    # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if (initial in numchars or     # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                                    token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:               # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif (initial in single_quoted or
                      token[:2] in single_quoted or
                      token[:3] in single_quoted):
                    if token[-1] == '\n':      # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                      # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)
                elif initial.isidentifier():   # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':          # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                                (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards-compatible API for all the places in the
# standard library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)
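

# Editorial self-test sketch (not in the original module): tokenize a small
# snippet and verify the documented round-trip invariant.
if __name__ == '__main__':
    from io import BytesIO
    example = b"def f(a, b):\n    return a + b\n"
    tokens = list(tokenize(BytesIO(example).readline))
    for tok in tokens:
        print(tok)
    assert untokenize(tokens) == example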