"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of
bytes into Python tokens.  It decodes the bytes according to
PEP-0263 for determining source file encoding.

It accepts a readline-like method which is called
repeatedly to get the next line of input (or b"" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')

import re, string, sys
from token import *
from codecs import lookup
from itertools import chain
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "detect_encoding", "NL", "untokenize", "ENCODING"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

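# A quick sketch of what the helpers above produce (illustrative only):
#
#   >>> group('a', 'b')
#   '(a|b)'
#   >>> any(r'\d')
#   '(\\d)*'
#   >>> maybe('-')
#   '(-)?'
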
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9]\d*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

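# A few sanity checks for the number patterns (illustrative examples, not
# test vectors from this module):
#
#   >>> bool(re.match(Number + '$', '0x1F'))      # Hexnumber
#   True
#   >>> bool(re.match(Number + '$', '3.14e-2j'))  # Imagnumber
#   True
#   >>> bool(re.match(Number + '$', '0o17'))      # Octnumber
#   True
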
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

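# With the longest alternatives first, '**=' is matched as one operator
# token rather than '**' followed by '='.  A sketch (illustrative only):
#
#   >>> re.match(Operator, '**=').group()
#   '**='
#   >>> re.match(Operator, '->').group()
#   '->'
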
Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'b': None, 'B': None}

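# pseudoprog matches exactly one token (or PseudoExtras item) at a given
# position, with group 1 spanning the token text.  A sketch (illustrative
# only):
#
#   >>> m = pseudoprog.match("spam = 42\n", 4)
#   >>> m.span(1), m.group(1)
#   ((5, 6), '=')
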
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

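# These tables drive prefix dispatch in _tokenize: the first one, two, or
# three characters of a candidate token are looked up to decide whether a
# string starts here.  For example (illustrative only):
#
#   >>> "br'''" in triple_quoted, "b'" in single_quoted
#   (True, True)
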
tabsize = 8

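# A tab advances the column to the next multiple of tabsize, i.e.
# column = (column//tabsize + 1)*tabsize, as in the whitespace-measuring
# loop in _tokenize below.  For example, a tab at column 3 moves to
# column 8, and a tab at column 8 moves to column 16.
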
class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token

        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out

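# A minimal round-trip sketch (illustrative only; assumes the full 5-tuple
# mode, so whitespace is reconstructed exactly):
#
#   from io import BytesIO
#   source = b"x = 1\n"
#   toks = list(tokenize(BytesIO(source).readline))
#   assert untokenize(toks) == source
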

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument,
    readline, in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP-0263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    utf8_bom = b'\xef\xbb\xbf'
    bom_found = False
    encoding = None
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            pass
        else:
            matches = cookie_re.findall(line_string)
            if matches:
                encoding = matches[0]
                if bom_found and lookup(encoding).name != 'utf-8':
                    # This behaviour mimics the Python interpreter
                    raise SyntaxError('encoding problem: utf-8')
                return encoding

    first = read_or_stop()
    if first.startswith(utf8_bom):
        bom_found = True
        first = first[3:]
        if not first:
            return 'utf-8', []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return 'utf-8', [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return 'utf-8', [first, second]

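# A minimal sketch of detect_encoding in use (illustrative only):
#
#   from io import BytesIO
#   buf = BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n")
#   encoding, consumed = detect_encoding(buf.readline)
#   # encoding == 'latin-1'; consumed holds the one line read so far.
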
def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    encoding, consumed = detect_encoding(readline)
    def readline_generator():
        while True:
            try:
                yield readline()
            except StopIteration:
                return
    chained = chain(consumed, readline_generator())
    return _tokenize(chained.__next__, encoding)

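# A minimal sketch of driving the generator (illustrative only):
#
#   from io import BytesIO
#   for tok in tokenize(BytesIO(b"x = 1\n").readline):
#       print(tok)
#   # The first token is (ENCODING, 'utf-8', ...), then NAME, OP, NUMBER, ...
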
def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        yield (ENCODING, encoding, (0, 0), (0, 0), '')
    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if (initial in numchars or                 # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards-compatible API for all the places in the
# standard library that expect to be able to use tokenize with strings.
def generate_tokens(readline):
    return _tokenize(readline, None)
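
# A sketch of the compatibility API above (illustrative only): it skips
# encoding detection, so readline must yield strings rather than bytes,
# and no ENCODING token is produced.
#
#   from io import StringIO
#   toks = list(generate_tokens(StringIO("x = 1\n").readline))
#   # toks[0] is the NAME token for 'x'.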