blob: d7043b1bc39e0dfa93aee41f1000c6247af15ec6 [file] [log] [blame]
"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of
bytes into Python tokens.  It decodes the bytes according to
PEP-0263 for determining source file encoding.

It accepts a readline-like method which is called
repeatedly to get the next line of input (or b"" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream."""
Guido van Rossumb51eaa11997-03-07 00:21:55 +000021
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')

import re, string, sys
from token import *
from codecs import lookup
from itertools import chain, repeat

# PEP 263 coding cookie, e.g. "# -*- coding: utf-8 -*-".
# Raw string: "\s" and "\w" are regex escapes, not string escapes.
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "detect_encoding", "NL", "untokenize", "ENCODING"]
del token
37
# Extra token types that the token module does not define: tokenize also
# reports comments, non-logical ("blank") newlines, and the source encoding.
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
# Keep N_TOKENS consistent with the three additions above.
N_TOKENS += 3
Guido van Rossum1aec3231997-04-08 14:24:39 +000045
def group(*choices):
    """Join *choices* into one parenthesized regex alternation."""
    return "(%s)" % "|".join(choices)

def any(*choices):
    """Regex matching zero or more repetitions of the alternation.

    Note: intentionally shadows builtins.any, per this module's
    long-standing convention.
    """
    return group(*choices) + "*"

def maybe(*choices):
    """Regex matching at most one occurrence of the alternation."""
    return group(*choices) + "?"
Guido van Rossum4d8e8591992-01-01 19:34:47 +000049
# ---- Regular expressions describing Python's lexical structure -------------

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
# A physical line may be continued with a trailing backslash-newline.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

# Integer literals (Python 3: 0x/0b/0o prefixes, no trailing 'l'/'L').
Hexnumber = r'0[xX][\da-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9]\d*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# Compiled programs used by the tokenizer loop below.
tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
# Maps a string opener (or its prefix letter) to the regex that finds its end;
# the None entries mark bare prefix letters that are not openers by themselves.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'b': None, 'B': None}

# All legal triple-quote openers, with every prefix-letter case combination.
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
# All legal single-quote openers, likewise.
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"' ):
    single_quoted[t] = t

# A tab advances the column to the next multiple of this width.
tabsize = 8
Fred Drake9b8d8012000-08-17 04:45:13 +0000133
# Raised when EOF is hit inside an unterminated string or statement.
class TokenError(Exception): pass

# Historical: raised by tokeneater callbacks to stop tokenization early.
class StopTokenizing(Exception): pass
Fred Drake9b8d8012000-08-17 04:45:13 +0000137
Tim Peters5ca576e2001-06-18 22:08:13 +0000138
class Untokenizer:
    """Reconstruct source text from an iterable of token tuples.

    Full 5-tuples are placed at their recorded positions; once a bare
    (type, string) 2-tuple is seen, compat() takes over and re-spaces
    the remaining output heuristically.
    """

    def __init__(self):
        self.tokens = []        # accumulated output fragments
        self.prev_row = 1       # end position of the previous token
        self.prev_col = 0
        self.encoding = None    # set from an ENCODING token, if one is seen

    def add_whitespace(self, start):
        """Append spaces so the next token lands at column *start*."""
        row, col = start
        if row < self.prev_row:
            # Token positions must move forward through the source;
            # the old `assert row <= self.prev_row` had this backwards.
            raise ValueError("start row {} precedes previous row {}".format(
                row, self.prev_row))
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        """Return the source text for *iterable* of 5-tuples (or 2-tuples)."""
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                # Remember the encoding so the module-level untokenize()
                # can encode the result back to bytes.
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        """Best-effort output when only (type, string) pairs are available."""
        startline = False
        indents = []
        toks_append = self.tokens.append

        # Process the leading token through the same loop as the rest:
        # previously it was examined but never appended, silently dropping
        # the first token of any stream that did not begin with ENCODING
        # (and ignoring an ENCODING first token entirely).
        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)
Raymond Hettinger68c04532005-06-10 11:05:19 +0000202
Trent Nelson428de652008-03-18 22:41:35 +0000203
def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    # Only encode when an ENCODING token was seen; otherwise return str.
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out
Raymond Hettinger68c04532005-06-10 11:05:19 +0000229
Trent Nelson428de652008-03-18 22:41:35 +0000230
def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument,
    readline, in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263.  If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised.  A cookie naming an unknown
    codec also raises SyntaxError.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    utf8_bom = b'\xef\xbb\xbf'
    bom_found = False
    encoding = None

    def read_or_stop():
        # Treat StopIteration from the readline callable as end of input.
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        """Return the encoding named by a PEP 263 cookie on *line*, or None."""
        try:
            # PEP 263 requires the cookie line to be ASCII-decodable.
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = matches[0]
        try:
            codec = lookup(encoding)
        except LookupError:
            # Report a bogus cookie as a syntax problem rather than
            # letting codecs.LookupError escape to the caller.
            raise SyntaxError("unknown encoding: " + encoding)
        if bom_found and codec.name != 'utf-8':
            # This behaviour mimics the Python interpreter
            raise SyntaxError('encoding problem: utf-8')
        return encoding

    first = read_or_stop()
    if first.startswith(utf8_bom):
        bom_found = True
        first = first[3:]
        if not first:
            return 'utf-8', []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return 'utf-8', [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return 'utf-8', [first, second]
290
291
def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    encoding, consumed = detect_encoding(readline)

    def remaining_lines():
        # Pull lines until the readline callable signals EOF by
        # raising StopIteration.
        while True:
            try:
                yield readline()
            except StopIteration:
                return

    # Replay the lines detect_encoding() consumed, then the rest.
    source_lines = chain(consumed, remaining_lines())
    return _tokenize(source_lines.__next__, encoding)
320
321
def _tokenize(readline, encoding):
    """Internal tokenizer loop shared by tokenize() and generate_tokens().

    *readline* returns one line per call -- bytes when *encoding* is not
    None (each line is decoded first), otherwise str.  Yields 5-tuples
    (type, string, (srow, scol), (erow, ecol), line), preceded by an
    ENCODING token when *encoding* is given.  Raises TokenError on EOF
    inside a multi-line string or statement, and IndentationError when a
    dedent does not match any outer indentation level.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        yield (ENCODING, encoding, (0, 0), (0, 0), '')
    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                # Floor division: plain '/' under PEP 238 true division
                # would silently turn the column counter into a float.
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                    # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if (initial in numchars or     # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:               # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)   # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':      # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                      # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:     # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':          # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000459
Trent Nelson428de652008-03-18 22:41:35 +0000460
# An undocumented, backwards compatible, API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    # Like tokenize(), but readline returns str and no ENCODING token
    # is emitted (encoding=None skips both decode and the leading token).
    return _tokenize(readline, None)