"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of
bytes into Python tokens.  It decodes the bytes according to
PEP 263 for determining source file encoding.

It accepts a readline-like method which is called
repeatedly to get the next line of input (or b"" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token streams start with an ENCODING token
which tells you which encoding was used to decode the bytes stream."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')

import re, string, sys
from token import *
from codecs import lookup
from itertools import chain, repeat
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

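# For example (an illustrative sketch), the cookie regex pulls the encoding
# name out of a PEP 263 declaration:
#
#     cookie_re.findall("# -*- coding: latin-1 -*-")  -->  ['latin-1']
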
import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "detect_encoding", "NL", "untokenize", "ENCODING"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
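
# For example (illustrative only):
#
#     group('0x', '0o')  -->  '(0x|0o)'
#     any(r'\d')         -->  r'(\d)*'
#     maybe(r'\d')       -->  r'(\d)?'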

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
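# Number matches e.g. '0xFF', '0o17', '0b101', '3.14', '1e-3' and '2j'
# (an illustrative sample, not an exhaustive list).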

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        # Run the first token through the same loop as the rest; handling
        # the head token separately would drop its text from the output
        # (and would miss an ENCODING token arriving as a 2-tuple).
        startline = False
        indents = []
        toks_append = self.tokens.append
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two-element sequences are passed, positions are unknown and
    the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out

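# A minimal round-trip sketch (illustrative, not part of the module API):
#
#     from io import BytesIO
#     source = b"x = 1\n"
#     toks = list(tokenize(BytesIO(source).readline))
#     untokenize(toks)                    # full 5-tuples --> b'x = 1\n'
#     untokenize([t[:2] for t in toks])   # 2-tuples: spacing is approximate
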

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument,
    readline, in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    utf8_bom = b'\xef\xbb\xbf'
    bom_found = False
    encoding = None
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            pass
        else:
            matches = cookie_re.findall(line_string)
            if matches:
                encoding = matches[0]
                if bom_found and lookup(encoding).name != 'utf-8':
                    # This behaviour mimics the Python interpreter
                    raise SyntaxError('encoding problem: utf-8')
                return encoding

    first = read_or_stop()
    if first.startswith(utf8_bom):
        bom_found = True
        first = first[3:]
        if not first:
            return 'utf-8', []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return 'utf-8', [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return 'utf-8', [first, second]

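# Example sketch (illustrative):
#
#     from io import BytesIO
#     buf = BytesIO(b"# -*- coding: latin-1 -*-\nprint('hi')\n")
#     detect_encoding(buf.readline)
#     # --> ('latin-1', [b'# -*- coding: latin-1 -*-\n'])
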

def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    encoding, consumed = detect_encoding(readline)
    def readline_generator():
        while True:
            try:
                yield readline()
            except StopIteration:
                return
    chained = chain(consumed, readline_generator())
    return _tokenize(chained.__next__, encoding)

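# Typical use, as a sketch (any binary file-like object's readline works):
#
#     with open('example.py', 'rb') as f:
#         for tok_type, tok_string, start, end, line in tokenize(f.readline):
#             print(tok_name[tok_type], repr(tok_string))
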

def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        yield (ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                # floor division keeps column an int under Python 3 semantics
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:                          # blank line: line[pos] is \r or \n
                    yield (NL, line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if (initial in numchars or                 # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                        token[:2] in single_quoted or \
                        token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards-compatible API for all the places in the
# standard library that expect to be able to use tokenize with strings.
def generate_tokens(readline):
    return _tokenize(readline, None)
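

# A minimal self-test sketch (illustrative; the demo source is made up and
# runs only when this module is executed directly):
if __name__ == '__main__':
    import io
    demo = b"def f(x):\n    return x + 1\n"
    for tok in tokenize(io.BytesIO(demo).readline):
        print(tok)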