blob: e9114c6aa75e6c20c005d79c208655dbf4d3151b [file] [log] [blame]
Guido van Rossumb51eaa11997-03-07 00:21:55 +00001"""Tokenization help for Python programs.
Guido van Rossum4d8e8591992-01-01 19:34:47 +00002
Trent Nelson428de652008-03-18 22:41:35 +00003tokenize(readline) is a generator that breaks a stream of
4bytes into Python tokens. It decodes the bytes according to
5PEP-0263 for determining source file encoding.
6
7It accepts a readline-like method which is called
8repeatedly to get the next line of input (or b"" for EOF). It generates
Tim Peters4efb6e92001-06-29 23:51:08 +000095-tuples with these members:
10
11 the token type (see token.py)
12 the token (a string)
13 the starting (row, column) indices of the token (a 2-tuple of ints)
14 the ending (row, column) indices of the token (a 2-tuple of ints)
15 the original line (string)
16
17It is designed to match the working of the Python tokenizer exactly, except
18that it produces COMMENT tokens for comments and gives type OP for all
operators. Additionally, all token lists start with an ENCODING token
20which tells you which encoding was used to decode the bytes stream."""
Guido van Rossumb51eaa11997-03-07 00:21:55 +000021
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')

import collections
import re
import string
import sys
from token import *
from codecs import lookup, BOM_UTF8

# Recognizes a PEP 263 coding cookie, e.g. "# -*- coding: utf-8 -*-".
# Must be a raw string: "\s" and "\w" are regex escapes, not string
# escapes, and a plain string would rely on them passing through untouched.
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")
Guido van Rossum4d8e8591992-01-01 19:34:47 +000032
import token
# Export everything the token module exports, plus the tokenize-specific
# names.  Bug fix: the list previously contained "Tokenize", a name that
# does not exist in this module (the namedtuple is "TokenInfo"), which made
# "from tokenize import *" fail with AttributeError.
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "detect_encoding", "NL", "untokenize", "ENCODING", "TokenInfo"]
del token
37
# Extend the numbering space from token.py with three token types that
# only the tokenize module produces.
COMMENT = N_TOKENS
NL = N_TOKENS + 1
ENCODING = N_TOKENS + 2
tok_name[COMMENT] = 'COMMENT'
tok_name[NL] = 'NL'
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3

# The 5-tuple yielded for every token:
# (type, string, start=(row, col), end=(row, col), line).
TokenInfo = collections.namedtuple('TokenInfo', 'type string start end line')
47
def group(*choices):
    """Join the alternatives in *choices* into one parenthesized regex group."""
    return '(%s)' % '|'.join(choices)

def any(*choices):
    """A group of *choices* repeated zero or more times (regex '*')."""
    return group(*choices) + '*'

def maybe(*choices):
    """An optional group of *choices* (regex '?')."""
    return group(*choices) + '?'
Guido van Rossum4d8e8591992-01-01 19:34:47 +000051
# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
# Whitespace, any number of backslash-newline continuations (each followed
# by more whitespace), and an optional trailing comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

# Integer literals in the four Python 3 bases; Decnumber disallows
# leading zeros on nonzero numbers (0o/0x/0b spellings are separate).
Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
# Floats: either a mantissa containing a dot with an optional exponent,
# or a plain integer with a mandatory exponent.
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
Guido van Rossum4d8e8591992-01-01 19:34:47 +000070
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Opening of a triple-quoted string, with optional bytes/raw prefix.
Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

# A complete token, optionally preceded by ignorable text.
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# Pattern tried at each position by the main tokenizer loop; group 1 spans
# the matched token (leading whitespace is outside the group).
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
Guido van Rossum1aec3231997-04-08 14:24:39 +0000106
# Compiled patterns: a full logical token, the "pseudo" token used by the
# tokenizer loop, and the two triple-quote tail matchers.
tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
# Map an opening quote (with optional b/r prefix) to the compiled regex
# that matches the remainder of the string.  Single-character prefix keys
# ('r', 'b', ...) map to None; they exist so indexing by a prefix character
# is safe in the tokenizer's continued-string logic.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'b': None, 'B': None}
Guido van Rossum4d8e8591992-01-01 19:34:47 +0000120
# Sets (stored as identity dicts) of every legal opening-quote spelling:
# each of the valid b/r prefix combinations crossed with the two quote
# styles, for triple-quoted and single-quoted strings respectively.
triple_quoted = {}
single_quoted = {}
for prefix in ("", "r", "R", "b", "B", "br", "Br", "bR", "BR"):
    for quote in ("'''", '"""'):
        triple_quoted[prefix + quote] = prefix + quote
    for quote in ("'", '"'):
        single_quoted[prefix + quote] = prefix + quote

# Tab stops are every 8 columns when measuring indentation.
tabsize = 8
Fred Drake9b8d8012000-08-17 04:45:13 +0000137
class TokenError(Exception):
    """Raised for tokenization errors, e.g. EOF inside a multi-line string."""


class StopTokenizing(Exception):
    """Exception used to signal that tokenization should stop."""
Fred Drake9b8d8012000-08-17 04:45:13 +0000141
Tim Peters5ca576e2001-06-18 22:08:13 +0000142
class Untokenizer:
    """Reconstruct source text from a stream of token tuples.

    Full 5-tuples reproduce the original source layout exactly; as soon
    as a bare (type, string) 2-tuple is seen, the rest of the stream is
    handled by compat(), which only guarantees that the output tokenizes
    back to the same (type, string) pairs.
    """

    def __init__(self):
        self.tokens = []        # accumulated output fragments
        self.prev_row = 1       # end position of the last emitted token
        self.prev_col = 0
        self.encoding = None    # set when an ENCODING token is seen

    def add_whitespace(self, start):
        # Pad with spaces from the previous token's end column up to this
        # token's start column so positions match the original source.
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        """Rebuild source from 5-tuples; fall back to compat() on 2-tuples.

        Returns the reconstructed source as a str; self.encoding records
        any ENCODING token encountered along the way.
        """
        for t in iterable:
            if len(t) == 2:
                # No position info available: delegate the remainder.
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        """Best-effort reconstruction from (type, string) 2-tuples.

        *token* is the 2-tuple the caller already pulled from *iterable*;
        iteration continues with the remaining tokens.
        """
        startline = False       # True right after a NEWLINE/NL token
        indents = []            # stack of INDENT strings currently active
        toks_append = self.tokens.append
        toknum, tokval = token

        # A trailing space keeps adjacent names/numbers from fusing.
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                # Re-emit the current indentation at the start of a line.
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)
Raymond Hettinger68c04532005-06-10 11:05:19 +0000215
Trent Nelson428de652008-03-18 22:41:35 +0000216
def untokenize(iterable):
    """Transform tokens back into Python source code.

    If the stream contains an ENCODING token (tokenize() always emits one
    first), the result is a bytes object encoded with that encoding;
    otherwise it is a str.

    Each element of *iterable* must be a token sequence with at least two
    members, a token number and a token value.  Streams of full 5-tuples
    round-trip the source exactly; streams of bare 2-tuples produce
    poorer output that still satisfies:

        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    untokenizer = Untokenizer()
    source = untokenizer.untokenize(iterable)
    if untokenizer.encoding is None:
        return source
    return source.encode(untokenizer.encoding)
Raymond Hettinger68c04532005-06-10 11:05:19 +0000242
Trent Nelson428de652008-03-18 22:41:35 +0000243
def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument,
    readline, in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    def read_or_stop():
        # Treat an exhausted readline the same as EOF.
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            # A coding cookie must be pure ASCII; no cookie on this line.
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = matches[0]
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found and codec.name != 'utf-8':
            # A cookie that contradicts the UTF-8 BOM is an error.
            # This behaviour mimics the Python interpreter
            raise SyntaxError('encoding problem: utf-8')
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]       # strip the 3-byte BOM before cookie search
        if not first:
            return 'utf-8', []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    # PEP 263 allows the cookie on the second line as well.
    second = read_or_stop()
    if not second:
        return 'utf-8', [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return 'utf-8', [first, second]
310
311
def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    encoding, consumed = detect_encoding(readline)

    def feed_lines():
        # Replay the lines detect_encoding() already consumed, then keep
        # pulling fresh lines until readline() signals exhaustion.
        for buffered_line in consumed:
            yield buffered_line
        while True:
            try:
                yield readline()
            except StopIteration:
                return

    return _tokenize(feed_lines().__next__, encoding)
342
343
def _tokenize(readline, encoding):
    """Core tokenizer loop shared by tokenize() and generate_tokens().

    *readline* yields one line per call (bytes if *encoding* is given,
    str otherwise); exhaustion is signalled by an empty line or by
    StopIteration.  Yields TokenInfo 5-tuples; when *encoding* is not
    None, an ENCODING token is emitted first and every line is decoded
    with that encoding.

    Raises TokenError on EOF inside an unterminated multi-line string or
    statement, and IndentationError on an inconsistent dedent.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0           # pending text of a multi-line string
    contline = None                     # raw source lines of that string
    indents = [0]                       # stack of active indentation columns

    if encoding is not None:
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                         # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum = lnum + 1
        pos, line_len = 0, len(line)

        if contstr:                     # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # A single-quoted string must continue with a backslash.
                yield TokenInfo(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:   # new statement
            if not line: break
            column = 0
            while pos < line_len:       # measure leading whitespace
                if line[pos] == ' ':
                    column = column + 1
                elif line[pos] == '\t':
                    # Advance to the next tab stop.  Bug fix: use floor
                    # division -- "/" is true division in Python 3 and
                    # would make the indentation column a float.
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos = pos + 1
            if pos == line_len: break   # whitespace-only line without newline

            if line[pos] in '#\r\n':    # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    # line[pos] is '\r' or '\n' here, so this is always NL.
                    yield TokenInfo(NL, line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:    # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                           # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < line_len:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:             # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if (initial in numchars or      # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # Inside brackets a newline is non-logical (NL).
                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                        # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)        # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':               # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                               # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)
                elif initial in namechars:              # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                   # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev = parenlev + 1
                    elif initial in ')]}':
                        parenlev = parenlev - 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:          # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000481
Trent Nelson428de652008-03-18 22:41:35 +0000482
# An undocumented, backwards compatible, API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    """Tokenize *readline* without decoding: each line is already a str."""
    return _tokenize(readline, None)