"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of
bytes into Python tokens.  It decodes the bytes according to
PEP-0263 for determining source file encoding.

It accepts a readline-like method which is called
repeatedly to get the next line of input (or b"" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream."""

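# A minimal usage sketch (illustrative only; the byte string is an arbitrary
# example, and BytesIO simply supplies the required readline callable):
#
#     from io import BytesIO
#     for tok in tokenize(BytesIO(b"x = 1\n").readline):
#         print(tok)
#
# The first 5-tuple produced is the ENCODING token ('utf-8' here), followed
# by NAME, OP, NUMBER, NEWLINE and ENDMARKER.
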
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')

import re, string, sys
from token import *
from codecs import lookup, BOM_UTF8
from itertools import chain, repeat
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")  # raw string: \s and \w are regex escapes

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "detect_encoding", "NL", "untokenize", "ENCODING"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

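# For example, group('a', 'b') returns the pattern '(a|b)', any('a', 'b')
# returns '(a|b)*', and maybe('a', 'b') returns '(a|b)?'.
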
# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
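# For instance, pseudoprog.match("  x = 1", 0).span(1) is (2, 3): the leading
# whitespace is consumed and group 1 captures the first token, the name 'x'.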
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        assert row >= self.prev_row  # positions must never move backwards
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token

        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only the two-element form is passed, the resulting output is poor,
    because exact column positions are then unavailable.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out

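# A round-trip sketch for untokenize() (illustrative; 'example.py' is a
# hypothetical file name):
#
#     with open('example.py', 'rb') as f:
#         tokens = list(tokenize(f.readline))
#     source = untokenize(tokens)   # bytes that match the input exactly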

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, a SyntaxError will also be raised.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = matches[0]
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found and codec.name != 'utf-8':
            # This behaviour mimics the Python interpreter
            raise SyntaxError('encoding problem: utf-8')
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        if not first:
            return 'utf-8', []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return 'utf-8', [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return 'utf-8', [first, second]

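# A short sketch of detect_encoding() on a buffer carrying a PEP 263 cookie
# (illustrative only):
#
#     from io import BytesIO
#     buf = BytesIO(b'# -*- coding: latin-1 -*-\nx = 1\n')
#     encoding, lines = detect_encoding(buf.readline)
#     # encoding == 'latin-1'; lines holds the single line read so far.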

def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable that signals EOF by raising StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    encoding, consumed = detect_encoding(readline)
    def readline_generator():
        while True:
            try:
                yield readline()
            except StopIteration:
                return
    chained = chain(consumed, readline_generator())
    return _tokenize(chained.__next__, encoding)

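# Typical file-based use of tokenize() (a sketch; 'hello.py' is a
# hypothetical file name):
#
#     with open('hello.py', 'rb') as f:
#         for tok_type, tok_string, start, end, line in tokenize(f.readline):
#             print(tok_name[tok_type], repr(tok_string))
#
# The ENCODING token arrives first, so the decoding used for the rest of
# the stream is known before any real tokens are seen.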

def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        yield (ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                    # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if (initial in numchars or     # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:               # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)   # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':      # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                      # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:     # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':          # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards-compatible API for all the places in the
# standard library that expect to be able to use tokenize with strings.
def generate_tokens(readline):
    return _tokenize(readline, None)
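# Sketch of the string-based compatibility path (generate_tokens consumes
# str lines rather than bytes; illustrative only):
#
#     from io import StringIO
#     for tok in generate_tokens(StringIO("x = 1\n").readline):
#         print(tok)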