"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of
bytes into Python tokens.  It decodes the bytes according to
PEP 263 for determining source file encoding.

It accepts a readline-like method which is called
repeatedly to get the next line of input (or b"" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream."""
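
# A minimal usage sketch (illustrative only; BytesIO stands in for a real
# binary file object):
#
#     from io import BytesIO
#     for tok in tokenize(BytesIO(b"x = 1\n").readline):
#         print(tok)
#
# The first 5-tuple yielded is the ENCODING token, here
# (ENCODING, 'utf-8', (0, 0), (0, 0), '').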

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')

import re, string, sys
from token import *
from codecs import lookup, BOM_UTF8
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "detect_encoding", "NL", "untokenize", "ENCODING"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
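
# For example, group('a', 'b') == '(a|b)', any('x') == '(x)*', and
# maybe('y') == '(y)?'.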

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
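
# Sample literals matched by Number: '0xff' (Hexnumber), '0b101' (Binnumber),
# '0o755' (Octnumber), '42' (Decnumber), '3.14' and '.5e-3' (Floatnumber),
# and '1j' (Imagnumber).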

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
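# endprogs maps an opening quote (with any string prefix) to the compiled
# pattern matching the rest of the literal; single-character prefixes map
# to None so that lookups in _tokenize can fall through to the quote char.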
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        # Tokens arrive in source order, so a token can never start before
        # the recorded end of the previous one.
        assert row >= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token

        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only the first two elements are given, position information is lost
    and the output uses approximate whitespace.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out
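
# Round-trip sketch (illustrative; "example.py" is a placeholder name):
#
#     with open('example.py', 'rb') as f:
#         source = untokenize(tokenize(f.readline))
#
# With full 5-tuples the resulting bytes should match the original file
# exactly (the round-trip invariant above).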


def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument,
    readline, in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, a SyntaxError is also raised.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = matches[0]
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found and codec.name != 'utf-8':
            # This behaviour mimics the Python interpreter
            raise SyntaxError('encoding problem: utf-8')
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        if not first:
            return 'utf-8', []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return 'utf-8', [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return 'utf-8', [first, second]
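
# For example (a sketch; BytesIO stands in for a real binary file object):
#
#     from io import BytesIO
#     buf = BytesIO(b'# -*- coding: latin-1 -*-\nx = 1\n')
#     detect_encoding(buf.readline)
#     # -> ('latin-1', [b'# -*- coding: latin-1 -*-\n'])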


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    encoding, consumed = detect_encoding(readline)
    # Replay the lines detect_encoding() has already consumed, then keep
    # pulling fresh lines from readline.
    def readline_generator(consumed):
        for line in consumed:
            yield line
        while True:
            try:
                yield readline()
            except StopIteration:
                return
    chained = readline_generator(consumed)
    return _tokenize(chained.__next__, encoding)
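
# Typical use (a sketch; "myfile.py" is a placeholder path):
#
#     with open('myfile.py', 'rb') as f:
#         for toknum, tokval, start, end, line in tokenize(f.readline):
#             print(tok_name[toknum], repr(tokval))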


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        yield (ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                # Integer division: '/' would yield a float column in Python 3.
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                    # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if (initial in numchars or     # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:               # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':      # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                      # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:     # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':          # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards-compatible API for the places in the standard
# library that expect to be able to use tokenize with strings.
def generate_tokens(readline):
    return _tokenize(readline, None)
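
# Unlike tokenize(), generate_tokens() expects str lines and yields no
# ENCODING token, e.g. (a sketch):
#
#     from io import StringIO
#     tokens = list(generate_tokens(StringIO("x = 1\n").readline))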