blob: 9322e0fe1bcd1a46d80da9692b7b07687f4f5980 [file] [log] [blame]
Guido van Rossumb51eaa11997-03-07 00:21:55 +00001"""Tokenization help for Python programs.
Guido van Rossum4d8e8591992-01-01 19:34:47 +00002
Tim Peters4efb6e92001-06-29 23:51:08 +00003generate_tokens(readline) is a generator that breaks a stream of
Guido van Rossum1aec3231997-04-08 14:24:39 +00004text into Python tokens. It accepts a readline-like method which is called
Tim Peters4efb6e92001-06-29 23:51:08 +00005repeatedly to get the next line of input (or "" for EOF). It generates
65-tuples with these members:
7
8 the token type (see token.py)
9 the token (a string)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
13
It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.
17
18Older entry points
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21are the same, except instead of generating tokens, tokeneater is a callback
22function to which the 5 fields described above are passed as 5 arguments,
23each time a new token is found."""
Guido van Rossumb51eaa11997-03-07 00:21:55 +000024
# Module metadata: original author and later contributors.
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger')
Guido van Rossumb51eaa11997-03-07 00:21:55 +000028
Guido van Rossum3b631771997-10-27 20:44:15 +000029import string, re
Guido van Rossumfc6f5331997-03-07 00:21:12 +000030from token import *
Guido van Rossum4d8e8591992-01-01 19:34:47 +000031
import token
# Public API: every public name of the token module, followed by the names
# this module adds itself.
__all__ = []
for x in dir(token):
    if not x.startswith('_'):
        __all__.append(x)
__all__.extend(["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"])
# Neither the loop variable nor the module object belongs in the namespace.
del x
del token
37
# Two token types that only this module produces.  They take the next two
# free numbers after the ones imported from token, and are registered in
# tok_name so they can be printed by name.
COMMENT, NL = N_TOKENS, N_TOKENS + 1
tok_name.update({COMMENT: 'COMMENT', NL: 'NL'})
N_TOKENS += 2
Guido van Rossum1aec3231997-04-08 14:24:39 +000043
# Small helpers for assembling regular expressions from alternatives.
# NOTE: any() deliberately keeps its historical name even though it shadows
# the builtin for the rest of this module.
def group(*choices):
    """Return a regex group matching any one of the given sub-patterns."""
    return '(' + '|'.join(choices) + ')'

def any(*choices):
    """Return a regex matching zero or more repetitions of the choices."""
    return group(*choices) + '*'

def maybe(*choices):
    """Return a regex matching zero or one occurrence of the choices."""
    return group(*choices) + '?'

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Optional string-literal prefix.  The b/B forms must be accepted here
# because endprogs, triple_quoted and single_quoted all list them; without
# them a literal such as b'abc' is split into a NAME and a STRING.
StringPrefix = r'[uUbB]?[rR]?'

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# Group 1 of a PseudoToken match is the token text without leading whitespace.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
# Maps a string opener to the compiled pattern that matches the rest of the
# string.  Bare prefix letters map to None so that a lookup chain such as
# "endprogs[a] or endprogs[b]" skips over the prefix to the quote character.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}
Guido van Rossum4d8e8591992-01-01 19:34:47 +0000120
# Every accepted string-literal prefix: none, raw, unicode, bytes, and the
# raw combinations of the latter two, in each upper/lower-case spelling.
_string_prefixes = ('', 'r', 'R', 'u', 'U', 'ur', 'Ur', 'uR', 'UR',
                    'b', 'B', 'br', 'Br', 'bR', 'BR')

# Openers of triple-quoted strings, each mapped to itself; generate_tokens
# only tests membership.
triple_quoted = dict((prefix + quote, prefix + quote)
                     for prefix in _string_prefixes
                     for quote in ("'''", '"""'))
# Openers of single-quoted strings, built the same way.
single_quoted = dict((prefix + quote, prefix + quote)
                     for prefix in _string_prefixes
                     for quote in ("'", '"'))
141
# Distance between tab stops when measuring leading whitespace.
tabsize = 8


class TokenError(Exception):
    """Raised by generate_tokens() when input ends inside a multi-line
    string or a continued statement."""


class StopTokenizing(Exception):
    """Raised to end a tokenize() run early; tokenize() catches it and
    returns normally."""
Fred Drake9b8d8012000-08-17 04:45:13 +0000147
def printtoken(type, token, start, end, line): # for testing
    """Print one token as "srow,scol-erow,ecol:<TAB>TYPENAME<TAB>repr(token)".

    This is the default tokeneater of tokenize().  The five positional
    arguments are the members of a tuple produced by generate_tokens():
    token type, token string, (row, col) start, (row, col) end, and the
    source line (which is not printed).
    """
    # Unpack explicitly instead of using tuple parameters, which are
    # deprecated (PEP 3113); callers still pass the same five positionals.
    srow, scol = start
    erow, ecol = end
    # A single parenthesised argument prints identically whether print is a
    # statement or a function.
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))
Guido van Rossum4d8e8591992-01-01 19:34:47 +0000151
def tokenize(readline, tokeneater=printtoken):
    """
    Tokenize the input behind readline, passing every token to tokeneater.

    readline must be a callable with the same interface as the readline()
    method of built-in file objects: each call returns one line of input
    as a string.

    tokeneater must be a callable as well.  It is called once per token
    with five arguments, corresponding to the tuples generated by
    generate_tokens().  It may raise StopTokenizing to end the run early;
    that exception is absorbed here.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        # The callback asked to stop; treat this as a normal return.
        return
169
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Call tokeneater with the five fields of every token that
    generate_tokens(readline) produces.  Exceptions raised by the callback
    are not caught here."""
    for tok_type, tok_string, start, end, line in generate_tokens(readline):
        tokeneater(tok_type, tok_string, start, end, line)
Tim Peters5ca576e2001-06-18 22:08:13 +0000174
class Untokenizer:
    """Rebuild source text from a stream of tokens.

    Pieces of output text accumulate in self.tokens.  While 5-tuples are
    being processed, (prev_row, prev_col) is the position just after the
    last token written, so the gap before the next token can be filled in.
    """

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        """Append the filler needed to move from the previous token's end
        to start, a (row, col) pair."""
        row, col = start
        row_offset = row - self.prev_row
        if row_offset > 0:
            # The token starts on a later row although no NEWLINE/NL token
            # was seen, i.e. the statement was continued with a backslash.
            # Emit the continuation so the output keeps the same rows.
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset > 0:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        """Return the source text for the tokens in iterable.

        Tokens are normally 5-tuples as produced by generate_tokens().  As
        soon as a 2-tuple is met, it and all remaining tokens are handled
        by compat(), which ignores positions.
        """
        # Use one explicit iterator so compat() continues where this loop
        # stopped, whether iterable is a list or a one-shot iterator.
        it = iter(iterable)
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type in (DEDENT, ENDMARKER) and start[0] > self.prev_row:
                # Zero-width token reported on the row after the last text.
                # This happens at end of input when the final line has no
                # newline; there is nothing to write, and no continuation
                # must be invented for it.
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        """Append text for position-less tokens.

        token is the first 2-tuple that was met; iterable yields the tokens
        that follow it.  Only the first two items of each token are used.
        A space is added after every NAME and NUMBER, and the innermost
        INDENT string is re-emitted at the start of indented lines.
        """
        startline = False
        indents = []
        toks_append = self.tokens.append
        # Process the first token by the same rules as the rest, so it is
        # neither dropped nor exempt from the indentation bookkeeping.
        for source in ([token], iterable):
            for tok in source:
                toknum, tokval = tok[:2]

                if toknum in (NAME, NUMBER):
                    tokval += ' '

                if toknum == INDENT:
                    indents.append(tokval)
                    continue
                elif toknum == DEDENT:
                    indents.pop()
                    continue
                elif toknum in (NEWLINE, NL):
                    startline = True
                elif startline and indents:
                    toks_append(indents[-1])
                    startline = False
                toks_append(tokval)
Raymond Hettinger68c04532005-06-10 11:05:19 +0000230
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two-element tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    # A fresh Untokenizer per call keeps no state between calls.
    return Untokenizer().untokenize(iterable)
Raymond Hettinger68c04532005-06-10 11:05:19 +0000251
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the source line on which the token was found (for a string that
    spans several lines, all of those lines joined together).

    Raises TokenError when the input ends inside a multi-line string or a
    continued statement, and IndentationError when a line is dedented to a
    column that matches no enclosing indentation level.
    """
    # lnum: number of the line being scanned; parenlev: bracket nesting
    # depth; continued: set when the previous line ended with a backslash.
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    # contstr: text collected so far of a string that spans lines;
    # needcont: that string is single-quoted, so each of its lines must end
    # with a backslash; contline: the source lines collected for it.
    contstr, needcont = '', 0
    contline = None
    # Stack of indentation columns currently open.
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''                          # treated like end of file
        lnum = lnum + 1
        pos, max = 0, len(line)                # note: max shadows the builtin

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                # The string ends on this line; scanning resumes after it.
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # A single-quoted string was continued onto a line that
                # neither closes it nor ends with a backslash.
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                # A tab advances to the next multiple of tabsize.
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                # A form feed resets the column count.
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break               # only whitespace, no newline

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    # Comment-only line: the comment text and its line
                    # ending are reported as separate COMMENT and NL tokens.
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    # line[pos] is not '#' in this branch, so the index
                    # expression always selects NL.
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                # Group 1 is the token itself, without leading whitespace.
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # Inside brackets a line break does not end the statement.
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    # Only the opener was matched; look for the closing quotes.
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Prefix letters map to None in endprogs, so this
                        # chain lands on the entry for the quote character.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                # Nothing matches here: report one character and move on.
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000399
if __name__ == '__main__':                     # testing
    # Print the tokens of the file named by the first command-line
    # argument, or of standard input when no argument is given.
    import sys
    if len(sys.argv) > 1:
        source = open(sys.argv[1])
        try:
            tokenize(source.readline)
        finally:
            # Close the file even if tokenizing raises.
            source.close()
    else:
        tokenize(sys.stdin.readline)