blob: 76ea7a2ef99cfb7d3eb075983ba13f14730c6c40 [file] [log] [blame]
Guido van Rossumb51eaa11997-03-07 00:21:55 +00001"""Tokenization help for Python programs.
Guido van Rossum4d8e8591992-01-01 19:34:47 +00002
Tim Peters4efb6e92001-06-29 23:51:08 +00003generate_tokens(readline) is a generator that breaks a stream of
Guido van Rossum1aec3231997-04-08 14:24:39 +00004text into Python tokens. It accepts a readline-like method which is called
Tim Peters4efb6e92001-06-29 23:51:08 +00005repeatedly to get the next line of input (or "" for EOF). It generates
65-tuples with these members:
7
8 the token type (see token.py)
9 the token (a string)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
13
It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.
17
18Older entry points
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21are the same, except instead of generating tokens, tokeneater is a callback
22function to which the 5 fields described above are passed as 5 arguments,
23each time a new token is found."""
Guido van Rossumb51eaa11997-03-07 00:21:55 +000024
# Module metadata: primary author and additional contributors.
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = (
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro')
Guido van Rossumb51eaa11997-03-07 00:21:55 +000028
Guido van Rossum3b631771997-10-27 20:44:15 +000029import string, re
Guido van Rossumfc6f5331997-03-07 00:21:12 +000030from token import *
Guido van Rossum4d8e8591992-01-01 19:34:47 +000031
# Public API: every public name of the token module (re-exported here via
# "from token import *") plus the names this module adds.  generate_tokens
# is listed because it is the primary entry point described in the module
# docstring.
import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "generate_tokens", "NL"]
# The list comprehension variable leaks into the module namespace on
# Python 2; remove it, and the temporary module reference, so that neither
# becomes a module attribute.
del x
del token
36
# Two extra token types that the token module does not define: they take the
# next two free numbers after the existing types and are registered in
# tok_name so they can be printed by name.
COMMENT, NL = N_TOKENS, N_TOKENS + 1
tok_name[COMMENT] = 'COMMENT'
tok_name[NL] = 'NL'
N_TOKENS = NL + 1
Guido van Rossum1aec3231997-04-08 14:24:39 +000042
def group(*choices):
    """Return a regex that matches any one of the given alternatives.

    The alternatives are joined with '|' and wrapped in a (capturing)
    group: group('a', 'b') -> '(a|b)'.
    """
    return '(' + '|'.join(choices) + ')'

def any(*choices):
    """Return a regex matching zero or more repetitions of the alternatives."""
    # Extended call syntax replaces the deprecated apply() builtin.
    return group(*choices) + '*'

def maybe(*choices):
    """Return a regex matching at most one occurrence of the alternatives."""
    return group(*choices) + '?'
Guido van Rossum4d8e8591992-01-01 19:34:47 +000046
# Regular-expression source strings for the pieces of a line that are not
# tokens in their own right.
# Horizontal whitespace: spaces, form feeds and tabs (possibly none).
Whitespace = r'[ \f\t]*'
# A comment runs from '#' up to, but not including, the end of the line.
Comment = r'#[^\r\n]*'
# Everything that may precede a token: whitespace, any number of
# backslash-newline continuations (each followed by whitespace), and an
# optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
# An identifier: a letter or underscore followed by word characters.
Name = r'[a-zA-Z_]\w*'
Guido van Rossum4d8e8591992-01-01 19:34:47 +000051
# Numeric literals.  Integer forms accept an optional trailing 'l'/'L'
# (long integer) suffix.
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
# Also matches a plain '0', since the run of octal digits may be empty.
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
# Floats written with a decimal point ('1.', '1.5', '.5'), optionally
# followed by an exponent.
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
# Floats written without a decimal point; the exponent is then mandatory.
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
# Imaginary literals: digits or a float followed by 'j'/'J'.
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
# Alternatives are tried in order, so the forms that extend another form
# (imaginary, then float) are listed before plain integers.
Number = group(Imagnumber, Floatnumber, Intnumber)
Guido van Rossum4d8e8591992-01-01 19:34:47 +000062
# The four "tail end" patterns below match the remainder of a string literal
# up to and including its closing quote(s); they are applied after the
# opening quote has already been consumed.  A backslash always escapes the
# character that follows it.
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Opening of a triple-quoted string, with optional u/U and r/R prefixes.
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
Guido van Rossum4d8e8591992-01-01 19:34:47 +000075
# The alternatives of a regular expression are tried from left to right and
# the first one that matches wins (it is not the longest match that is
# chosen), so be sure to put the longest operators first (e.g., if = came
# before ==, == would get recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

# A character class matching any single bracket character: ] [ ( ) { }
Bracket = '[][(){}]'
# A newline (with optional preceding carriage return) or one of the
# punctuation characters : ; . , `
Special = group(r'\r?\n', r'[:;.,`]')
# Every non-alphanumeric token: operators, brackets and special characters.
Funny = group(Operator, Bracket, Special)
Guido van Rossum4d8e8591992-01-01 19:34:47 +000087
# A complete token preceded by anything ignorable.
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
# Each alternative ends either with the closing quote or with a
# backslash-newline, in which case the string continues on the next line.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
# Things that are not complete tokens but must still be recognized by the
# scanner: a line continuation, a comment, or the opening of a
# triple-quoted string.
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# The pattern generate_tokens() scans with.  Group 1 is the token text
# without the leading whitespace.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
Guido van Rossum1aec3231997-04-08 14:24:39 +000098
# Compiled forms of the patterns used while scanning.
tokenprog = re.compile(Token)
pseudoprog = re.compile(PseudoToken)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

# Maps the opening of a string literal to the compiled pattern that matches
# the rest of that string.  Triple-quoted openers are keyed with their full
# prefix.  The bare prefix letters map to None so that generate_tokens() can
# step over them with an "or" chain until it reaches the quote character.
endprogs = {
    # single-quoted strings, looked up by the quote character alone
    "'": re.compile(Single), '"': re.compile(Double),
    # prefix letters: skipped when looking for the quote character
    'r': None, 'R': None, 'u': None, 'U': None,
    # triple-quoted strings, with every spelling of the u/r prefixes
    "'''": single3prog, '"""': double3prog,
    "r'''": single3prog, 'r"""': double3prog,
    "R'''": single3prog, 'R"""': double3prog,
    "u'''": single3prog, 'u"""': double3prog,
    "U'''": single3prog, 'U"""': double3prog,
    "ur'''": single3prog, 'ur"""': double3prog,
    "uR'''": single3prog, 'uR"""': double3prog,
    "Ur'''": single3prog, 'Ur"""': double3prog,
    "UR'''": single3prog, 'UR"""': double3prog,
}
Guido van Rossum4d8e8591992-01-01 19:34:47 +0000112
# Lookup tables of every string opener (quote characters plus the optional
# u/U and r/R prefixes).  Each opener maps to itself; the tables are only
# used for membership tests.
triple_quoted = dict([(t, t) for t in (
    "'''", '"""',
    "r'''", 'r"""', "R'''", 'R"""',
    "u'''", 'u"""', "U'''", 'U"""',
    "ur'''", 'ur"""', "Ur'''", 'Ur"""',
    "uR'''", 'uR"""', "UR'''", 'UR"""')])
single_quoted = dict([(t, t) for t in (
    "'", '"',
    "r'", 'r"', "R'", 'R"',
    "u'", 'u"', "U'", 'U"',
    "ur'", 'ur"', "Ur'", 'Ur"',
    "uR'", 'uR"', "UR'", 'UR"')])
127
# Distance between tab stops: when generate_tokens() measures the
# indentation of a line, a tab advances the column to the next multiple of
# this value.
tabsize = 8
Fred Drake9b8d8012000-08-17 04:45:13 +0000129
class TokenError(Exception):
    """Raised by generate_tokens() when the input ends inside a multi-line
    string or a multi-line statement.

    The arguments are a message and the (row, column) position involved.
    """
131
class StopTokenizing(Exception):
    """Signal that tokenizing should stop early.

    tokenize() catches this exception and returns quietly, so a tokeneater
    callback can raise it to end tokenization.
    """
Fred Drake9b8d8012000-08-17 04:45:13 +0000133
def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    """Print one token in a readable form (the default tokeneater).

    The arguments are the five members of a token tuple: the token type,
    the token string, the (row, column) start and end positions, and the
    source line (unused here).  The output line has the form
    "srow,scol-erow,ecol:<TAB>TYPENAME<TAB>repr(token)".
    """
    # The positions are unpacked here rather than in the signature: tuple
    # parameters hide the real arity from introspection and are not
    # available in later versions of the language.
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))
Guido van Rossum4d8e8591992-01-01 19:34:47 +0000137
def tokenize(readline, tokeneater=printtoken):
    """
    Tokenize an input stream, handing every token to a callback.

    readline is a callable with the same interface as the readline() method
    of built-in file objects: each call returns the next line of input as a
    string.

    tokeneater is a callable that is invoked once per token with five
    arguments, the members of the tuples produced by generate_tokens().
    When it is omitted, each token is printed with printtoken().

    Tokenizing ends early, without an error, if StopTokenizing is raised
    while the tokens are being processed.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        # A deliberate early stop, not a failure.
        return
155
Tim Peters4efb6e92001-06-29 23:51:08 +0000156# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Call tokeneater once for every token generated from readline.

    Each 5-tuple produced by generate_tokens(readline) is passed to
    tokeneater as five separate arguments.  Unlike tokenize(), this
    function does not catch StopTokenizing.
    """
    for token_info in generate_tokens(readline):
        # Extended call syntax replaces the deprecated apply() builtin.
        tokeneater(*token_info)
160
def generate_tokens(readline):
    """
    Break a stream of Python source text into tokens.

    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as a string.  The end of the input is
    signalled by returning an empty string or by raising StopIteration.

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    Comments are reported as COMMENT tokens.  Newlines that do not end a
    statement (blank lines and newlines inside brackets) are reported as NL
    rather than NEWLINE.  A character that cannot start any token, and a
    single-quoted string that is neither closed nor continued, are reported
    as ERRORTOKEN.  After the last line, one DEDENT is produced for every
    indentation level still open, followed by ENDMARKER.

    Raises TokenError if the input ends inside a multi-line string or
    inside a multi-line statement.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    # contstr holds the text of a string literal that spans lines; needcont
    # is set when that string is single-quoted and so every line of it must
    # end with a backslash.  contline accumulates the source lines involved.
    contstr, needcont = '', 0
    contline = None
    indents = [0]                              # stack of open indent columns

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            # Iterator-style callables signal the end of input this way;
            # treat it as EOF so that the final tokens are still produced.
            line = ''
        lnum = lnum + 1
        pos, linelen = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # A single-quoted string was neither closed nor continued.
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < linelen:               # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                # Advance to the next tab stop; this needs integer division.
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == linelen: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents.pop()
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < linelen:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                # Group 1 is the token without its leading whitespace.
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # Inside brackets a newline does not end the statement.
                    if parenlev > 0:
                        newline_type = NL
                    else:
                        newline_type = NEWLINE
                    yield (newline_type, token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Prefix letters map to None in endprogs, so this
                        # picks the entry for the actual quote character.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000290
if __name__ == '__main__':                     # testing
    # Tokenize the file named on the command line, or standard input when
    # no file is given, printing every token with printtoken().
    import sys
    if len(sys.argv) > 1:
        source = open(sys.argv[1])
        try:
            tokenize(source.readline)
        finally:
            # Close the file even if tokenizing raises.
            source.close()
    else:
        tokenize(sys.stdin.readline)