blob: 22f28c44269e6715846f69c2c4bb57b46fbac1a3 [file] [log] [blame]
"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments, NL tokens for blank lines and
for newlines inside brackets, and gives type OP for all operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
Guido van Rossumb51eaa11997-03-07 00:21:55 +000024
Ka-Ping Yee244c5932001-03-01 13:56:40 +000025__author__ = 'Ka-Ping Yee <ping@lfw.org>'
Ka-Ping Yee4f64c132001-03-01 17:11:17 +000026__credits__ = \
27 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
Guido van Rossumb51eaa11997-03-07 00:21:55 +000028
Guido van Rossum3b631771997-10-27 20:44:15 +000029import string, re
Guido van Rossumfc6f5331997-03-07 00:21:12 +000030from token import *
Guido van Rossum4d8e8591992-01-01 19:34:47 +000031
import token
# Public API: every public name of token.py plus the names defined in this
# module.  generate_tokens() is the primary entry point, so it is exported
# alongside the older tokenize() callback interface.
# list(filter(...)) is used instead of a list comprehension so that no loop
# variable is left behind in the module namespace.
__all__ = list(filter(lambda name: name[0] != '_', dir(token))) + \
          ["COMMENT", "tokenize", "generate_tokens", "NL"]
del token
36
# Two extra token types that exist only in this module are numbered right
# after the last type defined by token.py and registered in tok_name so they
# can be printed by name.  N_TOKENS is bumped to account for them.
COMMENT, NL = N_TOKENS, N_TOKENS + 1
tok_name[COMMENT] = 'COMMENT'
tok_name[NL] = 'NL'
N_TOKENS = N_TOKENS + 2
Guido van Rossum1aec3231997-04-08 14:24:39 +000042
def group(*choices):
    """Return a regex group that matches any one of the given patterns."""
    return '(' + '|'.join(choices) + ')'

def any(*choices):
    """Return a regex matching zero or more repetitions of the alternatives.

    The name is kept for compatibility with existing users of this module.
    """
    # Extended call syntax replaces the deprecated apply() builtin.
    return group(*choices) + '*'

def maybe(*choices):
    """Return a regex matching the alternatives zero or one time."""
    return group(*choices) + '?'
Guido van Rossum4d8e8591992-01-01 19:34:47 +000046
# Regular-expression source strings for the lexical elements.  They are
# combined below into Token and PseudoToken.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

# A hex literal needs at least one digit after the 0x prefix; with '*' a
# bare "0x" was accepted as a complete number.
Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
# Alternatives are tried left to right, so the longer forms (imaginary,
# float) are listed before plain integers.
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Opening of a triple-quoted string, with optional u/U and r/R prefixes.
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
# Things PseudoToken recognizes in addition to complete tokens: a
# backslash-newline continuation, a comment, or a triple-quote opener.
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
Guido van Rossum1aec3231997-04-08 14:24:39 +000098
# Compiled forms of the patterns above.
tokenprog = re.compile(Token)
pseudoprog = re.compile(PseudoToken)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

# Maps a string opener to the compiled pattern that matches the rest of the
# string.  The bare prefix letters map to None so that a lookup on a prefix
# character falls through to the quote character that follows it.
endprogs = {"'": re.compile(Single), '"': re.compile(Double)}
for prefix in ('', 'r', 'u', 'ur', 'R', 'U', 'uR', 'Ur', 'UR'):
    endprogs[prefix + "'''"] = single3prog
    endprogs[prefix + '"""'] = double3prog
for prefix in ('r', 'R', 'u', 'U'):
    endprogs[prefix] = None
del prefix
Guido van Rossum4d8e8591992-01-01 19:34:47 +0000112
# Width of a tab stop, used when measuring the indentation of a line.
tabsize = 8

class TokenError(Exception):
    """Raised by generate_tokens() when the input ends inside a multi-line
    string or a multi-line statement."""

class StopTokenizing(Exception):
    """Caught and swallowed by tokenize(); raising it while tokens are being
    delivered ends the tokenize() call early."""
Fred Drake9b8d8012000-08-17 04:45:13 +0000118
def printtoken(type, token, start, end, line): # for testing
    """Print one token in the form "srow,scol-erow,ecol:<TAB>TYPE<TAB>repr".

    type  -- token type number, looked up in tok_name for display
    token -- the token string
    start -- (row, column) pair where the token begins
    end   -- (row, column) pair where the token ends
    line  -- the source line containing the token (not printed)

    The positions are unpacked in the body rather than in the parameter
    list, and print is written in call form, so that the function is valid
    on interpreters without tuple parameters or the print statement.  The
    positional call contract is unchanged.
    """
    srow, scol = start
    erow, ecol = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))
Guido van Rossum4d8e8591992-01-01 19:34:47 +0000122
def tokenize(readline, tokeneater=printtoken):
    """
    Tokenize the input obtained from readline, handing each token to
    tokeneater.

    readline must be a callable object which provides the same interface as
    the readline() method of built-in file objects: each call should return
    one line of input as a string.

    tokeneater must also be a callable object.  It is called once for each
    token, with five arguments, corresponding to the tuples generated by
    generate_tokens().  It defaults to printtoken.

    A StopTokenizing exception raised while the tokens are being processed
    ends the call quietly instead of propagating to the caller.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        # Deliberate early exit; nothing to report.
        return
140
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Call tokeneater once per token produced by generate_tokens(readline).

    Each 5-tuple is spread into five positional arguments.  Exceptions
    raised by tokeneater or by the tokenizer propagate to the caller.
    """
    for token_info in generate_tokens(readline):
        # Extended call syntax replaces the deprecated apply() builtin.
        tokeneater(*token_info)
145
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as a string; an empty string signals
    the end of the input.

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    Raises TokenError if the input ends inside a multi-line string or
    inside a multi-line (bracketed or backslash-continued) statement.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    # contstr holds the text of a string that spans lines; needcont is set
    # when that string is a single-quoted one continued with a backslash.
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, linelen = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # A backslash-continued string whose next line neither
                # closes it nor continues it again is an error.
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                # Reset needcont too, so the flag cannot leak into a later
                # multi-line string.
                contstr, needcont = '', 0
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < linelen:               # measure leading whitespace
                if line[pos] == ' ':
                    column = column + 1
                elif line[pos] == '\t':
                    # Advance to the next tab stop; floor division keeps
                    # the column an integer.
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos = pos + 1
            if pos == linelen:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    toktype = COMMENT
                else:
                    toktype = NL
                yield (toktype, line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents.pop()
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < linelen:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # A newline inside brackets does not end the statement.
                    if parenlev > 0:
                        yield (NL, token, spos, epos, line)
                    else:
                        yield (NEWLINE, token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in ("'''", '"""',               # triple-quoted
                               "r'''", 'r"""', "R'''", 'R"""',
                               "u'''", 'u"""', "U'''", 'U"""',
                               "ur'''", 'ur"""', "Ur'''", 'Ur"""',
                               "uR'''", 'uR"""', "UR'''", 'UR"""'):
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        # Triple-quoted strings never need a trailing
                        # backslash, so clear needcont explicitly.
                        contstr, needcont = line[start:], 0
                        contline = line
                        break
                elif initial in ("'", '"') or \
                    token[:2] in ("r'", 'r"', "R'", 'R"',
                                  "u'", 'u"', "U'", 'U"') or \
                    token[:3] in ("ur'", 'ur"', "Ur'", 'Ur"',
                                  "uR'", 'uR"', "UR'", 'UR"' ):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Prefix letters map to None in endprogs, so this
                        # picks the entry for the first quote character.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev = parenlev + 1
                    elif initial in ')]}':
                        parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                # Nothing recognizable here: report one character and go on.
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000281
if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1:
        # Tokenize the named file and make sure it is closed afterwards,
        # even if tokenizing raises.
        source = open(sys.argv[1])
        try:
            tokenize(source.readline)
        finally:
            source.close()
    else:
        tokenize(sys.stdin.readline)