blob: b79cdc001630231f11dc386731562cbc254ebe5b [file] [log] [blame]
"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
Guido van Rossumb51eaa11997-03-07 00:21:55 +000024
# Module metadata.
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = (
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro')
Guido van Rossumb51eaa11997-03-07 00:21:55 +000028
Guido van Rossum3b631771997-10-27 20:44:15 +000029import string, re
Guido van Rossumfc6f5331997-03-07 00:21:12 +000030from token import *
Guido van Rossum4d8e8591992-01-01 19:34:47 +000031
import token
# Public API: every public name of the token module, plus the names this
# module adds.  generate_tokens is the primary entry point, so it has to be
# listed here or "from tokenize import *" would not provide it.
__all__ = [x for x in dir(token) if x[0] != '_'] + [
    "COMMENT", "tokenize", "generate_tokens", "NL"]
del token
35
# Two token types exist only in this module.  They are numbered just past
# the last type defined by token.py, and N_TOKENS is moved up to cover them.
COMMENT, NL = N_TOKENS, N_TOKENS + 1
tok_name[COMMENT] = 'COMMENT'
tok_name[NL] = 'NL'
N_TOKENS = NL + 1
Guido van Rossum1aec3231997-04-08 14:24:39 +000041
# Small helpers for assembling the regular expressions below.  Each takes
# any number of pattern strings and returns a new pattern string.

def group(*choices):
    """Return a parenthesized pattern matching any one of the choices."""
    return '(' + '|'.join(choices) + ')'

def any(*choices):
    """Return a pattern matching zero or more repetitions of the choices.

    The name is part of this module's existing interface, so it is kept
    even though it hides any builtin of the same name inside this module.
    """
    return group(*choices) + '*'

def maybe(*choices):
    """Return a pattern matching zero or one occurrence of the choices."""
    return group(*choices) + '?'
Guido van Rossum4d8e8591992-01-01 19:34:47 +000045
# Basic lexical pieces.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Name = r'[a-zA-Z_]\w*'
# Everything that may be skipped in front of a token: blanks, any number of
# backslash-newline continuations (each followed by more blanks), and an
# optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Guido van Rossum4d8e8591992-01-01 19:34:47 +000050
# Integer literals; an optional trailing l/L marks a long integer.
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)

# Floating point literals.  The integer part in front of an exponent may be
# any run of digits, leading zeros included, so that literals such as 0e0
# are recognized as one number instead of the integer 0 followed by a name.
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)

# Imaginary literals: any run of digits (again, leading zeros allowed) or
# any float, followed by j/J.
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')

# Alternatives are tried in order, so the forms that extend another form
# (imaginary before float before integer) have to come first.
Number = group(Imagnumber, Floatnumber, Intnumber)
Guido van Rossum4d8e8591992-01-01 19:34:47 +000061
# The four patterns below match the *remainder* of a string literal, from
# just after the opening quote(s) up to and including the closing quote(s).

# Rest of a '-quoted string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Rest of a "-quoted string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Rest of a '''-quoted string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Rest of a """-quoted string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'

# Opening of a triple-quoted string, with optional u/U and r/R prefixes.
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# A complete ' or " string that fits on one line.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
Guido van Rossum4d8e8591992-01-01 19:34:47 +000074
# The regex engine tries the alternatives of a group from left to right and
# takes the first one that matches, so an operator must be listed before
# any operator that is a prefix of it (if = came before ==, then == would
# be recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`]')
Funny = group(Operator, Bracket, Special)

# A complete token, optionally preceded by ignorable text.
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# Opening line of a ' or " string.  It ends either with the closing quote
# (the whole string is on this line) or with a backslash-newline (the
# string goes on to the next line).
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
# Things the line scanner has to see that are not complete tokens: a line
# continuation, a comment, or the opening of a triple-quoted string.
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
Guido van Rossum1aec3231997-04-08 14:24:39 +000096
# Compiled forms of the patterns above.
tokenprog = re.compile(Token)
pseudoprog = re.compile(PseudoToken)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

# Maps the opening of a string literal to the compiled pattern that finds
# its end.  The bare prefix letters map to None so that a lookup chain over
# the first characters of a prefixed string falls through to its quote.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            'r': None, 'R': None, 'u': None, 'U': None}
for _prefix in ('', 'r', 'R', 'u', 'U', 'ur', 'uR', 'Ur', 'UR'):
    endprogs[_prefix + "'''"] = single3prog
    endprogs[_prefix + '"""'] = double3prog
del _prefix

# Distance between tab stops used when measuring indentation.
tabsize = 8
Fred Drake9b8d8012000-08-17 04:45:13 +0000112
class TokenError(Exception):
    """Raised by generate_tokens() when the input ends in the middle of a
    multi-line string or a multi-line statement."""
114
class StopTokenizing(Exception):
    """Caught by tokenize() to end tokenizing quietly.

    Nothing in this module raises it; presumably it is meant to be raised
    by a tokeneater callback that wants to stop early.
    """
Fred Drake9b8d8012000-08-17 04:45:13 +0000116
def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    """Print a one-line description of a token to standard output.

    The arguments are the five fields of a token: its type, its text, its
    start position as a (row, column) pair, its end position as a
    (row, column) pair, and the source line (which is not printed).  The
    output has the form "srow,scol-erow,ecol:", the name of the token type
    and the repr of the token text, separated by tabs.
    """
    # The positions are unpacked here rather than in the signature; callers
    # pass the same five positional arguments either way.
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))
Guido van Rossum4d8e8591992-01-01 19:34:47 +0000120
def tokenize(readline, tokeneater=printtoken):
    """Tokenize the input read through readline, passing each token found
    to tokeneater as five arguments.

    A StopTokenizing exception raised during the run ends it quietly; any
    other exception propagates to the caller.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        return
126
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Call tokeneater once for every token generate_tokens(readline)
    produces, passing the five fields of the token as five arguments.

    Exceptions raised by the generator or by tokeneater are not caught.
    """
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
131
def generate_tokens(readline):
    """Generate the tokens of the source text obtained through readline.

    readline is called without arguments every time another line is needed
    and must return that line as a string, or an empty string at the end of
    the input.  Every item generated is a 5-tuple

        (type, string, (srow, scol), (erow, ecol), line)

    holding the token type, the token text, its start and end positions as
    (row, column) pairs, and the source line(s) the token was found on.

    Besides NUMBER, STRING, NAME, NEWLINE, INDENT, DEDENT and ENDMARKER the
    generator produces COMMENT for comments, NL for line breaks that do not
    end a statement (blank or comment-only lines, and line breaks inside
    brackets), OP for every operator and delimiter, and ERRORTOKEN for a
    character no pattern matches or for a broken-off string.

    Raises TokenError if the input ends inside a multi-line string or a
    multi-line statement.
    """
    lnum = parenlev = continued = 0
    # ascii_letters, not the locale-dependent letters: the Name pattern only
    # accepts [a-zA-Z_] as the first character of a name.
    namechars, numchars = string.ascii_letters + '_', string.digits
    # While a string literal spans several lines, contstr holds its text so
    # far and contline the source lines so far; needcont is set when the
    # string is a ' or " string that needs a backslash at every line end.
    # strstart and endprog are assigned whenever contstr is filled.
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, linelen = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                # The string ends on this line; the rest of the line is
                # scanned for tokens below.
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # A ' or " string was neither closed nor continued.
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                # needcont is cleared together with contstr; if it stayed
                # set, a later multi-line triple-quoted string (which never
                # sets it) would wrongly be reported as an error too.
                contstr, needcont = '', 0
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < linelen:               # measure leading whitespace
                if line[pos] == ' ':
                    column = column + 1
                elif line[pos] == '\t':        # advance to the next tab stop
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos = pos + 1
            if pos == linelen: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    linetype = COMMENT
                else:
                    linetype = NL
                yield (linetype, line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < linelen:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                # Group 1 is the token itself, without leading whitespace.
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # Inside brackets a line break does not end the
                    # statement.
                    if parenlev > 0:
                        yield (NL, token, spos, epos, line)
                    else:
                        yield (NEWLINE, token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in ("'''", '"""',               # triple-quoted
                               "r'''", 'r"""', "R'''", 'R"""',
                               "u'''", 'u"""', "U'''", 'U"""',
                               "ur'''", 'ur"""', "Ur'''", 'Ur"""',
                               "uR'''", 'uR"""', "UR'''", 'UR"""'):
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in ("'", '"') or \
                    token[:2] in ("r'", 'r"', "R'", 'R"',
                                  "u'", 'u"', "U'", 'U"') or \
                    token[:3] in ("ur'", 'ur"', "Ur'", 'Ur"',
                                  "uR'", 'uR"', "UR'", 'UR"'):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Prefix letters map to None in endprogs, so this
                        # picks the entry for the quote character that
                        # follows the optional one or two prefix letters.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                # No pattern matches here: report one character and go on.
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000254
if __name__ == '__main__':                     # testing
    # Print the tokens of the file named by the first command-line
    # argument, or of standard input when no argument is given.
    import sys
    if len(sys.argv) > 1:
        source = open(sys.argv[1])
        try:
            tokenize(source.readline)
        finally:
            # Close the file even if tokenizing raises.
            source.close()
    else:
        tokenize(sys.stdin.readline)