blob: b3ee4a85c30b0801311f79b86305d8d3a2065993 [file] [log] [blame]
"""Tokenization help for Python programs.

This module exports a function called 'tokenize()' that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF) and a "token-eater"
function which is called once for each token found. The latter function is
passed the token type, a string containing the token, the starting and
ending (row, column) coordinates of the token, and the original line. It is
designed to match the working of the Python tokenizer exactly, except that
it produces COMMENT tokens for comments and gives type OP for all operators."""
Guido van Rossumb51eaa11997-03-07 00:21:55 +000011
# Module metadata: primary author and additional contributors.
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro')
Guido van Rossumb51eaa11997-03-07 00:21:55 +000015
Guido van Rossum3b631771997-10-27 20:44:15 +000016import string, re
Guido van Rossumfc6f5331997-03-07 00:21:12 +000017from token import *
Guido van Rossum4d8e8591992-01-01 19:34:47 +000018
import token
# Public API: every public name of the token module (re-exported through
# "from token import *") plus the names this module defines itself.
# generate_tokens is listed so that "from tokenize import *" also exposes
# the generator interface, not only the callback-based tokenize().
__all__ = [x for x in dir(token) if x[0] != '_'] + \
          ["COMMENT", "tokenize", "generate_tokens", "NL"]
del token
22
# COMMENT and NL are extra token types added by this module.  They are
# numbered directly after the types known to the token module, and N_TOKENS
# is advanced so that it stays one past the highest token type.
COMMENT, NL = N_TOKENS, N_TOKENS + 1
tok_name[COMMENT] = 'COMMENT'
tok_name[NL] = 'NL'
N_TOKENS += 2
Guido van Rossum1aec3231997-04-08 14:24:39 +000028
def group(*choices):
    """Return a regex source string: a group matching any one of choices."""
    return '(' + '|'.join(choices) + ')'

def any(*choices):
    """Return a regex source string matching group(*choices) zero or more times."""
    # Extended call syntax replaces the deprecated apply() builtin.
    return group(*choices) + '*'

def maybe(*choices):
    """Return a regex source string matching group(*choices) at most once."""
    return group(*choices) + '?'
Guido van Rossum4d8e8591992-01-01 19:34:47 +000032
# Regular-expression source strings for the lexical elements.  They are
# combined with group()/any()/maybe() and compiled once further down.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
# The integer part of an exponent float or an imaginary literal is any run
# of digits (leading zeros included), so that e.g. 0e0 and 00j are single
# NUMBER tokens instead of being split into a number followed by a name.
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))

# Maps a string opener to the compiled pattern that matches the rest of
# that string.  The bare prefix letters map to None so that a lookup chain
# such as endprogs[a] or endprogs[b] or endprogs[c] skips over prefixes
# and lands on the quote character.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}

# Width of a tab stop when measuring indentation.
tabsize = 8
Fred Drake9b8d8012000-08-17 04:45:13 +000099
class TokenError(Exception):
    """Raised when the input ends inside a multi-line string or statement."""
101
class StopTokenizing(Exception):
    """Caught by tokenize(); raising it during tokenizing ends the run quietly."""
Fred Drake9b8d8012000-08-17 04:45:13 +0000103
def printtoken(type, token, start, end, line): # for testing
    """Print one token as 'srow,scol-erow,ecol:<TAB>TYPENAME<TAB>repr(token)'.

    type is the numeric token type (looked up in tok_name), token is the
    token text, and start and end are (row, column) pairs.  line is accepted
    so the function can be used as a token-eater, but it is not printed.
    """
    # The pairs are unpacked here rather than in the signature, and print is
    # written as a call on a single parenthesised value; both forms behave
    # the same as before and are also valid on interpreters that dropped
    # tuple parameters and the print statement.
    srow, scol = start
    erow, ecol = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))
Guido van Rossum4d8e8591992-01-01 19:34:47 +0000107
def tokenize(readline, tokeneater=printtoken):
    """Tokenize the input read through readline, passing tokens to tokeneater.

    The work is delegated to tokenize_loop().  If StopTokenizing is raised
    while it runs, the exception is swallowed and tokenize() simply
    returns; any other exception propagates to the caller.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        return
113
# backwards compatible interface, probably not used
def tokenize_loop(readline, tokeneater):
    """Call tokeneater once for every token produced by generate_tokens().

    Each token tuple is spread into separate positional arguments, so
    tokeneater receives (type, token, start, end, line).
    """
    for token_info in generate_tokens(readline):
        # Extended call syntax replaces the deprecated apply() builtin.
        tokeneater(*token_info)
118
def generate_tokens(readline):
    """Generate the tokens of a Python source stream as 5-tuples.

    readline is called with no arguments each time another line is needed
    and must return that line, or an empty string at end of input.

    Every yielded tuple has the form

        (type, string, (srow, scol), (erow, ecol), line)

    giving the token type, the token text, the (row, column) positions at
    which the token starts and ends, and the source line it was found on.
    For a string spanning several lines, line holds all of those lines
    joined together.  After the last line, one DEDENT is yielded for each
    indentation level still open, followed by ENDMARKER.

    Raises TokenError if the input ends inside a multi-line string or in
    the middle of a continued statement.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', string.digits
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, linelen = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                # needcont describes the string being abandoned here; clear
                # it so it cannot leak into a later multi-line string.
                contstr, needcont = '', 0
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < linelen:               # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                # advance to the next tab stop (floor division on purpose)
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == linelen: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < linelen:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # a line break inside brackets does not end the statement
                    yield (parenlev > 0 and NL or NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in ("'''", '"""',               # triple-quoted
                               "r'''", 'r"""', "R'''", 'R"""',
                               "u'''", 'u"""', "U'''", 'U"""',
                               "ur'''", 'ur"""', "Ur'''", 'Ur"""',
                               "uR'''", 'uR"""', "UR'''", 'UR"""'):
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        # triple-quoted strings never need a trailing
                        # backslash to continue onto the next line
                        contstr, needcont = line[start:], 0
                        contline = line
                        break
                elif initial in ("'", '"') or \
                    token[:2] in ("r'", 'r"', "R'", 'R"',
                                  "u'", 'u"', "U'", 'U"') or \
                    token[:3] in ("ur'", 'ur"', "Ur'", 'Ur"',
                                  "uR'", 'uR"', "UR'", 'UR"' ):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # prefix letters map to None in endprogs, so this
                        # chain selects the pattern for the quote character
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                # nothing matched here: report one character and move on
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000241
if __name__ == '__main__':                     # testing
    # Print the tokens of the file named on the command line, or of
    # standard input when no file name is given.
    import sys
    if len(sys.argv) > 1:
        source = open(sys.argv[1])
        try:
            tokenize(source.readline)
        finally:
            # close the file even if tokenizing raises
            source.close()
    else: tokenize(sys.stdin.readline)