# blob: b952b365c2a856142157fb9caef38608641b8248
"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

# Must stay ahead of every other statement: generate_tokens() below uses
# `yield`, which this import enables on interpreters that still need it.
from __future__ import generators

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import re
import string
from token import *

# Re-export every public name of the token module plus the names added here.
# generate_tokens is listed because it is the primary entry point described
# in the module docstring.
import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
                                                    "generate_tokens", "NL"]
del token

# Two token types that token.py does not define.  They take the next two free
# numbers; tok_name (the dict imported from token) gets their printable names
# and N_TOKENS is advanced past them.
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices):
    """Return a regex source string matching any one of *choices*.

    The alternatives are joined with '|' and wrapped in a capturing group.
    """
    return '(' + '|'.join(choices) + ')'

def any(*choices):
    """Return a regex source string matching zero or more of *choices*."""
    return group(*choices) + '*'

def maybe(*choices):
    """Return a regex source string matching zero or one of *choices*."""
    return group(*choices) + '?'

# Regex source fragments for the parts of a line that carry no token.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
# Leading blanks, any number of backslash-newline continuations (each followed
# by more blanks), then an optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

# Regex source fragments for numeric literals.
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
# Any run of digits may precede an exponent or an imaginary suffix, including
# runs that start with 0 (e.g. 0e0, 00j).  Requiring [1-9] first would make
# such literals fall through to Octnumber and be split into two tokens.
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
# Order matters: the longer forms must be tried before plain integers.
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Opening of a triple-quoted string, with optional u/U and r/R prefixes.
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`]')
Funny = group(Operator, Bracket, Special)

# A complete token, optionally preceded by ignorable text.
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
# Pieces that are not finished tokens by themselves: a backslash-newline
# continuation, a comment, or the opening of a triple-quoted string.
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# Pattern used by generate_tokens(); group 1 spans the matched item without
# its leading whitespace.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
# Maps the opening of a string (quote characters, optionally with a prefix)
# to the compiled pattern that matches the remainder of that string.  The
# bare prefix letters map to None so that generate_tokens() can chain lookups
# with `or` until it reaches the actual quote character.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}

# Width of a tab stop, used by generate_tokens() when measuring indentation.
tabsize = 8

class TokenError(Exception):
    """Raised by generate_tokens() when input ends inside a multi-line
    string or a multi-line statement.

    It is raised with two arguments: a message and a (row, column) position.
    """

class StopTokenizing(Exception):
    """Caught and discarded by tokenize(); a tokeneater callback can raise it
    to end tokenizing early."""

def printtoken(type, token, start, end, line): # for testing
    """Print one token as 'srow,scol-erow,ecol:', the token type name and
    the repr of the token text, separated by tabs.

    type  -- token type number, looked up in tok_name
    token -- the token text
    start -- (row, column) pair where the token starts
    end   -- (row, column) pair where the token ends
    line  -- the source line; accepted for the callback signature, not printed
    """
    # The pairs are unpacked here rather than in the parameter list, which
    # keeps the positional call shape while avoiding tuple parameters.
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """Tokenize the input read through *readline*, passing each token to
    *tokeneater*.

    readline   -- callable returning the next line of input
    tokeneater -- callback invoked with the five token fields as arguments;
                  defaults to printtoken

    A StopTokenizing exception escaping from the loop ends tokenizing
    quietly; every other exception propagates to the caller.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Call *tokeneater* once per token produced by generate_tokens(readline).

    Each 5-tuple is spread into five positional arguments.  Exceptions raised
    by the callback, StopTokenizing included, are not caught here.
    """
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

def generate_tokens(readline):
    """Generate the tokens of the text obtained by calling *readline*.

    readline -- callable taking no arguments and returning the next line of
                input, or an empty string once the input is exhausted

    Each item yielded is a 5-tuple:
        (token type, token string, (start row, start column),
         (end row, end column), source line)

    Raises TokenError when the input ends inside a multi-line string or a
    multi-line statement.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, linelen = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                # The string ends on this line; the rest of the line is
                # scanned for tokens by the loop further down.
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # A single-quoted string was continued with a backslash but
                # this line neither closes it nor continues it again.
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < linelen:               # measure leading whitespace
                if line[pos] == ' ':
                    column = column + 1
                elif line[pos] == '\t':
                    # Advance to the next tab stop; floor division keeps the
                    # column an integer whatever the division semantics.
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos = pos + 1
            if pos == linelen: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    blank_type = COMMENT
                else:
                    blank_type = NL
                yield (blank_type, line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                del indents[-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < linelen:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # Inside brackets a line break does not end the statement.
                    if parenlev > 0:
                        newline_type = NL
                    else:
                        newline_type = NEWLINE
                    yield (newline_type, token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in ("'''", '"""',               # triple-quoted
                               "r'''", 'r"""', "R'''", 'R"""',
                               "u'''", 'u"""', "U'''", 'U"""',
                               "ur'''", 'ur"""', "Ur'''", 'Ur"""',
                               "uR'''", 'uR"""', "UR'''", 'UR"""'):
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in ("'", '"') or \
                    token[:2] in ("r'", 'r"', "R'", 'R"',
                                  "u'", 'u"', "U'", 'U"') or \
                    token[:3] in ("ur'", 'ur"', "Ur'", 'Ur"',
                                  "uR'", 'uR"', "UR'", 'UR"' ):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Prefix letters map to None in endprogs, so the
                        # chain stops at the first real quote character.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                # Nothing matches here: report the single character and
                # move past it.
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    # Print the tokens of the file named by the first argument, or of
    # standard input when no argument is given.
    import sys
    if len(sys.argv) > 1:
        source = open(sys.argv[1])
        try:
            tokenize(source.readline)
        finally:
            # Close the file even if tokenizing raises.
            source.close()
    else:
        tokenize(sys.stdin.readline)