blob: f7ff000d016a9a37fbbe58abc8f245067253954b [file] [log] [blame]
Guido van Rossumb51eaa11997-03-07 00:21:55 +00001"""Tokenization help for Python programs.
Guido van Rossum4d8e8591992-01-01 19:34:47 +00002
Tim Peters4efb6e92001-06-29 23:51:08 +00003generate_tokens(readline) is a generator that breaks a stream of
Guido van Rossum1aec3231997-04-08 14:24:39 +00004text into Python tokens. It accepts a readline-like method which is called
Tim Peters4efb6e92001-06-29 23:51:08 +00005repeatedly to get the next line of input (or "" for EOF). It generates
65-tuples with these members:
7
8 the token type (see token.py)
9 the token (a string)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
13
It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments, NL tokens for newlines that do
not end a logical line, and gives type OP for all operators.
17
18Older entry points
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21are the same, except instead of generating tokens, tokeneater is a callback
22function to which the 5 fields described above are passed as 5 arguments,
23each time a new token is found."""
Guido van Rossumb51eaa11997-03-07 00:21:55 +000024
Guido van Rossumb09f7ed2001-07-15 21:08:29 +000025from __future__ import generators
26
Ka-Ping Yee244c5932001-03-01 13:56:40 +000027__author__ = 'Ka-Ping Yee <ping@lfw.org>'
Ka-Ping Yee4f64c132001-03-01 17:11:17 +000028__credits__ = \
29 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
Guido van Rossumb51eaa11997-03-07 00:21:55 +000030
Guido van Rossum3b631771997-10-27 20:44:15 +000031import string, re
Guido van Rossumfc6f5331997-03-07 00:21:12 +000032from token import *
Guido van Rossum4d8e8591992-01-01 19:34:47 +000033
import token
# Public API: every public name of the token module plus the names this
# module adds on top.  generate_tokens is listed because it is the primary
# entry point described in the module docstring; without it a star-import
# would not expose the generator interface.
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "generate_tokens", "NL"]
del x       # drop the list comprehension's loop variable from the module
del token   # the module object was only needed to build __all__
38
# Two token types that exist only in this module.  They take the next two
# free numbers after the ones imported from token, get printable names in
# tok_name, and N_TOKENS is moved past them.
COMMENT, NL = N_TOKENS, N_TOKENS + 1
tok_name[COMMENT], tok_name[NL] = 'COMMENT', 'NL'
N_TOKENS = NL + 1
Guido van Rossum1aec3231997-04-08 14:24:39 +000044
def group(*choices):
    """Return a regex source string matching any one of *choices*.

    The alternatives are joined with '|' and wrapped in a capturing group.
    """
    return '(' + '|'.join(choices) + ')'

def any(*choices):
    """Return a regex matching zero or more repetitions of group(*choices)."""
    # Extended call syntax replaces the deprecated apply() builtin.
    return group(*choices) + '*'

def maybe(*choices):
    """Return a regex matching zero or one occurrence of group(*choices)."""
    return group(*choices) + '?'
Guido van Rossum4d8e8591992-01-01 19:34:47 +000048
# Building blocks shared by the token patterns below.  All of these are
# regex *source strings*; they are only compiled once fully assembled.
Whitespace = r'[ \f\t]*'        # blanks, form feeds and tabs (may be empty)
Comment = r'#[^\r\n]*'          # '#' up to, but not including, the line end
# Whitespace, any number of backslash-newline continuations (each followed
# by more whitespace), then an optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

# Numeric literals.  Each integer form accepts an optional l/L suffix.
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'    # '*': a bare '0x' also matches
Octnumber = r'0[0-7]*[lL]?'             # also matches a lone '0'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
# Order matters: the regex engine takes the first alternative that matches,
# so the forms that extend another form (imaginary, then float) come first.
Number = group(Imagnumber, Floatnumber, Intnumber)
Guido van Rossum4d8e8591992-01-01 19:34:47 +000064
# The four "tail end" patterns match from just after an opening quote (or
# from the start of a continuation line) through the closing quote.  In each,
# a backslash consumes the character after it, so an escaped quote does not
# end the string.

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.  A lone quote is allowed inside as long as it is
# not the start of the closing triple (the negative lookahead).
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Opening of a triple-quoted string, with optional u/U and r/R prefixes.
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
Guido van Rossum4d8e8591992-01-01 19:34:47 +000077
# The regex engine tries alternatives left to right and stops at the first
# one that matches (it does not look for the longest), so be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'                    # any one bracket character
Special = group(r'\r?\n', r'[:;.,`]')   # line end or single-char punctuation
Funny = group(Operator, Bracket, Special)

# A complete token: anything ignorable, then exactly one real token.
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
Guido van Rossum4d8e8591992-01-01 19:34:47 +000092
# First (or only) line of ' or " string.  It ends either at the closing
# quote or at a backslash-newline, which means the string continues on the
# next line.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
# Things that are not complete tokens by themselves: a line continuation, a
# comment, or just the opening of a triple-quoted string.
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# The pattern generate_tokens() scans with; group 1 is the token text
# without its leading whitespace.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
Guido van Rossum1aec3231997-04-08 14:24:39 +0000100
# Compile the assembled patterns once, at import time.
tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
# Maps the opening of a string to the compiled pattern that finds its end.
# The bare prefix letters map to None so that generate_tokens() can chain
# lookups with "or" over the first characters of a token until it reaches
# the quote character.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}

# Tab stop width used by generate_tokens() when it measures indentation.
tabsize = 8
Fred Drake9b8d8012000-08-17 04:45:13 +0000116
class TokenError(Exception):
    """Raised by generate_tokens() when the input ends inside a multi-line
    string or in the middle of a multi-line statement."""
118
class StopTokenizing(Exception):
    """Signal that tokenize() should stop early and return normally.

    tokenize() catches this exception; nothing in this module raises it, so
    it is presumably meant to be raised from a tokeneater callback.
    """
Fred Drake9b8d8012000-08-17 04:45:13 +0000120
def printtoken(type, token, start, end, line): # for testing
    """Print one token as 'srow,scol-erow,ecol:<TAB>TYPENAME<TAB>repr'.

    type is the numeric token type (looked up in tok_name), token is the
    token text, start and end are (row, column) pairs, and line is the
    source line, which is accepted but not printed.
    """
    # The position pairs are unpacked here rather than in the parameter
    # list; callers still pass the same five positional arguments.
    srow, scol = start
    erow, ecol = end
    # Parenthesising the single operand keeps this valid both as a print
    # statement and as a call to the print function.
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))
Guido van Rossum4d8e8591992-01-01 19:34:47 +0000124
def tokenize(readline, tokeneater=printtoken):
    """Tokenize the stream read through readline, passing each token to
    tokeneater.

    readline is a callable returning successive lines of input.  tokeneater
    receives the five token fields as five arguments and defaults to
    printtoken.  A StopTokenizing exception escaping from the loop ends
    tokenization without being propagated to the caller.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        # An early stop was requested; that is not an error.
        return
130
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Call tokeneater once for every token produced from readline.

    Each 5-tuple yielded by generate_tokens(readline) is spread into five
    positional arguments.  Exceptions, including StopTokenizing, are not
    handled here and propagate to the caller.
    """
    for token_info in generate_tokens(readline):
        # Extended call syntax replaces the deprecated apply() builtin.
        tokeneater(*token_info)
135
def generate_tokens(readline):
    """Yield the tokens of a Python source stream as 5-tuples.

    readline is called with no arguments and must return the next line of
    input on each call, or '' at end of input.  Every item produced is

        (type, string, (srow, scol), (erow, ecol), line)

    that is: the token type, the token text, the (row, column) where the
    token starts, the (row, column) where it ends, and the source line(s)
    it was found on.  Rows count from 1, columns from 0.  The stream ends
    with one DEDENT for every indentation level still open, followed by
    ENDMARKER.

    Raises TokenError if the input ends inside a multi-line string or in
    the middle of a statement continued by an open bracket or a backslash.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, linelen = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # A one-quote string was continued with a backslash, but this
                # line neither closes it nor continues it again.
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                # needcont must be cleared together with contstr: the
                # triple-quoted path below never sets it, so a stale 1 would
                # make the inner lines of a later triple-quoted string be
                # reported as errors.
                contstr, needcont = '', 0
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < linelen:               # measure leading whitespace
                if line[pos] == ' ':
                    column = column + 1
                elif line[pos] == '\t':
                    # Advance to the next tab stop; // keeps this an integer
                    # whatever the meaning of / is.
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos = pos + 1
            if pos == linelen: break           # only whitespace, no newline

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents.pop()
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < linelen:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    if parenlev > 0:           # newline inside brackets
                        newline_type = NL
                    else:                      # end of a logical line
                        newline_type = NEWLINE
                    yield (newline_type, token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in ("'''", '"""',               # triple-quoted
                               "r'''", 'r"""', "R'''", 'R"""',
                               "u'''", 'u"""', "U'''", 'U"""',
                               "ur'''", 'ur"""', "Ur'''", 'Ur"""',
                               "uR'''", 'uR"""', "UR'''", 'UR"""'):
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in ("'", '"') or \
                    token[:2] in ("r'", 'r"', "R'", 'R"',
                                  "u'", 'u"', "U'", 'U"') or \
                    token[:3] in ("ur'", 'ur"', "Ur'", 'Ur"',
                                  "uR'", 'uR"', "UR'", 'UR"' ):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Prefix letters map to None in endprogs, so this
                        # walks past them to the entry for the quote itself.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                # Nothing matches here: report the single character as an
                # error and resume scanning right after it.
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000258
if __name__ == '__main__':                     # testing
    # Print the tokens of the file named by the first command-line argument,
    # or of standard input when no argument is given.
    import sys
    if len(sys.argv) > 1:
        source = open(sys.argv[1])
        try:
            tokenize(source.readline)
        finally:
            # Close the file even if tokenizing raises.
            source.close()
    else:
        tokenize(sys.stdin.readline)