blob: 33c5e41d7ae40d65ced3c9b3e0e2cb4eb4dfa20d [file] [log] [blame]
"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments, NL tokens for blank lines and
for newlines inside brackets, and gives type OP for all operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
Guido van Rossumb51eaa11997-03-07 00:21:55 +000024
Guido van Rossumb09f7ed2001-07-15 21:08:29 +000025from __future__ import generators
26
# Module metadata: the author and the people credited with contributions.
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
Guido van Rossumb51eaa11997-03-07 00:21:55 +000030
Guido van Rossum3b631771997-10-27 20:44:15 +000031import string, re
Guido van Rossumfc6f5331997-03-07 00:21:12 +000032from token import *
Guido van Rossum4d8e8591992-01-01 19:34:47 +000033
import token
# Public API: every public name of the token module (already pulled in with
# "from token import *") plus the names this module adds.  generate_tokens is
# the primary entry point described in the module docstring, so it has to be
# listed here for "from tokenize import *" to export it.
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
                                                    "generate_tokens", "NL"]
del token
37
# Extra token types used by this module.  They are numbered directly after
# the token types imported from token, registered in tok_name so they can be
# printed by name, and N_TOKENS is advanced past them.
COMMENT, NL = N_TOKENS, N_TOKENS + 1
tok_name.update({COMMENT: 'COMMENT', NL: 'NL'})
N_TOKENS = N_TOKENS + 2
Guido van Rossum1aec3231997-04-08 14:24:39 +000043
def group(*choices):
    """Return a regex group that matches any one of the given patterns."""
    return '(' + '|'.join(choices) + ')'

def any(*choices):
    """Return a regex matching zero or more repetitions of the choices."""
    # Extended call syntax replaces the older apply(group, choices) spelling.
    return group(*choices) + '*'

def maybe(*choices):
    """Return a regex matching one of the choices, or nothing at all."""
    return group(*choices) + '?'
Guido van Rossum4d8e8591992-01-01 19:34:47 +000047
# Horizontal whitespace only: spaces, form feeds and tabs (never a newline).
Whitespace = r'[ \f\t]*'
# A comment runs from '#' up to, but not including, the end of the line.
Comment = r'#[^\r\n]*'
# Text that may precede a token: whitespace, any number of backslash-newline
# continuations (each followed by more whitespace), and an optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
# An identifier: a letter or underscore followed by word characters.
Name = r'[a-zA-Z_]\w*'
Guido van Rossum4d8e8591992-01-01 19:34:47 +000052
# Integer literals, each with an optional long-integer suffix (l or L).
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
# Floating point literals: either a decimal point with an optional exponent,
# or a digit string with a mandatory exponent.
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
# The integer part of an exponent float is any digit string (leading zeros
# included), so 0e1 and 09e1 must match here as single NUMBER tokens rather
# than being split into an integer followed by a name.
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
# An imaginary literal is a float or any digit string followed by j or J;
# this also accepts forms with leading zeros such as 00j.
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
# Alternatives are tried left to right, so the longer forms come first.
Number = group(Imagnumber, Floatnumber, Intnumber)
Guido van Rossum4d8e8591992-01-01 19:34:47 +000063
# The "tail end" patterns below match the remainder of a string literal up
# to and including its closing quote(s).  They are applied just after the
# opening quotes, or at the start of a continuation line.

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Opening quotes of a triple-quoted string, with optional u/U and r/R prefix.
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
Guido van Rossum4d8e8591992-01-01 19:34:47 +000076
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).  For the same reason the floor division
# operators // and //= are listed ahead of the single-character class, which
# would otherwise recognize // as two instances of /.
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")
Thomas Wouterse1519a12000-08-24 21:44:52 +000083
# Any single bracket character: ] [ ( ) { }
Bracket = '[][(){}]'
# A newline (with optional carriage return) or a punctuation character.
Special = group(r'\r?\n', r'[:;.,`]')
# Every token that is not a number, string or name.
Funny = group(Operator, Bracket, Special)

# One complete token, optionally preceded by ignorable text.
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
# The string either closes on this line or the line ends with a
# backslash-newline, meaning the string continues on the next line.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
# Items matched only as "pseudo" tokens: a backslash continuation, a comment,
# or the opening quotes of a triple-quoted string.
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# The pattern generate_tokens() scans with.  Group 1 is the matched item
# itself, without the leading whitespace.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
Guido van Rossum1aec3231997-04-08 14:24:39 +000098
# Compiled forms of the token patterns.
tokenprog = re.compile(Token)
pseudoprog = re.compile(PseudoToken)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

# Maps the opening of a string literal to the compiled pattern that matches
# the rest of the string.  The bare prefix letters map to None so that a
# lookup on a prefix character is false and the quote after it can be tried.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            'r': None, 'R': None, 'u': None, 'U': None}
for _quotes, _prog in (("'''", single3prog), ('"""', double3prog)):
    for _prefix in ('', 'r', 'R', 'u', 'U', 'ur', 'uR', 'Ur', 'UR'):
        endprogs[_prefix + _quotes] = _prog
del _quotes, _prog, _prefix
Guido van Rossum4d8e8591992-01-01 19:34:47 +0000112
# Tab stop width used by generate_tokens() when it measures the indentation
# column of a line: a tab advances the column to the next multiple of this.
tabsize = 8
Fred Drake9b8d8012000-08-17 04:45:13 +0000114
class TokenError(Exception):
    """Raised by generate_tokens() when the input ends inside a multi-line
    string or a multi-line statement."""
116
class StopTokenizing(Exception):
    """Caught and discarded by tokenize(); raising it while tokens are being
    delivered ends the scan early."""
Fred Drake9b8d8012000-08-17 04:45:13 +0000118
Guido van Rossum1aec3231997-04-08 14:24:39 +0000119def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
120 print "%d,%d-%d,%d:\t%s\t%s" % \
121 (srow, scol, erow, ecol, tok_name[type], repr(token))
Guido van Rossum4d8e8591992-01-01 19:34:47 +0000122
def tokenize(readline, tokeneater=printtoken):
    """Tokenize the input read via readline, passing each token to tokeneater.

    readline is called repeatedly to obtain lines of input.  tokeneater is
    called with the five fields of every token found and defaults to
    printtoken.  A StopTokenizing exception raised while the tokens are being
    delivered ends the scan quietly; other exceptions propagate.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        return
128
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Call tokeneater with the five fields of each token read via readline.

    The tokens come from generate_tokens(readline).  Exceptions raised by the
    generator or by tokeneater are not caught here.
    """
    for token_info in generate_tokens(readline):
        # Extended call syntax replaces the older apply() spelling.
        tokeneater(*token_info)
133
def generate_tokens(readline):
    """Yield the tokens found in the lines returned by readline().

    readline is called with no arguments, once per line; an empty string
    signals the end of the input.  Every token is yielded as a 5-tuple

        (type, string, (srow, scol), (erow, ecol), line)

    where the two pairs are the (line number, column) at which the token
    starts and ends, and line is the text of the line(s) the token was found
    on.  Line numbers start at 1.  After the last line, one DEDENT is yielded
    for every indentation level still open, followed by ENDMARKER.

    Raises TokenError if the input ends inside a multi-line string or inside
    a multi-line statement.
    """
    # lnum: number of the line being scanned; parenlev: depth of open
    # brackets; continued: set when a backslash continuation was seen.
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    # contstr: text collected so far for a string that spans lines;
    # needcont: set for a ' or " string continued with backslash-newline.
    contstr, needcont = '', 0
    # contline: the physical lines collected so far for such a string.
    contline = None
    # Stack of the indentation columns currently open.
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            # endprog was selected when the string was opened (see below).
            endmatch = endprog.match(line)
            if endmatch:
                # The string closes on this line; the rest of the line is
                # scanned by the token loop further down.
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                           strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # A backslash-continued string whose next line neither
                # closes it nor ends in backslash-newline is an error.
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                # Still inside the string: keep collecting.
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            # The line holds nothing but whitespace and no newline.
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                # Such lines yield one COMMENT or NL token and do not affect
                # the indentation stack.
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                # Group 1 is the token without its leading whitespace.
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # A newline inside brackets does not end the statement.
                    yield (parenlev > 0 and NL or NEWLINE,
                               token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in ("'''", '"""',               # triple-quoted
                               "r'''", 'r"""', "R'''", 'R"""',
                               "u'''", 'u"""', "U'''", 'U"""',
                               "ur'''", 'ur"""', "Ur'''", 'Ur"""',
                               "uR'''", 'uR"""', "UR'''", 'UR"""'):
                    # Only the opening quotes were matched; look for the
                    # closing quotes on the remainder of this line.
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in ("'", '"') or \
                    token[:2] in ("r'", 'r"', "R'", 'R"',
                                  "u'", 'u"', "U'", 'U"') or \
                    token[:3] in ("ur'", 'ur"', "Ur'", 'Ur"',
                                  "uR'", 'uR"', "UR'", 'UR"' ):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Prefix letters map to None in endprogs, so this
                        # picks the entry for the first quote character.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    # Everything else is an operator; brackets also adjust
                    # the nesting depth.
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                # Nothing matched here: report one character and move on.
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000256
if __name__ == '__main__':                     # testing
    # Print the tokens of the file named by the first command line argument,
    # or of standard input when no argument is given.
    import sys
    if len(sys.argv) > 1:
        source = open(sys.argv[1])
        try:
            tokenize(source.readline)
        finally:
            # Close the file even when tokenizing raises (e.g. TokenError).
            source.close()
    else:
        tokenize(sys.stdin.readline)