blob: 2af595daea44425dc0a46b9717d836d8ebfd9197 [file] [log] [blame]
"""Tokenization help for Python programs.

This module exports a function called 'tokenize()' that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF) and a "token-eater"
function which is called once for each token found.  The latter function is
passed the token type, a string containing the token, the starting and
ending (row, column) coordinates of the token, and the original line.  It is
designed to match the working of the Python tokenizer exactly, except that
it produces COMMENT tokens for comments, NL tokens for blank lines and for
line breaks inside brackets, and gives type OP for all operators."""
Guido van Rossumb51eaa11997-03-07 00:21:55 +000011
# Module metadata: the author of this module and further contributors.
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro')
Guido van Rossumb51eaa11997-03-07 00:21:55 +000015
Guido van Rossum3b631771997-10-27 20:44:15 +000016import string, re
Guido van Rossumfc6f5331997-03-07 00:21:12 +000017from token import *
Guido van Rossum4d8e8591992-01-01 19:34:47 +000018
# Public API: every public name of the standard ``token`` module plus the
# names this module adds.  The module object is only needed long enough to
# list its attributes, so the name is removed again afterwards.
import token
__all__ = [x for x in dir(token) if not x.startswith('_')]
__all__.extend(["COMMENT", "tokenize", "NL"])
del token
22
# Two token types that only this module produces.  They take the next two
# free type numbers, get printable names in tok_name, and N_TOKENS is moved
# past them.
COMMENT, NL = N_TOKENS, N_TOKENS + 1
tok_name[COMMENT], tok_name[NL] = 'COMMENT', 'NL'
N_TOKENS = NL + 1
Guido van Rossum1aec3231997-04-08 14:24:39 +000028
def group(*choices):
    """Return a regex matching any one of the given sub-patterns.

    The alternatives are joined with '|' and wrapped in a capturing group.
    """
    return '(' + '|'.join(choices) + ')'


def any(*choices):
    """Return a regex matching zero or more repetitions of the alternatives.

    The name is kept for compatibility with existing callers.
    """
    # Argument unpacking replaces the deprecated apply() builtin.
    return group(*choices) + '*'


def maybe(*choices):
    """Return a regex matching the alternatives zero or one time."""
    return group(*choices) + '?'
Guido van Rossum4d8e8591992-01-01 19:34:47 +000032
# Horizontal whitespace only; line ends are matched separately.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
# What may precede a token: whitespace, any number of backslash-newline
# continuations (each followed by more whitespace), and an optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

# Integer literals, each with an optional long suffix.
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)

# Floating point literals.
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
# Any run of digits may carry an exponent, including one that starts with
# a zero: 0e0 and 00e1 are single float literals, not an integer followed
# by a name.
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)

# Imaginary literals: digits (leading zeros allowed, e.g. 01j) or a float,
# followed by j or J.
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')

# Alternatives are tried left to right, so the forms that extend a shorter
# literal (imaginary, then float) must come before plain integers.
Number = group(Imagnumber, Floatnumber, Intnumber)
Guido van Rossum4d8e8591992-01-01 19:34:47 +000048
# The four "tail" patterns below match the remainder of a string literal
# up to and including its closing quote(s); the opening quote is assumed
# to have been consumed already.

# Remainder of a ' string.
Single = (r"[^'\\]*"                        # ordinary characters
          r"(?:\\.[^'\\]*)*"                # escape + ordinary characters
          r"'")                             # closing quote
# Remainder of a " string.
Double = (r'[^"\\]*'
          r'(?:\\.[^"\\]*)*'
          r'"')
# Remainder of a ''' string; a quote that does not begin the closing
# ''' is accepted as part of the body.
Single3 = (r"[^'\\]*"
           r"(?:(?:\\.|'(?!''))[^'\\]*)*"
           r"'''")
# Remainder of a """ string.
Double3 = (r'[^"\\]*'
           r'(?:(?:\\.|"(?!""))[^"\\]*)*'
           r'"""')

# Opening of a triple-quoted string, with optional u/U and r/R prefixes.
Triple = group(r"[uU]?[rR]?" + "'''",
               r"[uU]?[rR]?" + '"""')

# A complete ' or " string that starts and ends on the same line.
String = group(r"[uU]?[rR]?'" r"[^\n'\\]*(?:\\.[^\n'\\]*)*" r"'",
               r'[uU]?[rR]?"' r'[^\n"\\]*(?:\\.[^\n"\\]*)*' r'"')
Guido van Rossum4d8e8591992-01-01 19:34:47 +000061
# The regex engine takes the first alternative that matches rather than the
# longest one, so every multi-character operator is listed ahead of the
# shorter operator it starts with (with = listed before ==, the latter
# would come out as two separate = tokens).
Operator = group(
    r"\*\*=?",                  # ** and **=
    r">>=?",                    # >> and >>=
    r"<<=?",                    # << and <<=
    r"<>",
    r"!=",
    r"[+\-*/%&|^=<>]=?",        # single-character operators, optional =
    r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`]')
# Everything that is neither a number, a string nor a name.
Funny = group(Operator, Bracket, Special)

# One token, and one token preceded by ignorable text.
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
Guido van Rossum4d8e8591992-01-01 19:34:47 +000075
# A ' or " string as far as it extends on its first line: it ends either
# with the closing quote (complete string) or with a backslash-newline
# (the string continues on the next line).
ContStr = group(
    r"[uU]?[rR]?'" r"[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r'\\\r?\n'),
    r'[uU]?[rR]?"' r'[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r'\\\r?\n'))

# Items the line scanner treats like tokens although they are not complete
# tokens themselves: a line continuation, a comment, or the opening of a
# triple-quoted string.
PseudoExtras = group(r'\\\r?\n', Comment, Triple)

# What the scanner matches at each position: optional leading whitespace,
# then one item, which is captured as group 1.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
Guido van Rossum1aec3231997-04-08 14:24:39 +000083
# Compiled forms of the patterns used while scanning.
tokenprog = re.compile(Token)
pseudoprog = re.compile(PseudoToken)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

# Maps the opening of a string to the compiled pattern that finds its end.
# The bare prefix letters map to None so that a lookup can fall through
# from a prefix character to the quote character that follows it.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            'r': None, 'R': None, 'u': None, 'U': None}
for _prefix in ('', 'r', 'R', 'u', 'U', 'ur', 'uR', 'Ur', 'UR'):
    endprogs[_prefix + "'''"] = single3prog
    endprogs[_prefix + '"""'] = double3prog
del _prefix
Guido van Rossum4d8e8591992-01-01 19:34:47 +000097
# Distance between tab stops; a tab in leading whitespace advances the
# indentation column to the next multiple of this value.
tabsize = 8
Fred Drake9b8d8012000-08-17 04:45:13 +000099
class TokenError(Exception):
    """Raised when the input ends inside a multi-line string or statement."""
101
class StopTokenizing(Exception):
    """Signal that tokenizing should stop; tokenize() catches it silently."""
Fred Drake9b8d8012000-08-17 04:45:13 +0000103
def printtoken(type, token, start, end, line): # for testing
    """Print a one-line description of a token (the default token-eater).

    type  -- numeric token type, looked up in tok_name for display
    token -- the token text
    start -- (row, column) pair where the token begins
    end   -- (row, column) pair where the token ends
    line  -- the source line containing the token (not printed)
    """
    # The coordinate pairs are unpacked here rather than in the signature;
    # tuple parameters are not accepted by Python 3.
    srow, scol = start
    erow, ecol = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))
Guido van Rossum4d8e8591992-01-01 19:34:47 +0000107
def tokenize(readline, tokeneater=printtoken):
    """Tokenize the lines returned by readline(), passing tokens on.

    readline   -- callable returning the next input line, or "" at EOF
    tokeneater -- callable invoked once per token; by default each token
                  is printed

    A StopTokenizing exception escaping from the tokenizing loop ends the
    run quietly; any other exception propagates to the caller.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        return
113
def tokenize_loop(readline, tokeneater):
    """Read lines with readline() and hand every token to tokeneater().

    readline   -- callable returning the next input line, or "" at EOF
    tokeneater -- called as tokeneater(type, token, (srow, scol),
                  (erow, ecol), line) for each token; rows count from 1,
                  columns from 0

    After the last line, one DEDENT is emitted for every indentation level
    still open, followed by ENDMARKER.

    Raises TokenError if the input ends inside a multi-line string or a
    multi-line statement.  Exceptions raised by tokeneater (including
    StopTokenizing) are not caught here.
    """
    lnum = parenlev = continued = 0
    # Spelled out instead of string.letters: the Name pattern only ever
    # starts with an ASCII letter or underscore, and string.letters is
    # locale-dependent and absent from Python 3.
    namechars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'
    numchars = string.digits
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, linelen = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                tokeneater(STRING, contstr + line[:end],
                           strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # A ' or " string was continued with a backslash but this
                # line neither closes it nor continues it again.
                tokeneater(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                # needcont must be cleared together with contstr; left
                # set, it would make a later multi-line triple-quoted
                # string look like a broken continuation.
                contstr, needcont = '', 0
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < linelen:               # measure leading whitespace
                char = line[pos]
                if char == ' ':
                    column = column + 1
                elif char == '\t':
                    # Advance to the next tab stop using integer
                    # arithmetic only, so the result does not depend on
                    # how '/' divides.
                    column = column - column % tabsize + tabsize
                elif char == '\f':
                    column = 0
                else:
                    break
                pos = pos + 1
            if pos == linelen:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    linetype = COMMENT
                else:
                    linetype = NL
                tokeneater(linetype, line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                tokeneater(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            # NOTE(review): a dedent to a column that matches no enclosing
            # indentation level is not reported; levels are simply popped
            # until the column is no longer smaller than the innermost one.
            while column < indents[-1]:
                indents.pop()
                tokeneater(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < linelen:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                    # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    tokeneater(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # Inside brackets a line end does not end the statement.
                    if parenlev > 0:
                        newlinetype = NL
                    else:
                        newlinetype = NEWLINE
                    tokeneater(newlinetype, token, spos, epos, line)
                elif initial == '#':
                    tokeneater(COMMENT, token, spos, epos, line)
                elif token in ("'''", '"""',               # triple-quoted
                               "r'''", 'r"""', "R'''", 'R"""',
                               "u'''", 'u"""', "U'''", 'U"""',
                               "ur'''", 'ur"""', "Ur'''", 'Ur"""',
                               "uR'''", 'uR"""', "UR'''", 'UR"""'):
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        tokeneater(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in ("'", '"') or \
                    token[:2] in ("r'", 'r"', "R'", 'R"',
                                  "u'", 'u"', "U'", 'U"') or \
                    token[:3] in ("ur'", 'ur"', "Ur'", 'Ur"',
                                  "uR'", 'uR"', "UR'", 'UR"'):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Prefix letters map to None in endprogs, so this
                        # falls through to the entry for the quote itself.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        tokeneater(STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    tokeneater(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev = parenlev + 1
                    elif initial in ')]}':
                        parenlev = parenlev - 1
                    tokeneater(OP, token, spos, epos, line)
            else:
                # Nothing matches here: report the single character.
                tokeneater(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        tokeneater(DEDENT, '', (lnum, 0), (lnum, 0), '')
    tokeneater(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000236
if __name__ == '__main__':                     # testing
    # Print the tokens of the file named on the command line, or of
    # standard input when no file name is given.
    import sys
    if len(sys.argv) > 1:
        source = open(sys.argv[1])
        try:
            tokenize(source.readline)
        finally:
            # Close the file even if tokenizing raises.
            source.close()
    else:
        tokenize(sys.stdin.readline)