blob: e0e902b5a61432bc9d807b6bb2a27659d17ac3be [file] [log] [blame]
"""Tokenization help for Python programs.

This module exports a function called 'tokenize()' that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF) and a "token-eater"
function which is called once for each token found.  The latter function is
passed the token type, a string containing the token, the starting and
ending (row, column) coordinates of the token, and the original line.  It is
designed to match the working of the Python tokenizer exactly, except that
it produces COMMENT tokens for comments, NL tokens for line breaks that do
not end a statement, and gives type OP for all operators."""
Guido van Rossumb51eaa11997-03-07 00:21:55 +000011
Ka-Ping Yee244c5932001-03-01 13:56:40 +000012__author__ = 'Ka-Ping Yee <ping@lfw.org>'
Ka-Ping Yee4f64c132001-03-01 17:11:17 +000013__credits__ = \
14 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
Guido van Rossumb51eaa11997-03-07 00:21:55 +000015
Guido van Rossum3b631771997-10-27 20:44:15 +000016import string, re
Guido van Rossumfc6f5331997-03-07 00:21:12 +000017from token import *
Guido van Rossum4d8e8591992-01-01 19:34:47 +000018
Skip Montanaro40fc1602001-03-01 04:27:19 +000019import token
# Public names: every name in dir(token) that does not start with an
# underscore, plus the names this module adds.  The 'token' module binding
# is only needed to build this list, so it is deleted again afterwards.
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize", "NL"]
del token
22
# Two token types that exist only in this module.  They take the next two
# free type numbers after the ones imported from token, are registered in
# tok_name so they can be printed by name, and N_TOKENS is advanced past them.
COMMENT = N_TOKENS
NL = COMMENT + 1
tok_name[COMMENT] = 'COMMENT'
tok_name[NL] = 'NL'
N_TOKENS = N_TOKENS + 2
Guido van Rossum1aec3231997-04-08 14:24:39 +000028
# Changes from 1.3:
#     Ignore now accepts \f as whitespace.  Operator now includes '**'.
#     Ignore and Special now accept \n or \r\n at the end of a line.
#     Imagnumber is new.  Expfloat is corrected to reject '0e4'.
# Note: to quote a backslash in a regex, it must be doubled in a r'aw' string.
Guido van Rossum1aec3231997-04-08 14:24:39 +000034
def group(*choices):
    """Return a regex source string matching any one of *choices*.

    The alternatives are joined with '|' and wrapped in a capturing group.
    """
    return '(' + '|'.join(choices) + ')'

def any(*choices):
    """Return a regex source string matching zero or more of *choices*.

    NOTE: this name shadows the builtin any() on Pythons that have one; it
    is kept because the pattern definitions in this module call it.
    """
    # Extended call syntax replaces the deprecated apply() builtin.
    return group(*choices) + '*'

def maybe(*choices):
    """Return a regex source string matching zero or one of *choices*."""
    return group(*choices) + '?'
Guido van Rossum4d8e8591992-01-01 19:34:47 +000038
# Basic lexical classes.  Whitespace matches spaces, form feeds and tabs
# only; newline characters are not part of it.
Whitespace = r'[ \f\t]*'
# A comment runs from '#' up to (not including) the end of the line.
Comment = r'#[^\r\n]*'
# Ignore: whitespace, then any number of backslash-newline continuations
# (each followed by more whitespace), then an optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

# Integer literals.  Each form accepts an optional trailing 'l' or 'L'.
# Octnumber also matches a lone '0'.
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
# Floating-point literals: either a decimal point is present (exponent
# optional), or there is no point and the exponent is required.
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'[1-9]\d*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
# Imaginary literals: '0', a decimal integer or a float, followed by j/J.
Imagnumber = group(r'0[jJ]', r'[1-9]\d*[jJ]', Floatnumber + r'[jJ]')
# Alternatives are tried left to right, so the longer forms (imaginary,
# float) are listed before plain integers.
Number = group(Imagnumber, Floatnumber, Intnumber)
Guido van Rossum4d8e8591992-01-01 19:34:47 +000054
# The four "tail end" patterns match the remainder of a string whose opening
# quote has already been consumed, up to and including the closing quote.
# A backslash together with the character after it is skipped as a unit, so
# an escaped quote does not terminate the string.

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.  A quote inside the body is accepted as long as
# it is not the first character of a closing '''.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.  Same rule as above for embedded double quotes.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Opening triple quote, with an optional u/U prefix and an optional r/R
# prefix (in that order).
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string: a complete string, prefix included, whose body
# contains no unescaped newline.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
Guido van Rossum4d8e8591992-01-01 19:34:47 +000067
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

# Any single opening or closing bracket character.
Bracket = '[][(){}]'
# A line ending (\n or \r\n) or one of the punctuation characters : ; . , `
Special = group(r'\r?\n', r'[:;.,`]')
# Everything that is not a number, string or name.
Funny = group(Operator, Bracket, Special)

# A complete token preceded by ignorable text.  Token is compiled below as
# tokenprog; tokenize() itself matches with PseudoToken instead.
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string: either the string is closed on this
# line, or the line ends with a backslash-newline and the string continues.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
# Things tokenize() handles specially: a backslash-newline continuation, a
# comment, or the opening quotes of a triple-quoted string.
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# Leading whitespace followed by one token-like item.  The item is regex
# group 1, which is the span tokenize() reads from each match.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
Guido van Rossum1aec3231997-04-08 14:24:39 +000089
# Compiled forms of the patterns defined above.
tokenprog = re.compile(Token)
pseudoprog = re.compile(PseudoToken)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

# endprogs maps the opening of a string to the compiled pattern that finds
# its end.  Every spelling of a triple-quote opener (with or without u/U and
# r/R prefixes) shares the same two compiled patterns.  The bare prefix
# letters map to None so that a lookup keyed on a prefix character falls
# through to the quote character that follows it.
endprogs = {"'": re.compile(Single), '"': re.compile(Double)}
for _prefix in ('', 'r', 'R', 'u', 'U', 'ur', 'uR', 'Ur', 'UR'):
    endprogs[_prefix + "'''"] = single3prog
    endprogs[_prefix + '"""'] = double3prog
for _prefix in ('r', 'R', 'u', 'U'):
    endprogs[_prefix] = None
del _prefix
Guido van Rossum4d8e8591992-01-01 19:34:47 +0000103
# Tab stop width used by tokenize() when measuring leading whitespace: a tab
# advances the column to the next multiple of tabsize.
tabsize = 8
Fred Drake9b8d8012000-08-17 04:45:13 +0000105
class TokenError(Exception):
    """Raised by tokenize() when the input ends in the middle of a
    multi-line string or a multi-line statement."""
108
def printtoken(type, token, start, end, line): # for testing
    """Default token-eater: print one token per line on standard output.

    type  -- numeric token type; printed by name via tok_name
    token -- the token text; printed with repr()
    start -- (row, column) pair where the token begins
    end   -- (row, column) pair where the token ends
    line  -- the source line containing the token (not printed)

    The two position pairs are unpacked in the body rather than in the
    signature, and print is written as a call on a single string, so the
    function has the same behaviour under Python 2 and Python 3.
    """
    srow, scol = start
    erow, ecol = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))
Guido van Rossum4d8e8591992-01-01 19:34:47 +0000112
def tokenize(readline, tokeneater=printtoken):
    """Split the text supplied by *readline* into tokens.

    readline   -- callable taking no arguments that returns the next line
                  of input as a string, or an empty string at end of input
    tokeneater -- callable invoked once per token as
                  tokeneater(type, token, (srow, scol), (erow, ecol), line);
                  defaults to printtoken

    Token types passed to tokeneater are NUMBER, STRING, NAME, OP, COMMENT,
    NEWLINE, NL, INDENT, DEDENT, ERRORTOKEN and finally ENDMARKER.

    Raises TokenError if the input ends inside a multi-line string or in
    the middle of a continued or bracketed statement.  Returns None.
    """
    # lnum: current line number (1-based once a line has been read)
    # parenlev: nesting depth of open brackets
    # continued: set when the previous line ended with a backslash
    lnum = parenlev = continued = 0
    namechars, numchars = string.letters + '_', string.digits
    # contstr: text collected so far for a string spanning several lines
    # needcont: set when that string is single-quoted, so every line of it
    #           must end in a backslash-newline
    contstr, needcont = '', 0
    contline = None
    indents = [0]                              # stack of indentation columns

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                tokeneater(STRING, contstr + line[:end],
                           strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                tokeneater(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                # Reset needcont together with contstr.  If the flag were
                # left set, a later triple-quoted string spanning several
                # lines (which never sets needcont itself) would be
                # reported as an ERRORTOKEN on its second line.
                contstr, needcont = '', 0
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break               # only whitespace, no newline

            if line[pos] in '#\r\n':           # skip comments or blank lines
                tokeneater((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                tokeneater(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents = indents[:-1]
                tokeneater(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                # Group 1 is the token itself, without leading whitespace.
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars \
                   or (initial == '.' and token != '.'):  # ordinary number
                    tokeneater(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # Inside brackets a line break does not end the statement.
                    tokeneater(parenlev > 0 and NL or NEWLINE,
                               token, spos, epos, line)
                elif initial == '#':
                    tokeneater(COMMENT, token, spos, epos, line)
                elif token in ("'''", '"""',               # triple-quoted
                               "r'''", 'r"""', "R'''", 'R"""',
                               "u'''", 'u"""', "U'''", 'U"""',
                               "ur'''", 'ur"""', "Ur'''", 'Ur"""',
                               "uR'''", 'uR"""', "UR'''", 'UR"""'):
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        tokeneater(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in ("'", '"') or \
                    token[:2] in ("r'", 'r"', "R'", 'R"',
                                  "u'", 'u"', "U'", 'U"') or \
                    token[:3] in ("ur'", 'ur"', "Ur'", 'Ur"',
                                  "uR'", 'uR"', "UR'", 'UR"' ):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Prefix letters map to None in endprogs, so this
                        # picks the pattern for the first quote character.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        tokeneater(STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    tokeneater(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    tokeneater(OP, token, spos, epos, line)
            else:
                # Nothing matches here: report one character and move on.
                tokeneater(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        tokeneater(DEDENT, '', (lnum, 0), (lnum, 0), '')
    tokeneater(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000235
if __name__ == '__main__':                     # testing
    # Tokenize the file named on the command line, or standard input when
    # no argument is given, printing each token with printtoken.
    import sys
    if len(sys.argv) > 1:
        source = open(sys.argv[1])
        try:
            tokenize(source.readline)
        finally:
            # Close the file even if tokenize() raises TokenError.
            source.close()
    else:
        tokenize(sys.stdin.readline)