blob: d742899619e475d3137402e210b2297e7f72a742 [file] [log] [blame]
"""Tokenization help for Python programs.

This module compiles a regular expression that recognizes Python
tokens in individual lines of text.  The regular expression handles
everything except indentation, continuations, and triple-quoted
strings.  The function 'tokenize.tokenize()' takes care of these
things for streams of text.  It accepts a readline-like function which
is called repeatedly to come up with the next input line (or "" for
EOF), and a "token-eater" function which is called for each token
found, passing its type, a string containing the token, the line
number, the line, and the starting and ending positions of the token
within the line.  It is designed to match the working of the Python
tokenizer exactly.

"""

# Author/version stamp; this module predates the later __author__ /
# __credits__ conventions.
__version__ = "Ka-Ping Yee, 4 March 1997, updated by GvR, 6 March 1997"
Guido van Rossum4d8e8591992-01-01 19:34:47 +000018
Guido van Rossumfc6f5331997-03-07 00:21:12 +000019import string, regex
20from token import *
Guido van Rossum4d8e8591992-01-01 19:34:47 +000021
Guido van Rossumfc6f5331997-03-07 00:21:12 +000022def group(*choices): return '\(' + string.join(choices, '\|') + '\)'
Guido van Rossum4d8e8591992-01-01 19:34:47 +000023
# The pattern fragments below are written in the old `regex' module's
# default (emacs-like) syntax: \( \) groups, \| alternation -- see
# group() above and the set_syntax(0) call further down.

# Ignorable prefix of a token: blanks, backslash-newline continuations,
# and an optional comment.
Ignore = '[ \f\t]*\([\]\r?\n[ \t]*\)*\(#.*\)?'
Name = '[a-zA-Z_][a-zA-Z0-9_]*'          # identifier

# Integer literals, each with an optional Python 1.x long suffix l/L.
Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
Octnumber = '0[0-7]*[lL]?'
Decnumber = '[1-9][0-9]*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
# Float literals: a pointed form with optional exponent, or a plain
# digit run with a mandatory exponent.
Exponent = '[eE][-+]?[0-9]+'
Pointfloat = group('[0-9]+\.[0-9]*', '\.[0-9]+') + group(Exponent) + '?'
Expfloat = '[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Number = group(Floatnumber, Intnumber)   # floats first so '1.5' is one token

# Patterns locating the closing quote of a string begun on a previous
# line: the quote at start-of-line, or a quote not preceded by a
# backslash.  (Inside [...] a backslash is a literal character in this
# regex syntax, so [^\] means "any character except backslash".)
Single = group('^\'', '[^\]\'')
Double = group('^"', '[^\]"')
Tsingle = group('^\'\'\'', '[^\]\'\'\'')
Tdouble = group('^"""', '[^\]"""')
Triple = group('\'\'\'', '"""')          # opening triple-quote delimiters
# A string on one line: quote, then backslash-escapes or plain chars,
# terminated by the closing quote or by backslash-newline (which means
# the string continues on the next line).
String = group('\'' + group('[\].', '[^\'\]')+ '*' + group('\'', '[\]\n'),
               '"' + group('[\].', '[^"\]') + '*' + group('"', '[\]\n'))

# Longer operators listed before their prefixes so '**' beats '*'.
Operator = group('\+', '\-', '\*\*', '\*', '\^', '~', '/', '%', '&', '|',
                 '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
Bracket = '[][(){}]'
Special = group('[\]?\r?\n', '[:;.,`\f]')   # (escaped) newline, punctuation
Funny = group(Operator, Bracket, Special)

# Master pattern: ignorable prefix, then exactly one real token.  The
# Ignore part contributes regs groups 1-2, so the PlainToken group is
# regs[3] -- tokenize() below depends on that numbering.
PlainToken = group(Name, Number, Triple, String, Funny)
Token = Ignore + PlainToken
Guido van Rossum4d8e8591992-01-01 19:34:47 +000053
# Compile the patterns once at import time.  The old `regex' module's
# syntax setting is process-global state, so force the default
# (emacs-like) syntax while compiling and restore the caller's setting
# afterwards.
try:
    save_syntax = regex.set_syntax(0)           # use default syntax
    tokenprog = regex.compile(Token)
    endprogs = { '\'': regex.compile(Single), '"': regex.compile(Double),
                 '\'\'\'': regex.compile(Tsingle), '"""': regex.compile(Tdouble) }
finally:
    # NOTE(review): if set_syntax(0) itself raised, save_syntax would be
    # unbound here and this cleanup would raise NameError -- confirm
    # set_syntax cannot fail before relying on this.
    regex.set_syntax(save_syntax)               # restore original syntax
Guido van Rossum4d8e8591992-01-01 19:34:47 +000061
# Column width used when expanding tabs while measuring indentation
# (see the column computation in tokenize()).
tabsize = 8
# String exception, the Python 1.x convention; raised by tokenize() on
# EOF inside a multi-line string or statement.
TokenError = 'TokenError'
def printtoken(type, string, linenum, line, start, end): # for testing
    # Default token-eater: print "<lineno>: <type-name> <repr-of-token>".
    # Backquotes are the Python 1.x spelling of repr().
    print `linenum` + ':', tok_name[type], repr(string)
Guido van Rossum4d8e8591992-01-01 19:34:47 +000066
def tokenize(readline, tokeneater = printtoken):
    """Break a stream of Python source text into tokens.

    readline   -- function returning the next input line, or '' at EOF
                  (e.g. the readline method of an open file).
    tokeneater -- called as tokeneater(type, token, linenum, line,
                  start, end) for each token found; defaults to
                  printtoken.

    Raises TokenError (a string exception) on EOF inside a multi-line
    string or a continued statement.
    """
    linenum = parenlev = continued = 0
    namechars, numchars = string.letters + '_', string.digits
    contstr = ''                        # pending text of an unfinished string
    indents = [0]                       # stack of indentation columns seen
    while 1:                            # loop over lines in stream
        line = readline()
        linenum = linenum + 1
        if line[-2:] == '\r\n': line = line[:-2] + '\n'   # normalize CRLF
        pos, max = 0, len(line)

        if contstr:                     # continued string
            if not line: raise TokenError, "EOF within multi-line string"
            if contstr[-2:] == '\\\n': contstr = contstr[:-2] + '\n'
            # endprog was chosen when the string was opened.  The old
            # regex module's search() returns -1 on failure and leaves
            # the match span in the global .regs attribute on success.
            if endprog.search(line) >= 0:
                pos = end = endprog.regs[0][1]
                tokeneater(STRING, contstr + line[:end], linenum, line, 0, 0)
                contstr = ''
            else:
                contstr = contstr + line
            continue

        elif parenlev == 0 and not continued:  # this is a new statement
            if not line: break
            column = 0
            while 1:                    # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1) * tabsize
                elif line[pos] == '\f': column = 0   # form feed resets column
                else: break
                pos = pos + 1
            if line[pos] in '#\n': continue  # skip comments or blank lines

            if column > indents[-1]:    # count indents or dedents
                indents.append(column)
                tokeneater(INDENT, '\t', linenum, line, 0, 0)
            while column < indents[-1]:
                indents = indents[:-1]
                tokeneater(DEDENT, '\t', linenum, line, 0, 0)

        else:                           # continued statement
            if not line: raise TokenError, "EOF within multi-line statement"
            continued = 0

        while pos < max:
            if tokenprog.match(line, pos) > 0:      # scan for tokens
                # regs[3] is the PlainToken group of the Token pattern,
                # i.e. the token text without its ignorable prefix.
                start, end = tokenprog.regs[3]
                token = line[start:end]
                pos = end

                if token[0] in namechars:           # ordinary name
                    tokeneater(NAME, token, linenum, line, start, end)
                elif token[0] in numchars:          # ordinary number
                    tokeneater(NUMBER, token, linenum, line, start, end)

                elif token in ('\'\'\'', '"""'):    # triple-quoted
                    endprog = endprogs[token]
                    if endprog.search(line, pos) >= 0:  # all on one line
                        pos = endprog.regs[0][1]
                        token = line[start:pos]
                        tokeneater(STRING, token, linenum, line, start, pos)
                    else:
                        contstr = line[start:]      # multiple lines
                        break
                elif token[0] in '\'"':
                    if token[-1] == '\n':           # continued string
                        # The String pattern matched up to a
                        # backslash-newline; finish it on later lines.
                        endprog, contstr = endprogs[token[0]], line[start:]
                        break
                    else:                           # ordinary string
                        tokeneater(STRING, token, linenum, line, start, end)

                elif token[0] == '\n':
                    tokeneater(NEWLINE, token, linenum, line, start, end)
                elif token[0] == '\\':              # continued stmt
                    continued = 1

                else:
                    # Operators and brackets.  parenlev feeds the
                    # "new statement" test above, so indentation is not
                    # processed inside open brackets.
                    if token[0] in '([{': parenlev = parenlev + 1
                    if token[0] in ')]}': parenlev = parenlev - 1
                    tokeneater(OP, token, linenum, line, start, end)
            else:
                # Nothing matched at pos: emit one character as an
                # error token and resynchronize on the next character.
                tokeneater(ERRORTOKEN, line[pos], linenum, line, pos, pos + 1)
                pos = pos + 1

    for indent in indents[1:]:          # pop remaining indent levels
        tokeneater(DEDENT, '\t', linenum, line, 0, 0)
153
if __name__ == '__main__':              # testing
    import sys
    # Tokenize the file named by the last command-line argument,
    # printing each token through the default printtoken token-eater.
    fp = open(sys.argv[-1])
    tokenize(fp.readline)