"""tokenize.py (Ka-Ping Yee, 4 March 1997)

This module compiles a regular expression that recognizes Python tokens
in individual lines of text. The regular expression handles everything
except indentation, continuations, and triple-quoted strings. The function
'tokenize.tokenize()' takes care of these things for streams of text. It
accepts a readline()-style function and a token-eater function, uses the
former to scan the input line by line, and calls the latter once for each
token found, passing its type, a string containing the token, the line
number, the line itself, and the starting and ending positions of the
token within the line. It is designed to match the working of the Python
tokenizer exactly."""
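
# A minimal usage sketch (the file name 'spam.py' is hypothetical; any
# function that returns the next line of input, or '' at EOF, will do):
#
#     import tokenize
#     f = open('spam.py')
#     tokenize.tokenize(f.readline)       # prints one line per token
#     f.close()
#
# Pass your own token-eater function as the second argument to collect
# tokens instead of printing them; printtoken() below shows the signature.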

import string, regex
from token import *

def group(*choices): return '\(' + string.join(choices, '\|') + '\)'
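
# For example, group('abc', 'xyz') yields '\(abc\|xyz\)'. The patterns
# below use the regex module's default (emacs-style) syntax: \( \| \) are
# the grouping and alternation operators, bare parentheses are literal,
# and a backslash inside [...] is an ordinary character -- hence '[\]'
# wherever a single backslash must be matched.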

Ignore = '[ \f\t]*\([\]\r?\n[ \t]*\)*\(#.*\)?'
Name = '[a-zA-Z_][a-zA-Z0-9_]*'

Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
Octnumber = '0[0-7]*[lL]?'
Decnumber = '[1-9][0-9]*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = '[eE][-+]?[0-9]+'
Pointfloat = group('[0-9]+\.[0-9]*', '\.[0-9]+') + group(Exponent) + '?'
Expfloat = '[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Number = group(Floatnumber, Intnumber)
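
# Literals these recognize include 0xffL, 0377 and 42 (Intnumber), and
# 3.14, .5, 1e10 (Floatnumber). Order matters: Floatnumber is listed
# before Intnumber so that the first matching alternative takes all of
# '3.14' instead of stopping after the '3'.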

Single = group('^\'', '[^\]\'')
Double = group('^"', '[^\]"')
Tsingle = group('^\'\'\'', '[^\]\'\'\'')
Tdouble = group('^"""', '[^\]"""')
Triple = group('\'\'\'', '"""')
String = group('\'' + group('[\].', '[^\'\]') + '*' + group('\'', '[\]\n'),
               '"' + group('[\].', '[^"\]') + '*' + group('"', '[\]\n'))

Operator = group('\+', '\-', '\*\*', '\*', '\^', '~', '/', '%', '&', '|',
                 '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
Bracket = '[][(){}]'
Special = group('[\]?\r?\n', '[:;.,`\f]')
Funny = group(Operator, Bracket, Special)
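
# Within Operator, longer spellings precede their prefixes ('\*\*' before
# '\*', '<=' and '<<' before '<'): the first alternative that matches
# wins, so listing '\*' first would split '**' into two tokens.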

PlainToken = group(Name, Number, Triple, String, Funny)
Token = Ignore + PlainToken
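
# Group numbering in the combined pattern: Ignore contributes groups 1
# (backslash-newline continuation) and 2 (comment), so the PlainToken
# alternation becomes group 3 -- which is why tokenize() takes the token's
# extent from tokenprog.regs[3].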

try:
    save_syntax = regex.set_syntax(0)       # use default syntax
    tokenprog = regex.compile(Token)
    endprogs = { '\'': regex.compile(Single), '"': regex.compile(Double),
                 '\'\'\'': regex.compile(Tsingle), '"""': regex.compile(Tdouble) }
finally:
    regex.set_syntax(save_syntax)           # restore original syntax
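
# The try/finally is defensive: the patterns above only mean what they say
# under the default (emacs-style) syntax, but an importing program may have
# selected another syntax with regex.set_syntax(), so whatever was in
# effect before is always restored.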

tabsize = 8
TokenError = 'TokenError'                   # string exception, 1.x style

def printtoken(type, string, linenum, line, start, end):  # for testing
    print `linenum` + ':', tok_name[type], repr(string)
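
# For the line 'x = 1', the default token-eater prints roughly:
#     1: NAME 'x'
#     1: OP '='
#     1: NUMBER '1'
#     1: NEWLINE '\012'
# (the exact rendering of the newline depends on repr()).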

def tokenize(readline, tokeneater=printtoken):
    linenum = parenlev = continued = 0
    namechars, numchars = string.letters + '_', string.digits
    contstr = ''
    indents = [0]
    while 1:                                # loop over lines in stream
        line = readline()
        linenum = linenum + 1
        if line[-2:] == '\r\n': line = line[:-2] + '\n'
        pos, max = 0, len(line)

        if contstr:                         # continued string
            if not line: raise TokenError, "EOF within multi-line string"
            if contstr[-2:] == '\\\n': contstr = contstr[:-2] + '\n'
            if endprog.search(line) >= 0:
                pos = end = endprog.regs[0][1]
                tokeneater(STRING, contstr + line[:end], linenum, line, 0, 0)
                contstr = ''
            else:
                contstr = contstr + line
            continue

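        # (The 'endprog' used above is assigned further down, where the
        # scanner first meets the opening quote of a triple-quoted or
        # backslash-continued string; every later readline() result passes
        # through this branch until endprog finds the closing quote.)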
83 elif parenlev == 0 and not continued: # this is a new statement
84 if not line: break
85 column = 0
86 while 1: # measure leading whitespace
87 if line[pos] == ' ': column = column + 1
88 elif line[pos] == '\t': column = (column/tabsize + 1) * tabsize
89 elif line[pos] == '\f': column = 0
90 else: break
91 pos = pos + 1
92 if line[pos] in '#\n': continue # skip comments or blank lines
93
94 if column > indents[-1]: # count indents or dedents
95 indents.append(column)
96 tokeneater(INDENT, '\t', linenum, line, 0, 0)
97 while column < indents[-1]:
98 indents = indents[:-1]
99 tokeneater(DEDENT, '\t', linenum, line, 0, 0)
100
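        # (Column arithmetic: a tab advances to the next multiple of tabsize,
        # so a line starting ' \t' gives column (1/8 + 1) * 8 = 8, the same
        # treatment of tabs as the real tokenizer's.)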
101 else: # continued statement
102 if not line: raise TokenError, "EOF within multi-line statement"
103 continued = 0
104
        while pos < max:
            if tokenprog.match(line, pos) > 0:          # scan for tokens
                start, end = tokenprog.regs[3]
                token = line[start:end]
                pos = end

                if token[0] in namechars:               # ordinary name
                    tokeneater(NAME, token, linenum, line, start, end)
                elif token[0] in numchars:              # ordinary number
                    tokeneater(NUMBER, token, linenum, line, start, end)

                elif token in ('\'\'\'', '"""'):        # triple-quoted
                    endprog = endprogs[token]
                    if endprog.search(line, pos) >= 0:  # all on one line
                        pos = endprog.regs[0][1]
                        token = line[start:pos]         # pass the whole literal
                        tokeneater(STRING, token, linenum, line, start, pos)
                    else:
                        contstr = line[start:]          # multiple lines
                        break
                elif token[0] in '\'"':
                    if token[-1] == '\n':               # continued string
                        endprog, contstr = endprogs[token[0]], line[start:]
                        break
                    else:                               # ordinary string
                        tokeneater(STRING, token, linenum, line, start, end)

                elif token[0] == '\n':
                    tokeneater(NEWLINE, token, linenum, line, start, end)
                elif token[0] == '\\':                  # continued stmt
                    continued = 1

                else:
                    if token[0] in '([{': parenlev = parenlev + 1
                    if token[0] in ')]}': parenlev = parenlev - 1
                    tokeneater(OP, token, linenum, line, start, end)
            else:
                tokeneater(ERRORTOKEN, line[pos], linenum, line, pos, pos + 1)
                pos = pos + 1

    for indent in indents[1:]:              # pop remaining indent levels
        tokeneater(DEDENT, '\t', linenum, line, 0, 0)

if __name__ == '__main__':                  # testing
    import sys
    file = open(sys.argv[-1])
    tokenize(file.readline)
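
# Run as a script, e.g. 'python tokenize.py spam.py', this tokenizes the
# named file; with no argument, sys.argv[-1] is the script's own name, so
# it tokenizes itself.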