blob: 06ed74606ec6a8d4aa4685dcdf8665c7ab97393b [file] [log] [blame]
"""Tokenization help for Python programs.

This module compiles a regular expression that recognizes Python
tokens in individual lines of text.  The regular expression handles
everything except indentation, continuations, and triple-quoted
strings.  The function 'tokenize.tokenize()' takes care of these
things for streams of text.  It accepts a readline-like function which
is called repeatedly to come up with the next input line (or "" for
EOF), and a "token-eater" function which is called for each token
found, passing its type, a string containing the token, the line
number, the line, and the starting and ending positions of the token
within the line.  It is designed to match the working of the Python
tokenizer exactly.

"""
16
Guido van Rossumb5dc5e31997-03-10 23:17:01 +000017__version__ = "Ka-Ping Yee, 4 March 1997, updated by GvR, 10 March 1997"
Guido van Rossum4d8e8591992-01-01 19:34:47 +000018
Guido van Rossumfc6f5331997-03-07 00:21:12 +000019import string, regex
20from token import *
Guido van Rossum4d8e8591992-01-01 19:34:47 +000021
def group(*choices):
    # Join the alternatives with the Emacs-regex "or" operator and wrap
    # them in \( \) so the result can be embedded as one sub-pattern.
    alternatives = string.join(choices, '\|')
    return '\(' + alternatives + '\)'
Guido van Rossum4d8e8591992-01-01 19:34:47 +000023
# The patterns below are written for the old "regex" module's default
# (Emacs-style) syntax: \( \) delimit groups and \| is alternation.  A
# backslash inside a character class is an ordinary character there, so
# the class '[\]' matches a single backslash.

# Skippable inter-token material: horizontal whitespace, any number of
# backslash(-optional-\r)-newline continuations, and an optional comment.
Ignore = '[ \f\t]*\([\]\r?\n[ \t]*\)*\(#.*\)?'
# Identifiers (keywords are not distinguished at this level).
Name = '[a-zA-Z_][a-zA-Z0-9_]*'

ImagZero = '0[jJ]' # This is not caught by any of the following
Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
Octnumber = '0[0-7]*[lL]?'
Decnumber = '[1-9][0-9]*[lLjJ]?'
Intnumber = group(ImagZero, Hexnumber, Octnumber, Decnumber)
Exponent = '[eE][-+]?[0-9]+'
Pointfloat = group('[0-9]+\.[0-9]*', '\.[0-9]+') + group(Exponent) + '?'
Expfloat = '[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat) + "[jJ]?"
Number = group(Floatnumber, Intnumber)

# Single/Double locate the closing quote of a string continued from a
# previous line: either a quote at the very start of the line, or the
# first quote not preceded by a backslash.  Tsingle/Tdouble do the same
# for triple-quoted strings (used via the endprogs table in tokenize()).
Single = group('^\'', '[^\]\'')
Double = group('^"', '[^\]"')
Tsingle = group('^\'\'\'', '[^\]\'\'\'')
Tdouble = group('^"""', '[^\]"""')
# Opening triple-quote delimiters.
Triple = group('\'\'\'', '"""')
# A string token on one line: quote, then backslash-escaped pairs or
# plain characters, ended either by the closing quote or by a
# backslash-newline (meaning the string continues on the next line).
String = group('\'' + group('[\].', '[^\'\]')+ '*' + group('\'', '[\]\n'),
               '"' + group('[\].', '[^"\]') + '*' + group('"', '[\]\n'))

Operator = group('\+', '\-', '\*\*', '\*', '\^', '~', '/', '%', '&', '|',
                 '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
Bracket = '[][(){}]'
# A newline (optionally preceded by a backslash and/or \r), or one of
# the single-character punctuation tokens.
Special = group('[\]?\r?\n', '[:;.,`\f]')
Funny = group(Operator, Bracket, Special)

# Ignore contributes regex groups 1 and 2, so the PlainToken part of a
# Token match is group 3 -- tokenize() reads tokenprog.regs[3].
PlainToken = group(Name, Number, Triple, String, Funny)
Token = Ignore + PlainToken
Guido van Rossum4d8e8591992-01-01 19:34:47 +000054
# Compile the patterns under the regex module's default (Emacs-style)
# syntax, which is what they are written in; restore the importing
# program's syntax setting afterwards even if compilation fails.
try:
    save_syntax = regex.set_syntax(0) # use default syntax
    tokenprog = regex.compile(Token)
    # Maps an opening quote (or triple-quote) to the compiled pattern
    # that finds the matching closing quote on later input.
    endprogs = { '\'': regex.compile(Single), '"': regex.compile(Double),
                 '\'\'\'': regex.compile(Tsingle), '"""': regex.compile(Tdouble) }
finally:
    regex.set_syntax(save_syntax) # restore original syntax
Guido van Rossum4d8e8591992-01-01 19:34:47 +000062
tabsize = 8 # columns per tab stop, used when measuring indentation
TokenError = 'TokenError' # string exception raised on premature EOF

def printtoken(type, string, linenum, line, start, end): # for testing
    # Default token-eater: prints "<linenum>: <token-name> <token-text>".
    # (Backquotes are the old shorthand for repr().)
    print `linenum` + ':', tok_name[type], repr(string)
Guido van Rossum4d8e8591992-01-01 19:34:47 +000067
def tokenize(readline, tokeneater = printtoken):
    """Tokenize a stream of Python source text.

    readline -- called with no arguments; must return the next input
        line as a string, or "" at end of file (like a file object's
        readline method).
    tokeneater -- called once per token as tokeneater(type, string,
        linenum, line, start, end): a token-type constant from the
        token module, the token text, the 1-based line number, the
        full source line, and the token's start/end positions within
        that line.  INDENT/DEDENT and multi-line STRING notifications
        pass positions (0, 0).

    Raises TokenError (a string exception) if EOF is reached inside a
    multi-line string or a continued statement.
    """
    linenum = parenlev = continued = 0
    namechars, numchars = string.letters + '_', string.digits
    contstr = ''                            # accumulated multi-line string, or ''
    indents = [0]                           # stack of indentation columns
    while 1:                                # loop over lines in stream
        line = readline()
        linenum = linenum + 1
        # Normalize DOS line endings so only '\n' terminates a line.
        if line[-2:] == '\r\n': line = line[:-2] + '\n'
        pos, max = 0, len(line)             # NOTE(review): 'max' shadows the builtin

        if contstr:                         # continued string
            if not line: raise TokenError, "EOF within multi-line string"
            if contstr[-2:] == '\\\n': contstr = contstr[:-2] + '\n'
            # endprog was chosen when the string's opening quote was seen.
            if endprog.search(line) >= 0:
                pos = end = endprog.regs[0][1]
                tokeneater(STRING, contstr + line[:end], linenum, line, 0, 0)
                contstr = ''
            else:
                contstr = contstr + line
            continue

        elif parenlev == 0 and not continued: # this is a new statement
            if not line: break
            column = 0
            while 1:                        # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1) * tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if line[pos] in '#\n': continue # skip comments or blank lines

            if column > indents[-1]:        # count indents or dedents
                indents.append(column)
                tokeneater(INDENT, '\t', linenum, line, 0, 0)
            while column < indents[-1]:
                indents = indents[:-1]
                tokeneater(DEDENT, '\t', linenum, line, 0, 0)

        else:                               # continued statement
            if not line: raise TokenError, "EOF within multi-line statement"
            continued = 0

        while pos < max:
            if tokenprog.match(line, pos) > 0: # scan for tokens
                # regs[3] spans the PlainToken group: the token proper,
                # after any Ignore (whitespace/comment) prefix.
                start, end = tokenprog.regs[3]
                token = line[start:end]
                pos = end

                if token[0] in namechars:   # ordinary name
                    tokeneater(NAME, token, linenum, line, start, end)
                elif token[0] in numchars:  # ordinary number
                    tokeneater(NUMBER, token, linenum, line, start, end)

                elif token in ('\'\'\'', '"""'): # triple-quoted
                    endprog = endprogs[token]
                    if endprog.search(line, pos) >= 0: # all on one line
                        pos = endprog.regs[0][1]
                        token = line[start:pos]
                        tokeneater(STRING, token, linenum, line, start, pos)
                    else:
                        contstr = line[start:] # multiple lines
                        break
                elif token[0] in '\'"':
                    # A String match ending in '\n' means the closing
                    # quote was not on this line (backslash-newline).
                    if token[-1] == '\n':   # continued string
                        endprog, contstr = endprogs[token[0]], line[start:]
                        break
                    else:                   # ordinary string
                        tokeneater(STRING, token, linenum, line, start, end)

                elif token[0] == '\n':
                    tokeneater(NEWLINE, token, linenum, line, start, end)
                elif token[0] == '\\':      # continued stmt
                    continued = 1

                else:
                    # Track bracket nesting: inside brackets, newlines do
                    # not end the statement (see the parenlev test above).
                    if token[0] in '([{': parenlev = parenlev + 1
                    if token[0] in ')]}': parenlev = parenlev - 1
                    tokeneater(OP, token, linenum, line, start, end)
            else:
                # Nothing matched at this position: emit one character
                # as an error token and advance past it.
                tokeneater(ERRORTOKEN, line[pos], linenum, line, pos, pos + 1)
                pos = pos + 1

    for indent in indents[1:]:              # pop remaining indent levels
        tokeneater(DEDENT, '\t', linenum, line, 0, 0)
154
# Self-test: tokenize the file named by the last command-line argument,
# printing every token via the default printtoken token-eater.
if __name__ == '__main__':                  # testing
    import sys
    source = open(sys.argv[-1])
    tokenize(source.readline)