| """Tokenization help for Python programs. |
| |
| This module compiles a regular expression that recognizes Python |
| tokens in individual lines of text. The regular expression handles |
| everything except indentation, continuations, and triple-quoted |
| strings. The function 'tokenize.tokenize()' takes care of these |
| things for streams of text. It accepts a readline-like function which |
| is called repeatedly to come up with the next input line (or "" for |
| EOF), and a "token-eater" function which is called for each token |
| found, passing its type, a string containing the token, the line |
| number, the line, and the starting and ending positions of the token |
| within the line. It is designed to match the working of the Python |
| tokenizer exactly. |
| |
| """ |

__version__ = "Ka-Ping Yee, 4 March 1997, updated by GvR, 10 March 1997"

import string, regex
from token import *

def group(*choices): return '\(' + string.join(choices, '\|') + '\)'

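# Regular expression fragments, written for the 'regex' module's default
# (emacs-style) syntax: \( ... \) groups and \| separates alternatives,
# while bare parentheses and '|' match themselves.  Ignore matches
# whitespace, backslash-newline continuations, and comments; Name matches
# an identifier; the patterns below it match numeric literals.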
Ignore = '[ \f\t]*\([\]\r?\n[ \t]*\)*\(#.*\)?'
Name = '[a-zA-Z_][a-zA-Z0-9_]*'

ImagZero = '0[jJ]'      # This is not caught by any of the following
Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
Octnumber = '0[0-7]*[lL]?'
Decnumber = '[1-9][0-9]*[lLjJ]?'
Intnumber = group(ImagZero, Hexnumber, Octnumber, Decnumber)
Exponent = '[eE][-+]?[0-9]+'
Pointfloat = group('[0-9]+\.[0-9]*', '\.[0-9]+') + group(Exponent) + '?'
Expfloat = '[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat) + "[jJ]?"
Number = group(Floatnumber, Intnumber)

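# String patterns.  Single, Double, Tsingle, and Tdouble locate the closing
# quote of a string continued from an earlier line (a quote at the start of
# the line, or one not preceded by a backslash).  Triple matches a
# triple-quote opener; String matches a string that either closes on the
# same line or ends in a backslash-newline continuation.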
Single = group('^\'', '[^\]\'')
Double = group('^"', '[^\]"')
Tsingle = group('^\'\'\'', '[^\]\'\'\'')
Tdouble = group('^"""', '[^\]"""')
Triple = group('\'\'\'', '"""')
String = group('\'' + group('[\].', '[^\'\]') + '*' + group('\'', '[\]\n'),
               '"' + group('[\].', '[^"\]') + '*' + group('"', '[\]\n'))

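# Operators, brackets, and 'special' tokens (plain or backslash-continued
# line endings and miscellaneous punctuation), combined into Funny.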
Operator = group('\+', '\-', '\*\*', '\*', '\^', '~', '/', '%', '&', '|',
                 '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
Bracket = '[][(){}]'
Special = group('[\]?\r?\n', '[:;.,`\f]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Name, Number, Triple, String, Funny)
Token = Ignore + PlainToken

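# Compile the master pattern and the string-terminator patterns under the
# regex module's default syntax, then restore whatever syntax was in effect
# before.  endprogs maps a string opener to the pattern that finds where
# that kind of string ends.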
try:
    save_syntax = regex.set_syntax(0)   # use default syntax
    tokenprog = regex.compile(Token)
    endprogs = {'\'': regex.compile(Single), '"': regex.compile(Double),
                '\'\'\'': regex.compile(Tsingle), '"""': regex.compile(Tdouble)}
finally:
    regex.set_syntax(save_syntax)       # restore original syntax

tabsize = 8
TokenError = 'TokenError'               # string exception raised on premature EOF
def printtoken(type, string, linenum, line, start, end):   # for testing
    print `linenum` + ':', tok_name[type], repr(string)
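
# tokenize(readline, tokeneater) drives the scanner: it reads one line at a
# time with 'readline', tracks indentation, parenthesis nesting, backslash
# continuations, and strings that span lines, and calls 'tokeneater' once
# for every token found.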

def tokenize(readline, tokeneater = printtoken):
    linenum = parenlev = continued = 0
    namechars, numchars = string.letters + '_', string.digits
    contstr = ''
    indents = [0]
    while 1:                                    # loop over lines in stream
        line = readline()
        linenum = linenum + 1
        if line[-2:] == '\r\n': line = line[:-2] + '\n'
        pos, max = 0, len(line)

        if contstr:                             # continued string
            if not line: raise TokenError, "EOF within multi-line string"
            if contstr[-2:] == '\\\n': contstr = contstr[:-2] + '\n'
            if endprog.search(line) >= 0:
                pos = end = endprog.regs[0][1]
                tokeneater(STRING, contstr + line[:end], linenum, line, 0, 0)
                contstr = ''
            else:
                contstr = contstr + line
                continue

        elif parenlev == 0 and not continued:   # this is a new statement
            if not line: break
            column = 0
            while 1:                            # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1) * tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if line[pos] in '#\n': continue     # skip comments or blank lines

            if column > indents[-1]:            # count indents or dedents
                indents.append(column)
                tokeneater(INDENT, '\t', linenum, line, 0, 0)
            while column < indents[-1]:
                indents = indents[:-1]
                tokeneater(DEDENT, '\t', linenum, line, 0, 0)

        else:                                   # continued statement
            if not line: raise TokenError, "EOF within multi-line statement"
            continued = 0

        while pos < max:
            if tokenprog.match(line, pos) > 0:  # scan for tokens
                start, end = tokenprog.regs[3]
                token = line[start:end]
                pos = end

                if token[0] in namechars:       # ordinary name
                    tokeneater(NAME, token, linenum, line, start, end)
                elif token[0] in numchars:      # ordinary number
                    tokeneater(NUMBER, token, linenum, line, start, end)

                elif token in ('\'\'\'', '"""'):        # triple-quoted
                    endprog = endprogs[token]
                    if endprog.search(line, pos) >= 0:  # all on one line
                        pos = endprog.regs[0][1]
                        token = line[start:pos]
                        tokeneater(STRING, token, linenum, line, start, pos)
                    else:
                        contstr = line[start:]          # multiple lines
                        break
                elif token[0] in '\'"':
                    if token[-1] == '\n':               # continued string
                        endprog, contstr = endprogs[token[0]], line[start:]
                        break
                    else:                               # ordinary string
                        tokeneater(STRING, token, linenum, line, start, end)

                elif token[0] == '\n':
                    tokeneater(NEWLINE, token, linenum, line, start, end)
                elif token[0] == '\\':                  # continued stmt
                    continued = 1

                else:
                    if token[0] in '([{': parenlev = parenlev + 1
                    if token[0] in ')]}': parenlev = parenlev - 1
                    tokeneater(OP, token, linenum, line, start, end)
            else:
                tokeneater(ERRORTOKEN, line[pos], linenum, line, pos, pos + 1)
                pos = pos + 1

    for indent in indents[1:]:                  # pop remaining indent levels
        tokeneater(DEDENT, '\t', linenum, line, 0, 0)

if __name__ == '__main__':                      # testing
    import sys
    file = open(sys.argv[-1])                   # the named file, or this script itself if none given
    tokenize(file.readline)