"""Tokenization help for Python programs.

This module compiles a regular expression that recognizes Python
tokens in individual lines of text.  The regular expression handles
everything except indentation, continuations, and triple-quoted
strings.  The function 'tokenize.tokenize()' takes care of these
things for streams of text.  It accepts a readline-like function which
is called repeatedly to come up with the next input line (or "" for
EOF), and a "token-eater" function which is called for each token
found, passing its type, a string containing the token, the line
number, the line, and the starting and ending positions of the token
within the line.  It is designed to match the working of the Python
tokenizer exactly.

"""
| 16 | |
# Authorship and revision note; a descriptive string, not a version number.
__version__ = "Ka-Ping Yee, 4 March 1997, updated by GvR, 10 March 1997"
Guido van Rossum | 4d8e859 | 1992-01-01 19:34:47 +0000 | [diff] [blame] | 18 | |
Guido van Rossum | fc6f533 | 1997-03-07 00:21:12 +0000 | [diff] [blame] | 19 | import string, regex |
| 20 | from token import * |
Guido van Rossum | 4d8e859 | 1992-01-01 19:34:47 +0000 | [diff] [blame] | 21 | |
def group(*choices):
    """Return a pattern matching any one of choices, wrapped in a \\( \\) group.

    The alternatives are separated by \\| and kept in the order given.
    """
    alternation = ''
    separator = ''
    for choice in choices:
        alternation = alternation + separator + choice
        separator = '\\|'               # inserted before every choice but the first
    return '\\(' + alternation + '\\)'
Guido van Rossum | 4d8e859 | 1992-01-01 19:34:47 +0000 | [diff] [blame] | 23 | |
# Pattern building blocks.  They are written for the syntax selected by
# regex.set_syntax(0) when they are compiled: \( \) delimit a group and
# \| separates alternatives (see group()), and '[\]' is a character
# class holding a single backslash.

# Text skipped in front of every token: blanks, any number of
# backslash-newline continuations, and an optional comment.  It
# contains two groups of its own, which makes PlainToken group 3 of
# Token below.
Ignore = '[ \f\t]*\([\]\r?\n[ \t]*\)*\(#.*\)?'
Name = '[a-zA-Z_][a-zA-Z0-9_]*'

# Numeric literals.  Integers may carry a long suffix (l/L); decimal
# integers and floats may carry an imaginary suffix (j/J).
ImagZero = '0[jJ]' # This is not caught by any of the following
Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
Octnumber = '0[0-7]*[lL]?'
Decnumber = '[1-9][0-9]*[lLjJ]?'
Intnumber = group(ImagZero, Hexnumber, Octnumber, Decnumber)
Exponent = '[eE][-+]?[0-9]+'
Pointfloat = group('[0-9]+\.[0-9]*', '\.[0-9]+') + group(Exponent) + '?'
Expfloat = '[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat) + "[jJ]?"
# Floats are tried before integers so that '1.5' is not split after '1'.
Number = group(Floatnumber, Intnumber)

# Closing-quote patterns, searched for on the lines that follow an
# unfinished string: the quote either starts the line or follows a
# character that is not a backslash.
Single = group('^\'', '[^\]\'')
Double = group('^"', '[^\]"')
Tsingle = group('^\'\'\'', '[^\]\'\'\'')
Tdouble = group('^"""', '[^\]"""')
# Opening triple quote only; tokenize() locates the matching end itself.
Triple = group('\'\'\'', '"""')
# A single- or double-quoted string that either closes on this line or
# ends in backslash-newline (and so continues on the next line).
String = group('\'' + group('[\].', '[^\'\]')+ '*' + group('\'', '[\]\n'),
               '"' + group('[\].', '[^"\]') + '*' + group('"', '[\]\n'))

# Longer operators are listed ahead of their one-character prefixes
# ('**' before '*', '<<' before '<', '==' before '=').
Operator = group('\+', '\-', '\*\*', '\*', '\^', '~', '/', '%', '&', '|',
                 '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
Bracket = '[][(){}]'
# Newline (optionally preceded by a backslash and/or CR) and the
# remaining one-character punctuation.
Special = group('[\]?\r?\n', '[:;.,`\f]')
Funny = group(Operator, Bracket, Special)

# Triple comes before String so that ''' and """ are not read as an
# empty string followed by a stray quote.
PlainToken = group(Name, Number, Triple, String, Funny)
Token = Ignore + PlainToken
Guido van Rossum | 4d8e859 | 1992-01-01 19:34:47 +0000 | [diff] [blame] | 54 | |
# Compile the patterns under the regex module's default syntax.  The
# previous syntax is captured *before* entering the try block so that
# the finally clause always has a value to restore; if set_syntax itself
# failed, there would be nothing to undo and the error is left visible.
save_syntax = regex.set_syntax(0) # use default syntax
try:
    # Matches one token, with leading ignorable text, at a given position.
    tokenprog = regex.compile(Token)
    # Maps an opening quote sequence to the program that finds its end.
    endprogs = { '\'': regex.compile(Single), '"': regex.compile(Double),
                 '\'\'\'': regex.compile(Tsingle), '"""': regex.compile(Tdouble) }
finally:
    regex.set_syntax(save_syntax) # restore original syntax
Guido van Rossum | 4d8e859 | 1992-01-01 19:34:47 +0000 | [diff] [blame] | 62 | |
# A tab moves the indentation column to the next multiple of tabsize.
tabsize = 8
# String exception raised by tokenize() when the input ends inside a
# multi-line string or a continued statement.
TokenError = 'TokenError'
| 65 | def printtoken(type, string, linenum, line, start, end): # for testing |
| 66 | print `linenum` + ':', tok_name[type], repr(string) |
Guido van Rossum | 4d8e859 | 1992-01-01 19:34:47 +0000 | [diff] [blame] | 67 | |
Guido van Rossum | fc6f533 | 1997-03-07 00:21:12 +0000 | [diff] [blame] | 68 | def tokenize(readline, tokeneater = printtoken): |
| 69 | linenum = parenlev = continued = 0 |
| 70 | namechars, numchars = string.letters + '_', string.digits |
| 71 | contstr = '' |
| 72 | indents = [0] |
| 73 | while 1: # loop over lines in stream |
| 74 | line = readline() |
| 75 | linenum = linenum + 1 |
| 76 | if line[-2:] == '\r\n': line = line[:-2] + '\n' |
| 77 | pos, max = 0, len(line) |
| 78 | |
| 79 | if contstr: # continued string |
| 80 | if not line: raise TokenError, "EOF within multi-line string" |
| 81 | if contstr[-2:] == '\\\n': contstr = contstr[:-2] + '\n' |
| 82 | if endprog.search(line) >= 0: |
| 83 | pos = end = endprog.regs[0][1] |
| 84 | tokeneater(STRING, contstr + line[:end], linenum, line, 0, 0) |
| 85 | contstr = '' |
| 86 | else: |
| 87 | contstr = contstr + line |
| 88 | continue |
| 89 | |
| 90 | elif parenlev == 0 and not continued: # this is a new statement |
| 91 | if not line: break |
| 92 | column = 0 |
| 93 | while 1: # measure leading whitespace |
| 94 | if line[pos] == ' ': column = column + 1 |
| 95 | elif line[pos] == '\t': column = (column/tabsize + 1) * tabsize |
| 96 | elif line[pos] == '\f': column = 0 |
| 97 | else: break |
| 98 | pos = pos + 1 |
| 99 | if line[pos] in '#\n': continue # skip comments or blank lines |
| 100 | |
| 101 | if column > indents[-1]: # count indents or dedents |
| 102 | indents.append(column) |
| 103 | tokeneater(INDENT, '\t', linenum, line, 0, 0) |
| 104 | while column < indents[-1]: |
| 105 | indents = indents[:-1] |
| 106 | tokeneater(DEDENT, '\t', linenum, line, 0, 0) |
| 107 | |
| 108 | else: # continued statement |
| 109 | if not line: raise TokenError, "EOF within multi-line statement" |
| 110 | continued = 0 |
| 111 | |
| 112 | while pos < max: |
| 113 | if tokenprog.match(line, pos) > 0: # scan for tokens |
| 114 | start, end = tokenprog.regs[3] |
| 115 | token = line[start:end] |
| 116 | pos = end |
| 117 | |
| 118 | if token[0] in namechars: # ordinary name |
| 119 | tokeneater(NAME, token, linenum, line, start, end) |
| 120 | elif token[0] in numchars: # ordinary number |
| 121 | tokeneater(NUMBER, token, linenum, line, start, end) |
| 122 | |
| 123 | elif token in ('\'\'\'', '"""'): # triple-quoted |
| 124 | endprog = endprogs[token] |
| 125 | if endprog.search(line, pos) >= 0: # all on one line |
| 126 | pos = endprog.regs[0][1] |
Guido van Rossum | b51eaa1 | 1997-03-07 00:21:55 +0000 | [diff] [blame] | 127 | token = line[start:pos] |
Guido van Rossum | fc6f533 | 1997-03-07 00:21:12 +0000 | [diff] [blame] | 128 | tokeneater(STRING, token, linenum, line, start, pos) |
| 129 | else: |
| 130 | contstr = line[start:] # multiple lines |
| 131 | break |
| 132 | elif token[0] in '\'"': |
| 133 | if token[-1] == '\n': # continued string |
| 134 | endprog, contstr = endprogs[token[0]], line[start:] |
| 135 | break |
| 136 | else: # ordinary string |
| 137 | tokeneater(STRING, token, linenum, line, start, end) |
| 138 | |
| 139 | elif token[0] == '\n': |
| 140 | tokeneater(NEWLINE, token, linenum, line, start, end) |
| 141 | elif token[0] == '\\': # continued stmt |
| 142 | continued = 1 |
| 143 | |
| 144 | else: |
| 145 | if token[0] in '([{': parenlev = parenlev + 1 |
| 146 | if token[0] in ')]}': parenlev = parenlev - 1 |
| 147 | tokeneater(OP, token, linenum, line, start, end) |
| 148 | else: |
| 149 | tokeneater(ERRORTOKEN, line[pos], linenum, line, pos, pos + 1) |
| 150 | pos = pos + 1 |
| 151 | |
| 152 | for indent in indents[1:]: # pop remaining indent levels |
| 153 | tokeneater(DEDENT, '\t', linenum, line, 0, 0) |
| 154 | |
if __name__ == '__main__':                     # testing
    import sys
    # Tokenize the file named by the last command-line argument and
    # print every token with the default token-eater.
    file = open(sys.argv[-1])
    try:
        tokenize(file.readline)
    finally:
        file.close()            # release the handle even if tokenize raises