Guido van Rossum | fc6f533 | 1997-03-07 00:21:12 +0000 | [diff] [blame^] | 1 | """tokenize.py (Ka-Ping Yee, 4 March 1997) |
Guido van Rossum | 4d8e859 | 1992-01-01 19:34:47 +0000 | [diff] [blame] | 2 | |
Guido van Rossum | fc6f533 | 1997-03-07 00:21:12 +0000 | [diff] [blame^] | 3 | This module compiles a regular expression that recognizes Python tokens |
| 4 | in individual lines of text. The regular expression handles everything |
| 5 | except indentation, continuations, and triple-quoted strings. The function |
| 6 | 'tokenize.tokenize()' takes care of these things for streams of text. It |
| 7 | accepts a file-like object and a function, uses the readline() method to |
scan the file, and calls the function once for each token found, passing
its type, a string containing the token, the line number, the line,
| 10 | and the starting and ending positions of the token within the line. |
| 11 | It is designed to match the working of the Python tokenizer exactly.""" |
Guido van Rossum | 4d8e859 | 1992-01-01 19:34:47 +0000 | [diff] [blame] | 12 | |
Guido van Rossum | fc6f533 | 1997-03-07 00:21:12 +0000 | [diff] [blame^] | 13 | import string, regex |
| 14 | from token import * |
Guido van Rossum | 4d8e859 | 1992-01-01 19:34:47 +0000 | [diff] [blame] | 15 | |
def group(*choices):
    # Join the alternatives with the emacs-regex alternation operator '\|'
    # and wrap the result in an emacs-regex group '\(...\)'.
    body = string.join(choices, '\|')
    return '\(' + body + '\)'
Guido van Rossum | 4d8e859 | 1992-01-01 19:34:47 +0000 | [diff] [blame] | 17 | |
# All patterns below are written in the old `regex` module's emacs-style
# syntax: \( \) makes a group, \| separates alternatives (see group()).

# Skippable material before a token: horizontal whitespace, any number of
# backslash-continued line endings, and an optional trailing comment.
Ignore = '[ \f\t]*\([\]\r?\n[ \t]*\)*\(#.*\)?'
Name = '[a-zA-Z_][a-zA-Z0-9_]*'         # identifier or keyword

Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'    # 0x... with optional long suffix
Octnumber = '0[0-7]*[lL]?'              # leading 0 is octal (also matches plain '0')
Decnumber = '[1-9][0-9]*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = '[eE][-+]?[0-9]+'
Pointfloat = group('[0-9]+\.[0-9]*', '\.[0-9]+') + group(Exponent) + '?'
Expfloat = '[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
# Floats before ints so '1.5' is one NUMBER, not '1' followed by '.5'.
Number = group(Floatnumber, Intnumber)

# End-of-string finders: match the closing quote either at the start of a
# (continuation) line, or not preceded by a backslash.
# NOTE(review): '[\]' appears to be a one-character class containing a
# backslash under the old regex syntax -- confirm against the regex docs.
Single = group('^\'', '[^\]\'')
Double = group('^"', '[^\]"')
Tsingle = group('^\'\'\'', '[^\]\'\'\'')
Tdouble = group('^"""', '[^\]"""')
Triple = group('\'\'\'', '"""')         # opening triple quotes only
# A complete one-line string, or a string whose line ends with a backslash
# (and therefore continues on the next line).
String = group('\'' + group('[\].', '[^\'\]')+ '*' + group('\'', '[\]\n'),
               '"' + group('[\].', '[^"\]') + '*' + group('"', '[\]\n'))

Operator = group('\+', '\-', '\*\*', '\*', '\^', '~', '/', '%', '&', '|',
                 '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
Bracket = '[][(){}]'
# A (possibly backslash-escaped) newline, or one-character punctuation.
Special = group('[\]?\r?\n', '[:;.,`\f]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Name, Number, Triple, String, Funny)
Token = Ignore + PlainToken             # token preceded by skippable fluff
Guido van Rossum | 4d8e859 | 1992-01-01 19:34:47 +0000 | [diff] [blame] | 47 | |
| 48 | try: |
Guido van Rossum | fc6f533 | 1997-03-07 00:21:12 +0000 | [diff] [blame^] | 49 | save_syntax = regex.set_syntax(0) # use default syntax |
| 50 | tokenprog = regex.compile(Token) |
| 51 | endprogs = { '\'': regex.compile(Single), '"': regex.compile(Double), |
| 52 | '\'\'\'': regex.compile(Tsingle), '"""': regex.compile(Tdouble) } |
Guido van Rossum | 4d8e859 | 1992-01-01 19:34:47 +0000 | [diff] [blame] | 53 | finally: |
Guido van Rossum | fc6f533 | 1997-03-07 00:21:12 +0000 | [diff] [blame^] | 54 | regex.set_syntax(save_syntax) # restore original syntax |
Guido van Rossum | 4d8e859 | 1992-01-01 19:34:47 +0000 | [diff] [blame] | 55 | |
tabsize = 8                 # a tab advances to the next multiple of tabsize columns
TokenError = 'TokenError'   # string exception (pre-class-exception idiom)
| 58 | def printtoken(type, string, linenum, line, start, end): # for testing |
| 59 | print `linenum` + ':', tok_name[type], repr(string) |
Guido van Rossum | 4d8e859 | 1992-01-01 19:34:47 +0000 | [diff] [blame] | 60 | |
def tokenize(readline, tokeneater = printtoken):
    """Read lines via readline() and call tokeneater() once per token.

    tokeneater is called as tokeneater(type, token_string, linenum, line,
    start, end).  State carried across lines: the indentation stack
    (indents), bracket nesting depth (parenlev), a pending backslash
    continuation (continued), and an unfinished multi-line string
    (contstr, with endprog holding the pattern that finds its end).
    """
    linenum = parenlev = continued = 0
    namechars, numchars = string.letters + '_', string.digits
    contstr = ''                        # accumulated text of an open string
    indents = [0]                       # indentation columns; 0 at the bottom
    while 1:                            # loop over lines in stream
        line = readline()               # '' signals end of file
        linenum = linenum + 1
        # Normalize DOS line endings so '\r' never reaches the token scan.
        if line[-2:] == '\r\n': line = line[:-2] + '\n'
        pos, max = 0, len(line)

        if contstr:                     # continued string
            if not line: raise TokenError, "EOF within multi-line string"
            # A trailing backslash-newline continues the string.
            if contstr[-2:] == '\\\n': contstr = contstr[:-2] + '\n'
            # Old regex module: search() returns the match position or -1;
            # regs[0] is the (start, end) span of the whole match.
            if endprog.search(line) >= 0:
                pos = end = endprog.regs[0][1]
                tokeneater(STRING, contstr + line[:end], linenum, line, 0, 0)
                contstr = ''
            else:
                contstr = contstr + line
                continue

        elif parenlev == 0 and not continued: # this is a new statement
            if not line: break          # EOF between statements: done
            column = 0
            while 1:                    # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1) * tabsize
                elif line[pos] == '\f': column = 0  # formfeed resets the column
                else: break
                pos = pos + 1
            if line[pos] in '#\n': continue # skip comments or blank lines

            if column > indents[-1]:    # count indents or dedents
                indents.append(column)
                tokeneater(INDENT, '\t', linenum, line, 0, 0)
            while column < indents[-1]: # one DEDENT per level being closed
                indents = indents[:-1]
                tokeneater(DEDENT, '\t', linenum, line, 0, 0)

        else:                           # continued statement
            if not line: raise TokenError, "EOF within multi-line statement"
            continued = 0

        while pos < max:
            # match() returns the length matched at pos, or -1 on failure.
            if tokenprog.match(line, pos) > 0: # scan for tokens
                # regs[3] spans the PlainToken group (the two groups inside
                # Ignore are 1 and 2), i.e. the token without leading fluff.
                start, end = tokenprog.regs[3]
                token = line[start:end]
                pos = end

                if token[0] in namechars: # ordinary name
                    tokeneater(NAME, token, linenum, line, start, end)
                elif token[0] in numchars: # ordinary number
                    tokeneater(NUMBER, token, linenum, line, start, end)

                elif token in ('\'\'\'', '"""'): # triple-quoted
                    endprog = endprogs[token]
                    if endprog.search(line, pos) >= 0: # all on one line
                        pos = endprog.regs[0][1]
                        # NOTE(review): passes only the opening quotes, not
                        # the full string text -- confirm this is intended.
                        tokeneater(STRING, token, linenum, line, start, pos)
                    else:
                        contstr = line[start:]  # multiple lines
                        break
                elif token[0] in '\'"':
                    if token[-1] == '\n':   # continued string
                        endprog, contstr = endprogs[token[0]], line[start:]
                        break
                    else:                   # ordinary string
                        tokeneater(STRING, token, linenum, line, start, end)

                elif token[0] == '\n':
                    tokeneater(NEWLINE, token, linenum, line, start, end)
                elif token[0] == '\\':      # continued stmt
                    continued = 1

                else:
                    if token[0] in '([{': parenlev = parenlev + 1
                    if token[0] in ')]}': parenlev = parenlev - 1
                    tokeneater(OP, token, linenum, line, start, end)
            else:
                # Nothing matches here: report one character and move on.
                tokeneater(ERRORTOKEN, line[pos], linenum, line, pos, pos + 1)
                pos = pos + 1

    for indent in indents[1:]:          # pop remaining indent levels
        tokeneater(DEDENT, '\t', linenum, line, 0, 0)
| 146 | |
| 147 | if __name__ == '__main__': # testing |
| 148 | import sys |
| 149 | file = open(sys.argv[-1]) |
| 150 | tokenize(file.readline) |