"""Tokenization help for Python programs.

This module exports a function called 'tokenize()' that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF) and a "token-eater"
function which is called once for each token found. The latter function is
passed the token type, a string containing the token, the starting and
ending (row, column) coordinates of the token, and the original line. It is
designed to match the working of the Python tokenizer exactly, except that
it produces COMMENT tokens for comments and gives type OP for all operators.

For compatibility with the older 'tokenize' module, this also compiles a
regular expression into 'tokenprog' that matches Python tokens in individual
lines of text, leaving the token in 'tokenprog.group(3)', but does not
handle indentation, continuations, or multi-line strings."""
Guido van Rossumb51eaa11997-03-07 00:21:55 +000016
# Author and date stamp of this revision of the module.
__version__ = "Ka-Ping Yee, 29 March 1997"

import string, regex
from token import *

# COMMENT is an extra token type that this module emits for comments.  It
# takes the number held in N_TOKENS (imported from the token module) and is
# registered in tok_name so that it can be printed by name like other types.
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'

# Changes from 1.3:
# Ignore now accepts \f as whitespace. Operator now includes '**'.
# Ignore and Special now accept \n or \r\n at the end of a line.
# Imagnumber is new. Expfloat is corrected to reject '0e4'.
# Note: to get a quoted backslash in a regex, it must be enclosed in brackets.
30
def group(*choices):
    """Return a regex fragment matching any one of the given alternatives.

    The alternatives are joined with '\\|' and the result is wrapped in
    '\\(' ... '\\)', i.e. the grouping/alternation spelling used by the
    patterns in this module.  Each call therefore adds one numbered group
    to the pattern it is used in.
    """
    # str.join replaces the deprecated string.join(); the resulting text is
    # identical.  Backslashes are doubled so the literals do not depend on
    # Python passing unknown escape sequences through unchanged.
    return '\\(' + '\\|'.join(choices) + '\\)'
Guido van Rossum4d8e8591992-01-01 19:34:47 +000032
# The patterns below are plain strings that are later handed to
# regex.compile().  They use group() for alternation, and a literal
# backslash is always spelled '[\]' (a bracket expression), never '\\'.

# Horizontal whitespace, a comment running to end of line, and the skippable
# prefix of a token: whitespace, any backslash-newline continuations, and an
# optional comment.
Whitespace = '[ \f\t]*'
Comment = '\(#[^\r\n]*\)'
Ignore = Whitespace + group('[\]\r?\n' + Whitespace)+'*' + Comment+'?'
Name = '[a-zA-Z_][a-zA-Z0-9_]*'

# Numeric literals.  Integers may carry an 'l'/'L' suffix; Expfloat requires
# a non-zero leading digit (so '0e4' is rejected); imaginary numbers end in
# 'j'/'J'.  Number tries Imagnumber, then Floatnumber, then Intnumber.
Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
Octnumber = '0[0-7]*[lL]?'
Decnumber = '[1-9][0-9]*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = '[eE][-+]?[0-9]+'
Pointfloat = group('[0-9]+\.[0-9]*', '\.[0-9]+') + group(Exponent) + '?'
Expfloat = '[1-9][0-9]*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group('0[jJ]', '[1-9][0-9]*[jJ]', Floatnumber + '[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# String *tails*: each of these matches the rest of a string up to and
# including its closing quote(s), treating backslash + any character as a
# unit.  They are compiled into 'endprogs' and applied after the opening
# quote has already been consumed.
Single = group("[^'\]", "[\].") + "*'"
Double = group('[^"\]', '[\].') + '*"'
Single3 = group("[^'\]","[\].","'[^'\]","'[\].","''[^'\]","''[\].") + "*'''"
Double3 = group('[^"\]','[\].','"[^"\]','"[\].','""[^"\]','""[\].') + '*"""'
# An opening triple quote, and a complete single-line string (quotes included).
Triple = group("'''", '"""')
String = group("'" + group("[^\n'\]", "[\].") + "*'",
               '"' + group('[^\n"\]', '[\].') + '*"')

# Operators, brackets and other punctuation.  Longer operators are listed
# before their prefixes ('\*\*' before '\*', '<<' before '<', and so on).
Operator = group('\+', '\-', '\*\*', '\*', '\^', '~', '/', '%', '&', '|',
                 '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
Bracket = '[][(){}]'
Special = group('\r?\n', '[:;.,`]')
Funny = group(Operator, Bracket, Special)

# Single-line token pattern kept for the older interface ('tokenprog').
PlainToken = group(Name, Number, String, Funny)
Token = Ignore + PlainToken

# Patterns used by tokenize().  ContStr matches a one-quote string that either
# closes on this line or ends in backslash-newline (to be continued on the
# next line).  PseudoToken skips leading whitespace and then matches a line
# continuation, a comment, an opening triple quote, or an ordinary token.
ContStr = group("'" + group('[\].', "[^\n'\]")+'*' + group("'", '[\]\r?\n'),
                '"' + group('[\].', '[^\n"\]')+'*' + group('"', '[\]\r?\n'))
PseudoExtras = group('[\]\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Name, Number, ContStr, Funny)
70
# Compile the patterns under the regex module's default syntax, then put back
# whatever syntax was in effect before, even if compilation fails.
# set_syntax() is called *before* the try block: if it were to raise inside
# the try, the finally clause would fail on the unbound 'saved_syntax' and
# hide the real error.
saved_syntax = regex.set_syntax(0)             # use default syntax
try:
    tokenprog = regex.compile(Token)
    pseudoprog = regex.compile(PseudoToken)
    # Maps an opening quote (single or triple) to the compiled pattern that
    # matches the remainder of that kind of string.
    endprogs = { '\'': regex.compile(Single), '"': regex.compile(Double),
        '\'\'\'': regex.compile(Single3), '"""': regex.compile(Double3) }
finally:
    regex.set_syntax(saved_syntax)             # restore original syntax
Guido van Rossum4d8e8591992-01-01 19:34:47 +000079
# Number of columns per tab stop when measuring indentation.
tabsize = 8

class TokenError(Exception):
    """Raised by tokenize() when the input ends inside a multi-line string
    or a multi-line statement.

    The arguments are a message and a (row, column) position.  This is a
    class rather than a string exception; 'raise TokenError, (msg, pos)'
    and 'except TokenError' continue to work unchanged.
    """
    pass
def printtoken(type, token, start, end, line): # for testing
    """Default token-eater: print one token on standard output.

    type  -- numeric token type, looked up in tok_name for display
    token -- the token text (printed via repr())
    start -- (row, column) pair where the token begins
    end   -- (row, column) pair where the token ends
    line  -- the source line containing the token (accepted but not printed)

    Output format: "srow,scol-erow,ecol:<TAB>TYPENAME<TAB>repr(token)".
    """
    # The coordinate pairs are unpacked here rather than in the signature:
    # tuple parameters are not accepted by newer interpreters, and callers
    # pass the pairs positionally either way.
    srow, scol = start
    erow, ecol = end
    # A single parenthesised expression prints identically whether 'print'
    # is a statement or a function.
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))
Guido van Rossum4d8e8591992-01-01 19:34:47 +000085
Guido van Rossum1aec3231997-04-08 14:24:39 +000086def tokenize(readline, tokeneater=printtoken):
87 lnum = parenlev = continued = 0
Guido van Rossumfc6f5331997-03-07 00:21:12 +000088 namechars, numchars = string.letters + '_', string.digits
Guido van Rossumde655271997-04-09 17:15:54 +000089 contstr, needcont = '', 0
Guido van Rossumfc6f5331997-03-07 00:21:12 +000090 indents = [0]
Guido van Rossum1aec3231997-04-08 14:24:39 +000091
Guido van Rossumfc6f5331997-03-07 00:21:12 +000092 while 1: # loop over lines in stream
93 line = readline()
Guido van Rossum1aec3231997-04-08 14:24:39 +000094 lnum = lnum + 1
Guido van Rossumfc6f5331997-03-07 00:21:12 +000095 pos, max = 0, len(line)
96
97 if contstr: # continued string
Guido van Rossumde655271997-04-09 17:15:54 +000098 if not line:
99 raise TokenError, ("EOF in multi-line string", strstart)
100 if endprog.match(line) >= 0:
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000101 pos = end = endprog.regs[0][1]
Guido van Rossum1aec3231997-04-08 14:24:39 +0000102 tokeneater(STRING, contstr + line[:end],
103 strstart, (lnum, end), line)
Guido van Rossumde655271997-04-09 17:15:54 +0000104 contstr, needcont = '', 0
105 elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
106 tokeneater(ERRORTOKEN, contstr + line,
107 strstart, (lnum, len(line)), line)
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000108 contstr = ''
Guido van Rossumde655271997-04-09 17:15:54 +0000109 continue
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000110 else:
111 contstr = contstr + line
112 continue
113
Guido van Rossum1aec3231997-04-08 14:24:39 +0000114 elif parenlev == 0 and not continued: # new statement
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000115 if not line: break
116 column = 0
Guido van Rossum1aec3231997-04-08 14:24:39 +0000117 while pos < max: # measure leading whitespace
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000118 if line[pos] == ' ': column = column + 1
Guido van Rossum1aec3231997-04-08 14:24:39 +0000119 elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000120 elif line[pos] == '\f': column = 0
121 else: break
122 pos = pos + 1
Guido van Rossumde655271997-04-09 17:15:54 +0000123 if pos == max: break
Guido van Rossum1aec3231997-04-08 14:24:39 +0000124
125 if line[pos] in '#\r\n': # skip comments or blank lines
126 tokeneater((NEWLINE, COMMENT)[line[pos] == '#'], line[pos:],
127 (lnum, pos), (lnum, len(line)), line)
128 continue
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000129
130 if column > indents[-1]: # count indents or dedents
131 indents.append(column)
Guido van Rossum1aec3231997-04-08 14:24:39 +0000132 tokeneater(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000133 while column < indents[-1]:
134 indents = indents[:-1]
Guido van Rossumde655271997-04-09 17:15:54 +0000135 tokeneater(DEDENT, '', (lnum, pos), (lnum, pos), line)
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000136
137 else: # continued statement
Guido van Rossumde655271997-04-09 17:15:54 +0000138 if not line:
139 raise TokenError, ("EOF in multi-line statement", (lnum, 0))
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000140 continued = 0
141
142 while pos < max:
Guido van Rossum1aec3231997-04-08 14:24:39 +0000143 if pseudoprog.match(line, pos) > 0: # scan for tokens
144 start, end = pseudoprog.regs[1]
Guido van Rossumde655271997-04-09 17:15:54 +0000145 spos, epos, pos = (lnum, start), (lnum, end), end
Guido van Rossum1aec3231997-04-08 14:24:39 +0000146 token, initial = line[start:end], line[start]
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000147
Guido van Rossum1aec3231997-04-08 14:24:39 +0000148 if initial in namechars: # ordinary name
149 tokeneater(NAME, token, spos, epos, line)
Guido van Rossumde655271997-04-09 17:15:54 +0000150 elif initial in numchars \
151 or (initial == '.' and token != '.'): # ordinary number
Guido van Rossum1aec3231997-04-08 14:24:39 +0000152 tokeneater(NUMBER, token, spos, epos, line)
153 elif initial in '\r\n':
154 tokeneater(NEWLINE, token, spos, epos, line)
155 elif initial == '#':
156 tokeneater(COMMENT, token, spos, epos, line)
157 elif initial == '\\': # continued stmt
158 continued = 1
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000159 elif token in ('\'\'\'', '"""'): # triple-quoted
160 endprog = endprogs[token]
Guido van Rossumde655271997-04-09 17:15:54 +0000161 if endprog.match(line, pos) >= 0: # all on one line
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000162 pos = endprog.regs[0][1]
Guido van Rossum1aec3231997-04-08 14:24:39 +0000163 token = line[start:pos]
164 tokeneater(STRING, token, spos, (lnum, pos), line)
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000165 else:
Guido van Rossum1aec3231997-04-08 14:24:39 +0000166 strstart = (lnum, start) # multiple lines
167 contstr = line[start:]
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000168 break
Guido van Rossum1aec3231997-04-08 14:24:39 +0000169 elif initial in '\'"':
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000170 if token[-1] == '\n': # continued string
Guido van Rossum1aec3231997-04-08 14:24:39 +0000171 strstart = (lnum, start)
Guido van Rossumde655271997-04-09 17:15:54 +0000172 endprog = endprogs[initial]
173 contstr, needcont = line[start:], 1
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000174 break
175 else: # ordinary string
Guido van Rossum1aec3231997-04-08 14:24:39 +0000176 tokeneater(STRING, token, spos, epos, line)
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000177 else:
Guido van Rossum1aec3231997-04-08 14:24:39 +0000178 if initial in '([{': parenlev = parenlev + 1
179 elif initial in ')]}': parenlev = parenlev - 1
180 tokeneater(OP, token, spos, epos, line)
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000181 else:
Guido van Rossumde655271997-04-09 17:15:54 +0000182 tokeneater(ERRORTOKEN, line[pos],
183 (lnum, pos), (lnum, pos+1), line)
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000184 pos = pos + 1
185
186 for indent in indents[1:]: # pop remaining indent levels
Guido van Rossum1aec3231997-04-08 14:24:39 +0000187 tokeneater(DEDENT, '', (lnum, 0), (lnum, 0), '')
Guido van Rossumde655271997-04-09 17:15:54 +0000188 tokeneater(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000189
if __name__ == '__main__':                     # testing
    # Tokenize the file named by the first command-line argument, or
    # standard input when no argument is given, printing every token with
    # the default token-eater.
    import sys
    cmdline_args = sys.argv[1:]
    if cmdline_args:
        source = open(cmdline_args[0])
    else:
        source = sys.stdin
    tokenize(source.readline)