blob: d742899619e475d3137402e210b2297e7f72a742 [file] [log] [blame]
"""Tokenization help for Python programs.

This module compiles a regular expression that recognizes Python
tokens in individual lines of text.  The regular expression handles
everything except indentation, continuations, and triple-quoted
strings.  The function 'tokenize.tokenize()' takes care of these
things for streams of text.  It accepts a readline-like function which
is called repeatedly to come up with the next input line (or "" for
EOF), and a "token-eater" function which is called for each token
found, passing its type, a string containing the token, the line
number, the line, and the starting and ending positions of the token
within the line.  It is designed to match the working of the Python
tokenizer exactly.

"""

# Author/version stamp; this module predates the later __author__ /
# __credits__ conventions.
__version__ = "Ka-Ping Yee, 4 March 1997, updated by GvR, 6 March 1997"
Guido van Rossum4d8e8591992-01-01 19:34:47 +000018
Guido van Rossumfc6f5331997-03-07 00:21:12 +000019import string, regex
20from token import *
Guido van Rossum4d8e8591992-01-01 19:34:47 +000021
Guido van Rossumfc6f5331997-03-07 00:21:12 +000022def group(*choices): return '\(' + string.join(choices, '\|') + '\)'
Guido van Rossum4d8e8591992-01-01 19:34:47 +000023
# The pattern fragments below are written in the old `regex' module's
# default (emacs-like) syntax: \( \) groups, \| alternation -- see
# group() above and the set_syntax(0) call further down.

# Ignorable prefix of a token: blanks, backslash-newline continuations,
# and an optional comment.
Ignore = '[ \f\t]*\([\]\r?\n[ \t]*\)*\(#.*\)?'
Name = '[a-zA-Z_][a-zA-Z0-9_]*'          # identifier

# Integer literals, each with an optional Python 1.x long suffix l/L.
Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
Octnumber = '0[0-7]*[lL]?'
Decnumber = '[1-9][0-9]*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
# Float literals: a pointed form with optional exponent, or a plain
# digit run with a mandatory exponent.
Exponent = '[eE][-+]?[0-9]+'
Pointfloat = group('[0-9]+\.[0-9]*', '\.[0-9]+') + group(Exponent) + '?'
Expfloat = '[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Number = group(Floatnumber, Intnumber)   # floats first so '1.5' is one token

# Patterns locating the closing quote of a string begun on a previous
# line: the quote at start-of-line, or a quote not preceded by a
# backslash.  (Inside [...] a backslash is a literal character in this
# regex syntax, so [^\] means "any character except backslash".)
Single = group('^\'', '[^\]\'')
Double = group('^"', '[^\]"')
Tsingle = group('^\'\'\'', '[^\]\'\'\'')
Tdouble = group('^"""', '[^\]"""')
Triple = group('\'\'\'', '"""')          # opening triple-quote delimiters
# A string on one line: quote, then backslash-escapes or plain chars,
# terminated by the closing quote or by backslash-newline (which means
# the string continues on the next line).
String = group('\'' + group('[\].', '[^\'\]')+ '*' + group('\'', '[\]\n'),
               '"' + group('[\].', '[^"\]') + '*' + group('"', '[\]\n'))

# Longer operators listed before their prefixes so '**' beats '*'.
Operator = group('\+', '\-', '\*\*', '\*', '\^', '~', '/', '%', '&', '|',
                 '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
Bracket = '[][(){}]'
Special = group('[\]?\r?\n', '[:;.,`\f]')   # (escaped) newline, punctuation
Funny = group(Operator, Bracket, Special)

# Master pattern: ignorable prefix, then exactly one real token.  The
# Ignore part contributes regs groups 1-2, so the PlainToken group is
# regs[3] -- tokenize() below depends on that numbering.
PlainToken = group(Name, Number, Triple, String, Funny)
Token = Ignore + PlainToken
Guido van Rossum4d8e8591992-01-01 19:34:47 +000053
# Compile the patterns once at import time.  The old `regex' module's
# syntax setting is process-global state, so force the default
# (emacs-like) syntax while compiling and restore the caller's setting
# afterwards.
try:
    save_syntax = regex.set_syntax(0)           # use default syntax
    tokenprog = regex.compile(Token)
    endprogs = { '\'': regex.compile(Single), '"': regex.compile(Double),
                 '\'\'\'': regex.compile(Tsingle), '"""': regex.compile(Tdouble) }
finally:
    # NOTE(review): if set_syntax(0) itself raised, save_syntax would be
    # unbound here and this cleanup would raise NameError -- confirm
    # set_syntax cannot fail before relying on this.
    regex.set_syntax(save_syntax)               # restore original syntax
Guido van Rossum4d8e8591992-01-01 19:34:47 +000061
# Column width used when expanding tabs while measuring indentation
# (see the column computation in tokenize()).
tabsize = 8
# String exception, the Python 1.x convention; raised by tokenize() on
# EOF inside a multi-line string or statement.
TokenError = 'TokenError'
def printtoken(type, string, linenum, line, start, end): # for testing
    # Default token-eater: print "<lineno>: <type-name> <repr-of-token>".
    # Backquotes are the Python 1.x spelling of repr().
    print `linenum` + ':', tok_name[type], repr(string)
Guido van Rossum4d8e8591992-01-01 19:34:47 +000066
def tokenize(readline, tokeneater = printtoken):
    """Break a stream of Python source text into tokens.

    readline   -- function returning the next input line, or '' at EOF
                  (e.g. the readline method of an open file).
    tokeneater -- called as tokeneater(type, token, linenum, line,
                  start, end) for each token found; defaults to
                  printtoken.

    Raises TokenError (a string exception) on EOF inside a multi-line
    string or a continued statement.
    """
    linenum = parenlev = continued = 0
    namechars, numchars = string.letters + '_', string.digits
    contstr = ''                        # pending text of an unfinished string
    indents = [0]                       # stack of indentation columns seen
    while 1:                            # loop over lines in stream
        line = readline()
        linenum = linenum + 1
        if line[-2:] == '\r\n': line = line[:-2] + '\n'   # normalize CRLF
        pos, max = 0, len(line)

        if contstr:                     # continued string
            if not line: raise TokenError, "EOF within multi-line string"
            if contstr[-2:] == '\\\n': contstr = contstr[:-2] + '\n'
            # endprog was chosen when the string was opened.  The old
            # regex module's search() returns -1 on failure and leaves
            # the match span in the global .regs attribute on success.
            if endprog.search(line) >= 0:
                pos = end = endprog.regs[0][1]
                tokeneater(STRING, contstr + line[:end], linenum, line, 0, 0)
                contstr = ''
            else:
                contstr = contstr + line
            continue

        elif parenlev == 0 and not continued:  # this is a new statement
            if not line: break
            column = 0
            while 1:                    # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1) * tabsize
                elif line[pos] == '\f': column = 0   # form feed resets column
                else: break
                pos = pos + 1
            if line[pos] in '#\n': continue  # skip comments or blank lines

            if column > indents[-1]:    # count indents or dedents
                indents.append(column)
                tokeneater(INDENT, '\t', linenum, line, 0, 0)
            while column < indents[-1]:
                indents = indents[:-1]
                tokeneater(DEDENT, '\t', linenum, line, 0, 0)

        else:                           # continued statement
            if not line: raise TokenError, "EOF within multi-line statement"
            continued = 0

        while pos < max:
            if tokenprog.match(line, pos) > 0:      # scan for tokens
                # regs[3] is the PlainToken group of the Token pattern,
                # i.e. the token text without its ignorable prefix.
                start, end = tokenprog.regs[3]
                token = line[start:end]
                pos = end

                if token[0] in namechars:           # ordinary name
                    tokeneater(NAME, token, linenum, line, start, end)
                elif token[0] in numchars:          # ordinary number
                    tokeneater(NUMBER, token, linenum, line, start, end)

                elif token in ('\'\'\'', '"""'):    # triple-quoted
                    endprog = endprogs[token]
                    if endprog.search(line, pos) >= 0:  # all on one line
                        pos = endprog.regs[0][1]
                        token = line[start:pos]
                        tokeneater(STRING, token, linenum, line, start, pos)
                    else:
                        contstr = line[start:]      # multiple lines
                        break
                elif token[0] in '\'"':
                    if token[-1] == '\n':           # continued string
                        # The String pattern matched up to a
                        # backslash-newline; finish it on later lines.
                        endprog, contstr = endprogs[token[0]], line[start:]
                        break
                    else:                           # ordinary string
                        tokeneater(STRING, token, linenum, line, start, end)

                elif token[0] == '\n':
                    tokeneater(NEWLINE, token, linenum, line, start, end)
                elif token[0] == '\\':              # continued stmt
                    continued = 1

                else:
                    # Operators and brackets.  parenlev feeds the
                    # "new statement" test above, so indentation is not
                    # processed inside open brackets.
                    if token[0] in '([{': parenlev = parenlev + 1
                    if token[0] in ')]}': parenlev = parenlev - 1
                    tokeneater(OP, token, linenum, line, start, end)
            else:
                # Nothing matched at pos: emit one character as an
                # error token and resynchronize on the next character.
                tokeneater(ERRORTOKEN, line[pos], linenum, line, pos, pos + 1)
                pos = pos + 1

    for indent in indents[1:]:          # pop remaining indent levels
        tokeneater(DEDENT, '\t', linenum, line, 0, 0)
153
if __name__ == '__main__':              # testing
    import sys
    # Tokenize the file named by the last command-line argument,
    # printing each token through the default printtoken token-eater.
    fp = open(sys.argv[-1])
    tokenize(fp.readline)