Blame - Lib/tokenize.py - platform/external/python/cpython2

Guido van Rossum

b51eaa1

1997-03-07 00:21:55 +0000

[diff] [blame]

1

"""Tokenization help for Python programs.

This module compiles a regular expression that recognizes Python
tokens in individual lines of text. The regular expression handles
everything except indentation, continuations, and triple-quoted
strings. The function 'tokenize.tokenize()' takes care of these
things for streams of text. It accepts a readline-like function which
is called repeatedly to come up with the next input line (or "" for
EOF), and a "token-eater" function which is called for each token
found, passing its type, a string containing the token, the line
number, the line, and the starting and ending positions of the token
within the line. It is designed to match the working of the Python
tokenizer exactly.

"""

Guido van Rossum

1997-03-10 23:17:01 +0000

[diff] [blame]

17

__version__ = "Ka-Ping Yee, 4 March 1997, updated by GvR, 10 March 1997"

Guido van Rossum

4d8e859

1992-01-01 19:34:47 +0000

[diff] [blame]

18

Guido van Rossum

fc6f533

1997-03-07 00:21:12 +0000

[diff] [blame]

19

import string, regex

20

from token import *

Guido van Rossum

4d8e859

1992-01-01 19:34:47 +0000

[diff] [blame]

21

Guido van Rossum

fc6f533

1997-03-07 00:21:12 +0000

[diff] [blame]

22

def group(*choices):
    """Return one regex group that matches any of the given patterns.

    The patterns are joined with the alternation operator and wrapped in
    grouping parentheses, both spelled with a leading backslash as in
    the patterns this module builds.
    """
    alternatives = string.join(choices, '\|')
    return '\(' + alternatives + '\)'

Guido van Rossum

4d8e859

1992-01-01 19:34:47 +0000

[diff] [blame]

23

Guido van Rossum

fc6f533

1997-03-07 00:21:12 +0000

[diff] [blame]

24

# --- Pattern fragments -------------------------------------------------
# All patterns use backslashed \( \| \) for grouping and alternation (see
# group()); a bare '|' or '(' is therefore a literal character.

# Text skipped in front of every token: blanks, any number of
# backslash-newline pairs (each followed by optional blanks), and an
# optional comment.  This contributes groups 1 and 2 of Token, so the
# token proper (PlainToken) is group 3.
Ignore = '[ \f\t]*\([\]\r?\n[ \t]*\)*\(#.*\)?'

Name = '[a-zA-Z_][a-zA-Z0-9_]*'

# Numbers.
ImagZero = '0[jJ]' # This is not caught by any of the following
Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
Octnumber = '0[0-7]*[lL]?'
Decnumber = '[1-9][0-9]*[lLjJ]?'
Intnumber = group(ImagZero, Hexnumber, Octnumber, Decnumber)

Exponent = '[eE][-+]?[0-9]+'
Pointfloat = group('[0-9]+\.[0-9]*', '\.[0-9]+') + group(Exponent) + '?'
Expfloat = '[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat) + "[jJ]?"

Number = group(Floatnumber, Intnumber)

# End-of-string patterns: the closing quote(s), either at the very start
# of the text searched or preceded by a character that is not a backslash.
Single = group("^'", "[^\]'")
Double = group('^"', '[^\]"')
Tsingle = group("^'''", "[^\]'''")
Tdouble = group('^"""', '[^\]"""')

# Opening triple quote, and a complete single-quoted string (which may
# instead end in backslash-newline when it continues on the next line).
Triple = group("'''", '"""')
String = group("'" + group('[\].', "[^'\]") + '*' + group("'", '[\]\n'),
               '"' + group('[\].', '[^"\]') + '*' + group('"', '[\]\n'))

# Operators, brackets and other punctuation (including the newline, with
# an optional backslash in front of it).
Operator = group('\+', '\-', '\*\*', '\*', '\^', '~', '/', '%', '&', '|',
                 '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
Bracket = '[][(){}]'
Special = group('[\]?\r?\n', '[:;.,`\f]')
Funny = group(Operator, Bracket, Special)

# A token is any plain token, preceded by ignorable text.
PlainToken = group(Name, Number, Triple, String, Funny)
Token = Ignore + PlainToken

Guido van Rossum

4d8e859

1992-01-01 19:34:47 +0000

[diff] [blame]

54

55

# Compile the patterns under regex syntax 0 and put the caller's syntax
# back afterwards.  The previous syntax is saved *before* entering the
# try block so that the finally clause can always refer to it.
save_syntax = regex.set_syntax(0)              # use default syntax
try:
    tokenprog = regex.compile(Token)
    # Maps an opening quote to the program that finds the matching close.
    endprogs = { '\'': regex.compile(Single), '"': regex.compile(Double),
        '\'\'\'': regex.compile(Tsingle), '"""': regex.compile(Tdouble) }
finally:
    regex.set_syntax(save_syntax)              # restore original syntax

tabsize = 8                     # a tab advances to the next multiple of this
TokenError = 'TokenError'       # string exception raised by tokenize()

65

def printtoken(type, string, linenum, line, start, end): # for testing

66

print `linenum` + ':', tok_name[type], repr(string)

Guido van Rossum

4d8e859

1992-01-01 19:34:47 +0000

[diff] [blame]

67

Guido van Rossum

fc6f533

1997-03-07 00:21:12 +0000

[diff] [blame]

68

def tokenize(readline, tokeneater = printtoken):

69

linenum = parenlev = continued = 0

70

namechars, numchars = string.letters + '_', string.digits

71

contstr = ''

72

indents = [0]

73

while 1: # loop over lines in stream

74

line = readline()

75

linenum = linenum + 1

76

if line[-2:] == '\r\n': line = line[:-2] + '\n'

77

pos, max = 0, len(line)

78

79

if contstr: # continued string

80

if not line: raise TokenError, "EOF within multi-line string"

81

if contstr[-2:] == '\\\n': contstr = contstr[:-2] + '\n'

82

if endprog.search(line) >= 0:

83

pos = end = endprog.regs[0][1]

84

tokeneater(STRING, contstr + line[:end], linenum, line, 0, 0)

85

contstr = ''

86

else:

87

contstr = contstr + line

88

continue

89

90

elif parenlev == 0 and not continued: # this is a new statement

91

if not line: break

92

column = 0

93

while 1: # measure leading whitespace

94

if line[pos] == ' ': column = column + 1

95

elif line[pos] == '\t': column = (column/tabsize + 1) * tabsize

96

elif line[pos] == '\f': column = 0

97

else: break

98

pos = pos + 1

99

if line[pos] in '#\n': continue # skip comments or blank lines

100

101

if column > indents[-1]: # count indents or dedents

102

indents.append(column)

103

tokeneater(INDENT, '\t', linenum, line, 0, 0)

104

while column < indents[-1]:

105

indents = indents[:-1]

106

tokeneater(DEDENT, '\t', linenum, line, 0, 0)

107

108

else: # continued statement

109

if not line: raise TokenError, "EOF within multi-line statement"

continued = 0

while pos < max:

if tokenprog.match(line, pos) > 0: # scan for tokens

114

start, end = tokenprog.regs[3]

115

token = line[start:end]

116

pos = end

117

118

if token[0] in namechars: # ordinary name

119

tokeneater(NAME, token, linenum, line, start, end)

120

elif token[0] in numchars: # ordinary number

121

tokeneater(NUMBER, token, linenum, line, start, end)

122

123

elif token in ('\'\'\'', '"""'): # triple-quoted

124

endprog = endprogs[token]

125

if endprog.search(line, pos) >= 0: # all on one line

126

pos = endprog.regs[0][1]

Guido van Rossum

b51eaa1

1997-03-07 00:21:55 +0000

[diff] [blame]

127

token = line[start:pos]

Guido van Rossum

fc6f533

1997-03-07 00:21:12 +0000

[diff] [blame]

128

tokeneater(STRING, token, linenum, line, start, pos)

129

else:

130

contstr = line[start:] # multiple lines

131

break

132

elif token[0] in '\'"':

133

if token[-1] == '\n': # continued string

134

endprog, contstr = endprogs[token[0]], line[start:]

135

break

136

else: # ordinary string

137

tokeneater(STRING, token, linenum, line, start, end)

138

139

elif token[0] == '\n':

140

tokeneater(NEWLINE, token, linenum, line, start, end)

141

elif token[0] == '\\': # continued stmt

continued = 1

else:

if token[0] in '([{': parenlev = parenlev + 1

146

if token[0] in ')]}': parenlev = parenlev - 1

147

tokeneater(OP, token, linenum, line, start, end)

148

else:

149

tokeneater(ERRORTOKEN, line[pos], linenum, line, pos, pos + 1)

150

pos = pos + 1

151

152

for indent in indents[1:]: # pop remaining indent levels

153

tokeneater(DEDENT, '\t', linenum, line, 0, 0)

154

155

if __name__ == '__main__':                     # testing
    # Tokenize the file named by the last command-line argument, printing
    # each token with the default token eater.
    import sys
    source = open(sys.argv[-1])
    try:
        tokenize(source.readline)
    finally:
        source.close()          # close the file even if tokenizing fails