"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
import builtins
import re
import sys
from token import *
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
# Use a raw string so the \s and \w escapes reach the regex engine intact.
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

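# A quick illustration of the annotated repr above (the numeric type values
# come from token.py and can differ between Python versions; NAME is assumed
# to be 1 here):
#
#     >>> TokenInfo(NAME, 'spam', (1, 0), (1, 4), 'spam = 1\n')
#     TokenInfo(type=1 (NAME), string='spam', start=(1, 0), end=(1, 4), line='spam = 1\n')
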
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

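# A sketch of what these helpers produce:
#
#     group('a', 'b')  ->  '(a|b)'
#     any(r'\d')       ->  '(\d)*'
#     maybe('x')       ->  '(x)?'
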
# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

def _compile(expr):
    return re.compile(expr, re.UNICODE)

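# A rough illustration (hypothetical input) of how _tokenize() below uses
# this: PseudoToken skips leading whitespace and captures one token in
# group 1.
#
#     >>> m = _compile(PseudoToken).match("  x = 1", 0)
#     >>> m.span(1), m.group(1)
#     ((2, 3), 'x')
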
endpats = {"'": Single, '"': Double,
           "'''": Single3, '"""': Double3,
           "r'''": Single3, 'r"""': Double3,
           "b'''": Single3, 'b"""': Double3,
           "br'''": Single3, 'br"""': Double3,
           "R'''": Single3, 'R"""': Double3,
           "B'''": Single3, 'B"""': Double3,
           "bR'''": Single3, 'bR"""': Double3,
           "Br'''": Single3, 'Br"""': Double3,
           "BR'''": Single3, 'BR"""': Double3,
           'r': None, 'R': None, 'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        # Tokens arrive in source order, so a token can never start on an
        # earlier row than the previous token.
        assert row >= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)
182
183 def compat(self, token, iterable):
184 startline = False
185 indents = []
186 toks_append = self.tokens.append
187 toknum, tokval = token
Trent Nelson428de652008-03-18 22:41:35 +0000188
Thomas Wouters89f507f2006-12-13 04:49:30 +0000189 if toknum in (NAME, NUMBER):
190 tokval += ' '
191 if toknum in (NEWLINE, NL):
192 startline = True
Christian Heimesba4af492008-03-28 00:55:15 +0000193 prevstring = False
Thomas Wouters89f507f2006-12-13 04:49:30 +0000194 for tok in iterable:
195 toknum, tokval = tok[:2]
Trent Nelson428de652008-03-18 22:41:35 +0000196 if toknum == ENCODING:
197 self.encoding = tokval
198 continue
Thomas Wouters89f507f2006-12-13 04:49:30 +0000199
200 if toknum in (NAME, NUMBER):
201 tokval += ' '
202
Christian Heimesba4af492008-03-28 00:55:15 +0000203 # Insert a space between two consecutive strings
204 if toknum == STRING:
205 if prevstring:
206 tokval = ' ' + tokval
207 prevstring = True
208 else:
209 prevstring = False
210
Thomas Wouters89f507f2006-12-13 04:49:30 +0000211 if toknum == INDENT:
212 indents.append(tokval)
213 continue
214 elif toknum == DEDENT:
215 indents.pop()
216 continue
217 elif toknum in (NEWLINE, NL):
218 startline = True
219 elif startline and indents:
220 toks_append(indents[-1])
221 startline = False
222 toks_append(tokval)
Raymond Hettinger68c04532005-06-10 11:05:19 +0000223
Trent Nelson428de652008-03-18 22:41:35 +0000224
def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out

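# A self-contained sketch of the limited round-trip invariant documented
# above (hypothetical source text):
#
#     from io import BytesIO
#     source = b"x = 1\nprint(x)\n"
#     t1 = [tok[:2] for tok in tokenize(BytesIO(source).readline)]
#     t2 = [tok[:2] for tok in tokenize(BytesIO(untokenize(t1)).readline)]
#     assert t1 == t2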


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument,
    readline, in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

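# A short sketch of detect_encoding() on an in-memory stream (hypothetical
# source with a PEP 263 cookie):
#
#     from io import BytesIO
#     buf = BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n")
#     encoding, lines = detect_encoding(buf.readline)
#     # encoding == 'iso-8859-1'; lines holds the single line read so far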

def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = builtins.open(filename, 'rb')
    encoding, lines = detect_encoding(buffer.readline)
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, line_buffering=True)
    text.mode = 'r'
    return text


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternately, readline
    can be a callable object terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)

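# A minimal end-to-end sketch (hypothetical file name):
#
#     with builtins.open("example.py", "rb") as f:
#         for tok in tokenize(f.readline):
#             print(tok)
#
# The first TokenInfo is always ENCODING; the file's tokens follow, ending
# with ENDMARKER.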

def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                                    (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    # line[pos] is '\r' or '\n' here, so this is a blank line
                    yield TokenInfo(NL, line[pos:],
                                    (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                    # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if (initial in numchars or     # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                                    token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:               # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)       # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':      # continued string
                        strstart = (lnum, start)
                        endprog = _compile(endpats[initial] or
                                           endpats[token[1]] or
                                           endpats[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                      # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)
                elif initial.isidentifier():   # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':          # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                                (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards compatible API for all the places in the
# standard library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)

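# Sketch: unlike tokenize(), generate_tokens() accepts str lines rather than
# bytes (hypothetical input):
#
#     from io import StringIO
#     for tok in generate_tokens(StringIO("x = 1\n").readline):
#         print(tok)
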
def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with builtins.open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token.type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except IOError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()