"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
import builtins
import re
import sys
from token import *
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token

# Additional token types that are not defined in token.py.
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))
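
# The annotated repr is meant for interactive debugging; for example, a
# NAME token for "foo" at the start of line 1 prints roughly as:
#     TokenInfo(type=1 (NAME), string='foo', start=(1, 0), end=(1, 3), line='foo\n')
# (the numeric type value comes from token.py).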

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
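
# For example: group('a', 'b') == '(a|b)', any('a', 'b') == '(a|b)*', and
# maybe('a', 'b') == '(a|b)?'.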

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
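
# For example, Number matches '0xdeadbeef', '0b101', '3.14e-10' and '1j';
# Decnumber deliberately does not match a legacy octal literal such as
# '0755' as a single number, since Python 3 removed that form.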

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
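
# PseudoToken is applied with .match() and read via span(1): the leading
# Whitespace is skipped and group 1 spans just the token itself, e.g.
# _compile(PseudoToken).match("  x = 1").span(1) == (2, 3).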

def _compile(expr):
    return re.compile(expr, re.UNICODE)

tokenprog, pseudoprog, single3prog, double3prog = map(
    _compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": _compile(Single), '"': _compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

del _compile

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        # A token's start must not precede the previous token's end.
        assert row >= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        # Fallback for 2-tuple (type, string) input; spacing is approximated.
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token

        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements: a token number and token value.  If
    only two-element tuples are passed, the spacing of the resulting
    output is approximate.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
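
# For example (a sketch): _get_normal_name("UTF_8") == "utf-8" and
# _get_normal_name("Latin-1") == "iso-8859-1", mirroring the alias
# normalization done by the C tokenizer.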

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that
    should be used to decode a Python source file.  It requires one
    argument, readline, in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is
    an invalid charset, a SyntaxError will also be raised.  Note that if a
    UTF-8 BOM is found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be
    returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            raise SyntaxError("invalid or missing encoding declaration")

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

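# Example (a sketch; "example.py" is a hypothetical file):
#
#     with builtins.open("example.py", "rb") as f:
#         encoding, lines = detect_encoding(f.readline)
#     # encoding is e.g. 'utf-8' or 'utf-8-sig'; lines holds the (at most
#     # two) raw byte lines that were consumed while sniffing.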

def open(filename):
    """Open a file in read-only mode using the encoding detected by
    detect_encoding().
    """
    buffer = builtins.open(filename, 'rb')
    encoding, lines = detect_encoding(buffer.readline)
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, line_buffering=True)
    text.mode = 'r'
    return text

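# Example (a sketch): reading a source file without handling the encoding
# yourself; note this is this module's open(), not the builtin:
#
#     with open("example.py") as f:
#         source = f.read()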

def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object providing the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable that signals EOF by raising StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)

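# Example (a sketch): tokenizing an in-memory byte string; the first token
# is always ENCODING ('utf-8' here, the default):
#
#     from io import BytesIO
#     for tok in tokenize(BytesIO(b"x = 1\n").readline):
#         print(tok)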

def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0  # line number, paren depth, continuation flag
    numchars = '0123456789'
    contstr, needcont = '', 0        # state for a string continued across lines
    contline = None
    indents = [0]                    # stack of indentation column levels

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                                    (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    # line[pos] is '\r' or '\n' here, so this is always an NL.
                    yield TokenInfo(NL, line[pos:],
                                    (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
466 else: # continued statement
Guido van Rossumde655271997-04-09 17:15:54 +0000467 if not line:
Collin Winterce36ad82007-08-30 01:19:48 +0000468 raise TokenError("EOF in multi-line statement", (lnum, 0))
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000469 continued = 0
470
471 while pos < max:
Guido van Rossum3b631771997-10-27 20:44:15 +0000472 pseudomatch = pseudoprog.match(line, pos)
473 if pseudomatch: # scan for tokens
474 start, end = pseudomatch.span(1)
Guido van Rossumde655271997-04-09 17:15:54 +0000475 spos, epos, pos = (lnum, start), (lnum, end), end
Guido van Rossum1aec3231997-04-08 14:24:39 +0000476 token, initial = line[start:end], line[start]
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000477
Georg Brandldde00282007-03-18 19:01:53 +0000478 if (initial in numchars or # ordinary number
479 (initial == '.' and token != '.' and token != '...')):
Raymond Hettingera48db392009-04-29 00:34:27 +0000480 yield TokenInfo(NUMBER, token, spos, epos, line)
Guido van Rossum1aec3231997-04-08 14:24:39 +0000481 elif initial in '\r\n':
Raymond Hettingera48db392009-04-29 00:34:27 +0000482 yield TokenInfo(NL if parenlev > 0 else NEWLINE,
Thomas Wouters89f507f2006-12-13 04:49:30 +0000483 token, spos, epos, line)
Guido van Rossum1aec3231997-04-08 14:24:39 +0000484 elif initial == '#':
Thomas Wouters89f507f2006-12-13 04:49:30 +0000485 assert not token.endswith("\n")
Raymond Hettingera48db392009-04-29 00:34:27 +0000486 yield TokenInfo(COMMENT, token, spos, epos, line)
Guido van Rossum9d6897a2002-08-24 06:54:19 +0000487 elif token in triple_quoted:
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000488 endprog = endprogs[token]
Guido van Rossum3b631771997-10-27 20:44:15 +0000489 endmatch = endprog.match(line, pos)
490 if endmatch: # all on one line
491 pos = endmatch.end(0)
Guido van Rossum1aec3231997-04-08 14:24:39 +0000492 token = line[start:pos]
Raymond Hettingera48db392009-04-29 00:34:27 +0000493 yield TokenInfo(STRING, token, spos, (lnum, pos), line)
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000494 else:
Guido van Rossum1aec3231997-04-08 14:24:39 +0000495 strstart = (lnum, start) # multiple lines
496 contstr = line[start:]
Guido van Rossuma90c78b1998-04-03 16:05:38 +0000497 contline = line
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000498 break
Guido van Rossum9d6897a2002-08-24 06:54:19 +0000499 elif initial in single_quoted or \
500 token[:2] in single_quoted or \
501 token[:3] in single_quoted:
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000502 if token[-1] == '\n': # continued string
Guido van Rossum1aec3231997-04-08 14:24:39 +0000503 strstart = (lnum, start)
Ka-Ping Yee1ff08b12001-01-15 22:04:30 +0000504 endprog = (endprogs[initial] or endprogs[token[1]] or
505 endprogs[token[2]])
Guido van Rossumde655271997-04-09 17:15:54 +0000506 contstr, needcont = line[start:], 1
Guido van Rossuma90c78b1998-04-03 16:05:38 +0000507 contline = line
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000508 break
509 else: # ordinary string
Raymond Hettingera48db392009-04-29 00:34:27 +0000510 yield TokenInfo(STRING, token, spos, epos, line)
Benjamin Peterson33856de2010-08-30 14:41:20 +0000511 elif initial.isidentifier(): # ordinary name
Raymond Hettingera48db392009-04-29 00:34:27 +0000512 yield TokenInfo(NAME, token, spos, epos, line)
Guido van Rossum3b631771997-10-27 20:44:15 +0000513 elif initial == '\\': # continued stmt
514 continued = 1
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000515 else:
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000516 if initial in '([{':
517 parenlev += 1
518 elif initial in ')]}':
519 parenlev -= 1
Raymond Hettingera48db392009-04-29 00:34:27 +0000520 yield TokenInfo(OP, token, spos, epos, line)
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000521 else:
Raymond Hettingera48db392009-04-29 00:34:27 +0000522 yield TokenInfo(ERRORTOKEN, line[pos],
Guido van Rossumde655271997-04-09 17:15:54 +0000523 (lnum, pos), (lnum, pos+1), line)
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000524 pos += 1
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000525
526 for indent in indents[1:]: # pop remaining indent levels
Raymond Hettingera48db392009-04-29 00:34:27 +0000527 yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
528 yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards-compatible API for all the places in the
# standard library that expect to be able to use tokenize with strings.
def generate_tokens(readline):
    return _tokenize(readline, None)
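
# Example (a sketch): unlike tokenize(), generate_tokens() consumes str
# lines and emits no ENCODING token:
#
#     from io import StringIO
#     for tok in generate_tokens(StringIO("x = 1\n").readline):
#         print(tok)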

if __name__ == "__main__":
    # Quick sanity check
    s = b'''def parseline(self, line):
        """Parse the line into a command name and a string containing
        the arguments.  Returns a tuple containing (command, args, line).
        'command' and 'args' may be None if the line couldn't be parsed.
        """
        line = line.strip()
        if not line:
            return None, None, line
        elif line[0] == '?':
            line = 'help ' + line[1:]
        elif line[0] == '!':
            if hasattr(self, 'do_shell'):
                line = 'shell ' + line[1:]
            else:
                return None, None, line
        i, n = 0, len(line)
        while i < n and line[i] in self.identchars: i = i+1
        cmd, arg = line[:i], line[i:].strip()
        return cmd, arg, line
    '''
    # keepends=True preserves the newlines, so NEWLINE/NL tokens are
    # emitted as they would be for a real file.
    for tok in tokenize(iter(s.splitlines(keepends=True)).__next__):
        print(tok)