blob: b7ae7d31b0fb7b2c5a706edc395af60907621f28 [file] [log] [blame]
Guido van Rossumb51eaa11997-03-07 00:21:55 +00001"""Tokenization help for Python programs.
Guido van Rossum4d8e8591992-01-01 19:34:47 +00002
Florent Xicluna43e4ea12010-09-03 19:54:02 +00003tokenize(readline) is a generator that breaks a stream of bytes into
4Python tokens. It decodes the bytes according to PEP-0263 for
5determining source file encoding.
Trent Nelson428de652008-03-18 22:41:35 +00006
Florent Xicluna43e4ea12010-09-03 19:54:02 +00007It accepts a readline-like method which is called repeatedly to get the
8next line of input (or b"" for EOF). It generates 5-tuples with these
9members:
Tim Peters4efb6e92001-06-29 23:51:08 +000010
11 the token type (see token.py)
12 the token (a string)
13 the starting (row, column) indices of the token (a 2-tuple of ints)
14 the ending (row, column) indices of the token (a 2-tuple of ints)
15 the original line (string)
16
17It is designed to match the working of the Python tokenizer exactly, except
18that it produces COMMENT tokens for comments and gives type OP for all
Florent Xicluna43e4ea12010-09-03 19:54:02 +000019operators. Additionally, all token lists start with an ENCODING token
20which tells you which encoding was used to decode the bytes stream.
21"""
Guido van Rossumb51eaa11997-03-07 00:21:55 +000022
Ka-Ping Yee244c5932001-03-01 13:56:40 +000023__author__ = 'Ka-Ping Yee <ping@lfw.org>'
Trent Nelson428de652008-03-18 22:41:35 +000024__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
25 'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
26 'Michael Foord')
Brett Cannonf3042782011-02-22 03:25:12 +000027import builtins
Florent Xicluna43e4ea12010-09-03 19:54:02 +000028import re
29import sys
Guido van Rossumfc6f5331997-03-07 00:21:12 +000030from token import *
Benjamin Peterson433f32c2008-12-12 01:25:05 +000031from codecs import lookup, BOM_UTF8
Raymond Hettinger3fb79c72010-09-09 07:15:18 +000032import collections
Victor Stinner58c07522010-11-09 01:08:59 +000033from io import TextIOWrapper
# PEP-263 encoding cookie.  Raw string: \s and \w are regex escapes, not
# string escape sequences (non-raw form warns on modern Python).
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")
Guido van Rossum4d8e8591992-01-01 19:34:47 +000035
import token
# Public API: everything the token module exports plus the extra names
# defined by this module.
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token  # only needed to build __all__; keep the module namespace clean
40
# This module emits three token types that token.py does not define;
# allocate numbers for them just past N_TOKENS and register their names.
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3
# Map each operator/bracket spelling to its specific token type.  The
# tokenizer reports all of these generically as OP; TokenInfo.exact_type
# uses this table to recover the precise type on demand.
EXACT_TOKEN_TYPES = {
    '(': LPAR,
    ')': RPAR,
    '[': LSQB,
    ']': RSQB,
    ':': COLON,
    ',': COMMA,
    ';': SEMI,
    '+': PLUS,
    '-': MINUS,
    '*': STAR,
    '/': SLASH,
    '|': VBAR,
    '&': AMPER,
    '<': LESS,
    '>': GREATER,
    '=': EQUAL,
    '.': DOT,
    '%': PERCENT,
    '{': LBRACE,
    '}': RBRACE,
    '==': EQEQUAL,
    '!=': NOTEQUAL,
    '<=': LESSEQUAL,
    '>=': GREATEREQUAL,
    '~': TILDE,
    '^': CIRCUMFLEX,
    '<<': LEFTSHIFT,
    '>>': RIGHTSHIFT,
    '**': DOUBLESTAR,
    '+=': PLUSEQUAL,
    '-=': MINEQUAL,
    '*=': STAREQUAL,
    '/=': SLASHEQUAL,
    '%=': PERCENTEQUAL,
    '&=': AMPEREQUAL,
    '|=': VBAREQUAL,
    '^=': CIRCUMFLEXEQUAL,
    '<<=': LEFTSHIFTEQUAL,
    '>>=': RIGHTSHIFTEQUAL,
    '**=': DOUBLESTAREQUAL,
    '//': DOUBLESLASH,
    '//=': DOUBLESLASHEQUAL,
    '@': AT
}
Guido van Rossum1aec3231997-04-08 14:24:39 +000093
class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    """One token: (type, string, start, end, line).

    start/end are (row, col) 2-tuples; line is the physical (or logical)
    source line the token came from.
    """

    def __repr__(self):
        # Show the numeric type together with its symbolic name, e.g. "54 (OP)".
        described = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)'
                % self._replace(type=described))

    @property
    def exact_type(self):
        """The specific operator type for OP tokens; ``type`` otherwise."""
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        return self.type
106
def group(*choices):
    """Return a regex matching any one of *choices*, as a single group."""
    return '(%s)' % '|'.join(choices)

def any(*choices):          # NOTE: intentionally shadows builtins.any here
    """Return a regex matching zero or more repetitions of the choices."""
    return group(*choices) + '*'

def maybe(*choices):
    """Return a regex matching the choices at most once."""
    return group(*choices) + '?'
Guido van Rossum4d8e8591992-01-01 19:34:47 +0000110
# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
# Whitespace, optionally spanning backslash continuations, then a comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Opening triple quote, with optional bytes/unicode and raw prefixes.
Triple = group("[bBuU]?[rR]?'''", '[bBuU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[bBuU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[bBuU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string: either the whole literal, or
# everything up to a backslash line continuation.
ContStr = group(r"[bBuU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[bBuU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# The pattern _tokenize() actually scans with: group 1 is the token text.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
Guido van Rossum1aec3231997-04-08 14:24:39 +0000165
Benjamin Peterson33856de2010-08-30 14:41:20 +0000166def _compile(expr):
167 return re.compile(expr, re.UNICODE)
168
# Map an opening quote (with optional string prefix) to the regex that
# matches the remainder of the literal.  Prefix-only keys map to None:
# for those, _tokenize() selects the pattern from a later character of
# the token.  (The duplicate "R'''"/'R"""' entries present in the old
# literal have been removed; a dict literal must not repeat keys.)
endpats = {"'": Single, '"': Double,
           "'''": Single3, '"""': Double3,
           "r'''": Single3, 'r"""': Double3,
           "b'''": Single3, 'b"""': Double3,
           "br'''": Single3, 'br"""': Double3,
           "R'''": Single3, 'R"""': Double3,
           "B'''": Single3, 'B"""': Double3,
           "bR'''": Single3, 'bR"""': Double3,
           "Br'''": Single3, 'Br"""': Double3,
           "BR'''": Single3, 'BR"""': Double3,
           "u'''": Single3, 'u"""': Double3,
           "ur'''": Single3, 'ur"""': Double3,
           "U'''": Single3, 'U"""': Double3,
           "uR'''": Single3, 'uR"""': Double3,
           "Ur'''": Single3, 'Ur"""': Double3,
           "UR'''": Single3, 'UR"""': Double3,
           'r': None, 'R': None, 'b': None, 'B': None,
           'u': None, 'U': None}

# Every legal opening triple-quote spelling (used for membership tests).
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""'):
    triple_quoted[t] = t
# Every legal opening single-quote spelling.
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"'):
    single_quoted[t] = t

tabsize = 8  # a tab advances to the next multiple-of-8 column
Fred Drake9b8d8012000-08-17 04:45:13 +0000211
class TokenError(Exception):
    """Raised when input ends inside a multi-line string or statement."""


class StopTokenizing(Exception):
    """Sentinel exception kept for compatibility; not raised in this module."""
Fred Drake9b8d8012000-08-17 04:45:13 +0000215
Tim Peters5ca576e2001-06-18 22:08:13 +0000216
class Untokenizer:
    """Rebuild source text from a stream of token tuples.

    Full 5-tuples reproduce the original layout via position padding;
    bare 2-tuples fall back to compat(), which only inserts enough
    spacing for the output to retokenize to the same token stream.
    """

    def __init__(self):
        self.tokens = []        # accumulated output fragments
        self.prev_row = 1       # end position of the previous token
        self.prev_col = 0
        self.encoding = None    # set when an ENCODING token is seen

    def add_whitespace(self, start):
        """Pad with spaces from the previous token's end up to *start*."""
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        """Consume token tuples and return the reconstructed source string."""
        # Use one explicit iterator so compat() continues from the current
        # position; iterating the raw argument would restart a sequence
        # input from the beginning.
        it = iter(iterable)
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        """Best-effort reconstruction from (type, string) pairs only."""
        from itertools import chain
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        # Re-include the already-consumed first token; the previous
        # implementation silently dropped it when given an iterator.
        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)
Raymond Hettinger68c04532005-06-10 11:05:19 +0000289
Trent Nelson428de652008-03-18 22:41:35 +0000290
def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize
    (a str is returned if the stream contains no ENCODING token).

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out
Raymond Hettinger68c04532005-06-10 11:05:19 +0000316
Trent Nelson428de652008-03-18 22:41:35 +0000317
Benjamin Petersond3afada2009-10-09 21:43:09 +0000318def _get_normal_name(orig_enc):
319 """Imitates get_normal_name in tokenizer.c."""
320 # Only care about the first 12 characters.
321 enc = orig_enc[:12].lower().replace("_", "-")
322 if enc == "utf-8" or enc.startswith("utf-8-"):
323 return "utf-8"
324 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
325 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
326 return "iso-8859-1"
327 return orig_enc
328
def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument,
    readline, in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263.  If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        # Treat a readline that raises StopIteration as end-of-file.
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # A PEP-263 cookie can only occur in an ASCII-decodable line.
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            # A cookie that contradicts the utf-8 BOM is an error.
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        # Strip the 3-byte BOM before looking for a cookie.
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    # PEP-263 allows the cookie on the first or second line only.
    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
Trent Nelson428de652008-03-18 22:41:35 +0000399
400
def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = builtins.open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(buffer.readline)
        # Rewind so the text wrapper sees the whole file (including the
        # lines encoding detection already consumed).
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except BaseException:
        # Don't leak the binary buffer when detection or wrapping fails.
        buffer.close()
        raise
411
412
def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    # Replay the lines consumed during encoding detection, then the rest of
    # the stream, then b"" forever so _tokenize sees EOF instead of
    # exhausting the iterator.
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
Trent Nelson428de652008-03-18 22:41:35 +0000439
440
def _tokenize(readline, encoding):
    """Backend generator for tokenize() and generate_tokens().

    *readline* returns one line per call: bytes (decoded here with
    *encoding*) when an encoding is given, str when it is None.  When an
    encoding is supplied, an ENCODING token is yielded first.  Yields
    TokenInfo 5-tuples; raises TokenError on EOF inside a multi-line
    string or statement, and IndentationError on inconsistent dedents.
    """
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0   # state for an unterminated string literal
    contline = None
    indents = [0]
    # The pseudo-token pattern is invariant: compile it once here instead
    # of once per token scanned (the old code recompiled in the scan loop).
    pseudo_prog = _compile(PseudoToken)

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # A single-quoted string may only span lines via an explicit
                # backslash continuation; anything else is an error token.
                yield TokenInfo(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudo_prog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if (initial in numchars or                 # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # Inside brackets a newline is non-logical (NL).
                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # The quote may follow a 1- or 2-character prefix;
                        # take the first endpats entry that is not None.
                        endprog = _compile(endpats[initial] or
                                           endpats[token[1]] or
                                           endpats[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000589
Trent Nelson428de652008-03-18 22:41:35 +0000590
# An undocumented, backwards compatible, API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    """Like tokenize(), but *readline* returns str lines and no ENCODING
    token is emitted."""
    return _tokenize(readline, None)
Raymond Hettinger6c60d092010-09-09 04:32:39 +0000595
def main():
    """Command-line entry point: print the tokenization of a file or stdin."""
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        # Emit the most specific "file:line:col: error:" form available,
        # then exit with a failure status.
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with builtins.open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            # stdin is already decoded text, so use the str-based backend.
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except IOError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise
655
Raymond Hettinger6c60d092010-09-09 04:32:39 +0000656if __name__ == "__main__":
Meador Inge14c0f032011-10-07 08:53:38 -0500657 main()