"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
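
Example usage (a sketch; the filename "example.py" is only an
illustration):

    import tokenize
    with open("example.py", "rb") as f:
        for tok in tokenize.tokenize(f.readline):
            print(tok)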
21"""
Guido van Rossumb51eaa11997-03-07 00:21:55 +000022
Ka-Ping Yee244c5932001-03-01 13:56:40 +000023__author__ = 'Ka-Ping Yee <ping@lfw.org>'
Trent Nelson428de652008-03-18 22:41:35 +000024__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
25 'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
26 'Michael Foord')
Brett Cannonf3042782011-02-22 03:25:12 +000027import builtins
Benjamin Peterson433f32c2008-12-12 01:25:05 +000028from codecs import lookup, BOM_UTF8
Raymond Hettinger3fb79c72010-09-09 07:15:18 +000029import collections
Victor Stinner58c07522010-11-09 01:08:59 +000030from io import TextIOWrapper
Terry Jan Reedy5b8d2c32014-02-17 23:12:16 -050031from itertools import chain
32import re
33import sys
34from token import *
35
Serhiy Storchakadafea852013-09-16 23:51:56 +030036cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
Serhiy Storchaka768c16c2014-01-09 18:36:09 +020037blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
Guido van Rossum4d8e8591992-01-01 19:34:47 +000038
Skip Montanaro40fc1602001-03-01 04:27:19 +000039import token
Alexander Belopolskyb9d10d02010-11-11 14:07:41 +000040__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
41 "NL", "untokenize", "ENCODING", "TokenInfo"]
Skip Montanaro40fc1602001-03-01 04:27:19 +000042del token
43
Guido van Rossum1aec3231997-04-08 14:24:39 +000044COMMENT = N_TOKENS
45tok_name[COMMENT] = 'COMMENT'
Guido van Rossuma90c78b1998-04-03 16:05:38 +000046NL = N_TOKENS + 1
47tok_name[NL] = 'NL'
Trent Nelson428de652008-03-18 22:41:35 +000048ENCODING = N_TOKENS + 2
49tok_name[ENCODING] = 'ENCODING'
50N_TOKENS += 3
Meador Inge00c7f852012-01-19 00:44:45 -060051EXACT_TOKEN_TYPES = {
52 '(': LPAR,
53 ')': RPAR,
54 '[': LSQB,
55 ']': RSQB,
56 ':': COLON,
57 ',': COMMA,
58 ';': SEMI,
59 '+': PLUS,
60 '-': MINUS,
61 '*': STAR,
62 '/': SLASH,
63 '|': VBAR,
64 '&': AMPER,
65 '<': LESS,
66 '>': GREATER,
67 '=': EQUAL,
68 '.': DOT,
69 '%': PERCENT,
70 '{': LBRACE,
71 '}': RBRACE,
72 '==': EQEQUAL,
73 '!=': NOTEQUAL,
74 '<=': LESSEQUAL,
75 '>=': GREATEREQUAL,
76 '~': TILDE,
77 '^': CIRCUMFLEX,
78 '<<': LEFTSHIFT,
79 '>>': RIGHTSHIFT,
80 '**': DOUBLESTAR,
81 '+=': PLUSEQUAL,
82 '-=': MINEQUAL,
83 '*=': STAREQUAL,
84 '/=': SLASHEQUAL,
85 '%=': PERCENTEQUAL,
86 '&=': AMPEREQUAL,
87 '|=': VBAREQUAL,
88 '^=': CIRCUMFLEXEQUAL,
89 '<<=': LEFTSHIFTEQUAL,
90 '>>=': RIGHTSHIFTEQUAL,
91 '**=': DOUBLESTAREQUAL,
92 '//': DOUBLESLASH,
93 '//=': DOUBLESLASHEQUAL,
94 '@': AT
95}
Guido van Rossum1aec3231997-04-08 14:24:39 +000096
Raymond Hettinger3fb79c72010-09-09 07:15:18 +000097class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
Raymond Hettingeraa17a7f2009-04-29 14:21:25 +000098 def __repr__(self):
Raymond Hettingera0e79402010-09-09 08:29:05 +000099 annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
100 return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
101 self._replace(type=annotated_type))
Raymond Hettingeraa17a7f2009-04-29 14:21:25 +0000102
Meador Inge00c7f852012-01-19 00:44:45 -0600103 @property
104 def exact_type(self):
105 if self.type == OP and self.string in EXACT_TOKEN_TYPES:
106 return EXACT_TOKEN_TYPES[self.string]
107 else:
108 return self.type
109

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
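# For example: group('a', 'b') -> '(a|b)', any('a') -> '(a)*', and
# maybe('a') -> '(a)?'.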

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

def _compile(expr):
    return re.compile(expr, re.UNICODE)

endpats = {"'": Single, '"': Double,
           "'''": Single3, '"""': Double3,
           "r'''": Single3, 'r"""': Double3,
           "b'''": Single3, 'b"""': Double3,
           "R'''": Single3, 'R"""': Double3,
           "B'''": Single3, 'B"""': Double3,
           "br'''": Single3, 'br"""': Double3,
           "bR'''": Single3, 'bR"""': Double3,
           "Br'''": Single3, 'Br"""': Double3,
           "BR'''": Single3, 'BR"""': Double3,
           "rb'''": Single3, 'rb"""': Double3,
           "Rb'''": Single3, 'Rb"""': Double3,
           "rB'''": Single3, 'rB"""': Double3,
           "RB'''": Single3, 'RB"""': Double3,
           "u'''": Single3, 'u"""': Double3,
190 "U'''": Single3, 'U"""': Double3,
Armin Ronacher6ecf77b2012-03-04 12:04:06 +0000191 'r': None, 'R': None, 'b': None, 'B': None,
192 'u': None, 'U': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',
          "rb'''", 'rb"""', "rB'''", 'rB"""',
          "Rb'''", 'Rb"""', "RB'''", 'RB"""',
          "u'''", 'u"""', "U'''", 'U"""',
          ):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"',
          "rb'", 'rb"', "rB'", 'rB"',
          "Rb'", 'Rb"', "RB'", 'RB"',
          "u'", 'u"', "U'", 'U"',
          ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out
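
# A round-trip sketch for full 5-tuple input (using BytesIO to feed
# tokenize from memory):
#
#     from io import BytesIO
#     source = b"x = 1\n"
#     assert untokenize(tokenize(BytesIO(source).readline)) == source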


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263.  If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
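
    For example (a sketch; "example.py" stands in for any Python source
    file opened in binary mode):

        with builtins.open("example.py", 'rb') as f:
            encoding, lines = detect_encoding(f.readline)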
350 """
Brett Cannonc33f3f22012-04-20 13:23:54 -0400351 try:
352 filename = readline.__self__.name
353 except AttributeError:
354 filename = None
Trent Nelson428de652008-03-18 22:41:35 +0000355 bom_found = False
356 encoding = None
Benjamin Peterson689a5582010-03-18 22:29:52 +0000357 default = 'utf-8'
Trent Nelson428de652008-03-18 22:41:35 +0000358 def read_or_stop():
359 try:
360 return readline()
361 except StopIteration:
362 return b''
363
364 def find_cookie(line):
365 try:
Martin v. Löwis63674f42012-04-20 14:36:47 +0200366 # Decode as UTF-8. Either the line is an encoding declaration,
367 # in which case it should be pure ASCII, or it must be UTF-8
368 # per default encoding.
369 line_string = line.decode('utf-8')
Trent Nelson428de652008-03-18 22:41:35 +0000370 except UnicodeDecodeError:
Brett Cannonc33f3f22012-04-20 13:23:54 -0400371 msg = "invalid or missing encoding declaration"
372 if filename is not None:
373 msg = '{} for {!r}'.format(msg, filename)
374 raise SyntaxError(msg)
Benjamin Peterson433f32c2008-12-12 01:25:05 +0000375
Serhiy Storchakadafea852013-09-16 23:51:56 +0300376 match = cookie_re.match(line_string)
377 if not match:
Benjamin Peterson433f32c2008-12-12 01:25:05 +0000378 return None
Serhiy Storchakadafea852013-09-16 23:51:56 +0300379 encoding = _get_normal_name(match.group(1))
Benjamin Peterson433f32c2008-12-12 01:25:05 +0000380 try:
381 codec = lookup(encoding)
382 except LookupError:
383 # This behaviour mimics the Python interpreter
Brett Cannonc33f3f22012-04-20 13:23:54 -0400384 if filename is None:
385 msg = "unknown encoding: " + encoding
386 else:
387 msg = "unknown encoding for {!r}: {}".format(filename,
388 encoding)
389 raise SyntaxError(msg)
Benjamin Peterson433f32c2008-12-12 01:25:05 +0000390
Benjamin Peterson1613ed82010-03-18 22:34:15 +0000391 if bom_found:
Florent Xicluna11f0b412012-07-07 12:13:35 +0200392 if encoding != 'utf-8':
Benjamin Peterson1613ed82010-03-18 22:34:15 +0000393 # This behaviour mimics the Python interpreter
Brett Cannonc33f3f22012-04-20 13:23:54 -0400394 if filename is None:
395 msg = 'encoding problem: utf-8'
396 else:
397 msg = 'encoding problem for {!r}: utf-8'.format(filename)
398 raise SyntaxError(msg)
Benjamin Peterson1613ed82010-03-18 22:34:15 +0000399 encoding += '-sig'
Benjamin Peterson433f32c2008-12-12 01:25:05 +0000400 return encoding
Trent Nelson428de652008-03-18 22:41:35 +0000401
402 first = read_or_stop()
Benjamin Peterson433f32c2008-12-12 01:25:05 +0000403 if first.startswith(BOM_UTF8):
Trent Nelson428de652008-03-18 22:41:35 +0000404 bom_found = True
405 first = first[3:]
Benjamin Peterson689a5582010-03-18 22:29:52 +0000406 default = 'utf-8-sig'
Trent Nelson428de652008-03-18 22:41:35 +0000407 if not first:
Benjamin Peterson689a5582010-03-18 22:29:52 +0000408 return default, []
Trent Nelson428de652008-03-18 22:41:35 +0000409
410 encoding = find_cookie(first)
411 if encoding:
412 return encoding, [first]
Serhiy Storchaka768c16c2014-01-09 18:36:09 +0200413 if not blank_re.match(first):
414 return default, [first]
Trent Nelson428de652008-03-18 22:41:35 +0000415
416 second = read_or_stop()
417 if not second:
Benjamin Peterson689a5582010-03-18 22:29:52 +0000418 return default, [first]
Trent Nelson428de652008-03-18 22:41:35 +0000419
420 encoding = find_cookie(second)
421 if encoding:
422 return encoding, [first, second]
423
Benjamin Peterson689a5582010-03-18 22:29:52 +0000424 return default, [first, second]
Trent Nelson428de652008-03-18 22:41:35 +0000425
426
Victor Stinner58c07522010-11-09 01:08:59 +0000427def open(filename):
428 """Open a file in read only mode using the encoding detected by
429 detect_encoding().
430 """
Brett Cannonf3042782011-02-22 03:25:12 +0000431 buffer = builtins.open(filename, 'rb')
Victor Stinner58c07522010-11-09 01:08:59 +0000432 encoding, lines = detect_encoding(buffer.readline)
433 buffer.seek(0)
434 text = TextIOWrapper(buffer, encoding, line_buffering=True)
435 text.mode = 'r'
436 return text
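
# Usage sketch (note this is tokenize.open, not builtins.open; the filename
# is only an illustration):
#
#     with open("example.py") as f:
#         source = f.read()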


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
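
# Sketch of tokenizing an in-memory buffer (BytesIO stands in for a real
# binary file object):
#
#     from io import BytesIO
#     for tok in tokenize(BytesIO(b"def f():\n    pass\n").readline):
#         print(tok)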


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                                    (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
                                    (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or                 # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                                    token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = _compile(endpats[initial] or
                                           endpats[token[1]] or
                                           endpats[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                                (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards compatible, API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)
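
# Sketch: unlike tokenize(), generate_tokens() consumes str lines, e.g.
#
#     from io import StringIO
#     for tok in generate_tokens(StringIO("x = 1\n").readline):
#         print(tok)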

def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with builtins.open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except IOError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()