"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
import builtins
import re
import sys
from token import *
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3
EXACT_TOKEN_TYPES = {
    '(': LPAR,
    ')': RPAR,
    '[': LSQB,
    ']': RSQB,
    ':': COLON,
    ',': COMMA,
    ';': SEMI,
    '+': PLUS,
    '-': MINUS,
    '*': STAR,
    '/': SLASH,
    '|': VBAR,
    '&': AMPER,
    '<': LESS,
    '>': GREATER,
    '=': EQUAL,
    '.': DOT,
    '%': PERCENT,
    '{': LBRACE,
    '}': RBRACE,
    '==': EQEQUAL,
    '!=': NOTEQUAL,
    '<=': LESSEQUAL,
    '>=': GREATEREQUAL,
    '~': TILDE,
    '^': CIRCUMFLEX,
    '<<': LEFTSHIFT,
    '>>': RIGHTSHIFT,
    '**': DOUBLESTAR,
    '+=': PLUSEQUAL,
    '-=': MINEQUAL,
    '*=': STAREQUAL,
    '/=': SLASHEQUAL,
    '%=': PERCENTEQUAL,
    '&=': AMPEREQUAL,
    '|=': VBAREQUAL,
    '^=': CIRCUMFLEXEQUAL,
    '<<=': LEFTSHIFTEQUAL,
    '>>=': RIGHTSHIFTEQUAL,
    '**=': DOUBLESTAREQUAL,
    '//': DOUBLESLASH,
    '//=': DOUBLESLASHEQUAL,
    '@': AT
}

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type

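# Sketch of how exact_type refines a generic OP token to the specific
# operator code from EXACT_TOKEN_TYPES above:
#
#     tok = TokenInfo(OP, '+=', (1, 2), (1, 4), 'x += 1\n')
#     tok.type        == OP          # True
#     tok.exact_type  == PLUSEQUAL   # True
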
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
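
# For example:
#
#     group('abc', 'def')   == '(abc|def)'
#     any('abc', 'def')     == '(abc|def)*'
#     maybe('abc', 'def')   == '(abc|def)?'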

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

def _compile(expr):
    return re.compile(expr, re.UNICODE)

endpats = {"'": Single, '"': Double,
           "'''": Single3, '"""': Double3,
           "r'''": Single3, 'r"""': Double3,
           "b'''": Single3, 'b"""': Double3,
           "R'''": Single3, 'R"""': Double3,
           "B'''": Single3, 'B"""': Double3,
           "br'''": Single3, 'br"""': Double3,
           "bR'''": Single3, 'bR"""': Double3,
           "Br'''": Single3, 'Br"""': Double3,
           "BR'''": Single3, 'BR"""': Double3,
           "rb'''": Single3, 'rb"""': Double3,
           "Rb'''": Single3, 'Rb"""': Double3,
           "rB'''": Single3, 'rB"""': Double3,
           "RB'''": Single3, 'RB"""': Double3,
           "u'''": Single3, 'u"""': Double3,
           "U'''": Single3, 'U"""': Double3,
           'r': None, 'R': None, 'b': None, 'B': None,
           'u': None, 'U': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',
          "rb'''", 'rb"""', "rB'''", 'rB"""',
          "Rb'''", 'Rb"""', "RB'''", 'RB"""',
          "u'''", 'u"""', "U'''", 'U"""',
          ):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"',
          "rb'", 'rb"', "rB'", 'rB"',
          "Rb'", 'Rb"', "RB'", 'RB"',
          "u'", 'u"', "U'", 'U"',
          ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token

        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out
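
# A small round-trip sketch for full 5-tuple input (BytesIO stands in for a
# real file object):
#
#     from io import BytesIO
#     source = b"x = 1\n"
#     toks = list(tokenize(BytesIO(source).readline))
#     assert untokenize(toks) == source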


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
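
# For example:
#
#     _get_normal_name("UTF_8")    == 'utf-8'
#     _get_normal_name("Latin-1")  == 'iso-8859-1'
#     _get_normal_name("cp1252")   == 'cp1252'   # returned unchanged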

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, a SyntaxError will also be raised.  Note that if a UTF-8
    BOM is found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                                                             encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
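
# Usage sketch (the coding cookie below is only an example):
#
#     from io import BytesIO
#     src = b"# -*- coding: latin-1 -*-\nx = 1\n"
#     encoding, consumed = detect_encoding(BytesIO(src).readline)
#     # encoding == 'iso-8859-1'; consumed holds the raw line(s) already read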


def open(filename):
    """Open a file in read-only mode using the encoding detected by
    detect_encoding().
    """
    buffer = builtins.open(filename, 'rb')
    encoding, lines = detect_encoding(buffer.readline)
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, line_buffering=True)
    text.mode = 'r'
    return text
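
# Usage sketch ("example.py" is a hypothetical source file):
#
#     with open("example.py") as f:   # this module's open(), not builtins.open
#         text = f.read()             # str, decoded with the detected encoding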


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
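
# Usage sketch from client code ("example.py" is a hypothetical file):
#
#     import tokenize
#     with open("example.py", "rb") as f:            # note: binary mode
#         for tok in tokenize.tokenize(f.readline):
#             print(tok)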


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                                    (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
                                    (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or                 # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                                    token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                        token[:2] in single_quoted or \
                        token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = _compile(endpats[initial] or
                                           endpats[token[1]] or
                                           endpats[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                                (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards-compatible API for all the places in the
# standard library that expect to be able to use tokenize with strings.
def generate_tokens(readline):
    return _tokenize(readline, None)
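
# Usage sketch: generate_tokens() consumes str lines instead of bytes, so no
# ENCODING token is emitted:
#
#     from io import StringIO
#     for tok in generate_tokens(StringIO("x = 1\n").readline):
#         print(tok)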
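# Command-line sketch ("hello.py" stands in for any source file):
#
#     $ python -m tokenize hello.py         # tokenize a file
#     $ python -m tokenize -e hello.py      # report exact OP token types
#     $ echo "x = 1" | python -m tokenize   # tokenize standard input
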
def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with builtins.open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except IOError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()