"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""

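# Editorial usage sketch (not part of the original module docs), assuming
# only the stdlib: feed tokenize() a bytes-producing readline, for example
# from io.BytesIO.
#
#     from io import BytesIO
#     for tok in tokenize(BytesIO(b"x = 1\n").readline):
#         print(tok)
#
# For this input the stream is ENCODING, NAME, OP, NUMBER, NEWLINE,
# ENDMARKER, matching the 5-tuple layout described above.
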
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
import builtins
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
from itertools import chain
import re
import sys
from token import *

cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3
EXACT_TOKEN_TYPES = {
    '(': LPAR,
    ')': RPAR,
    '[': LSQB,
    ']': RSQB,
    ':': COLON,
    ',': COMMA,
    ';': SEMI,
    '+': PLUS,
    '-': MINUS,
    '*': STAR,
    '/': SLASH,
    '|': VBAR,
    '&': AMPER,
    '<': LESS,
    '>': GREATER,
    '=': EQUAL,
    '.': DOT,
    '%': PERCENT,
    '{': LBRACE,
    '}': RBRACE,
    '==': EQEQUAL,
    '!=': NOTEQUAL,
    '<=': LESSEQUAL,
    '>=': GREATEREQUAL,
    '~': TILDE,
    '^': CIRCUMFLEX,
    '<<': LEFTSHIFT,
    '>>': RIGHTSHIFT,
    '**': DOUBLESTAR,
    '+=': PLUSEQUAL,
    '-=': MINEQUAL,
    '*=': STAREQUAL,
    '/=': SLASHEQUAL,
    '%=': PERCENTEQUAL,
    '&=': AMPEREQUAL,
    '|=': VBAREQUAL,
    '^=': CIRCUMFLEXEQUAL,
    '<<=': LEFTSHIFTEQUAL,
    '>>=': RIGHTSHIFTEQUAL,
    '**=': DOUBLESTAREQUAL,
    '//': DOUBLESLASH,
    '//=': DOUBLESLASHEQUAL,
    '@': AT,
    '@=': ATEQUAL,
}

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type

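# Illustrative sketch (editor's addition): exact_type refines the generic OP
# type via EXACT_TOKEN_TYPES, and falls back to .type for everything else.
#
#     tok = TokenInfo(OP, '+=', (1, 2), (1, 4), 'x += 1\n')
#     tok.type == OP
#     tok.exact_type == PLUSEQUAL
#
# Non-OP tokens (NAME, NUMBER, ...) report exact_type == type.
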
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

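# For illustration (editor's note): these helpers assemble the regex
# fragments defined below, e.g.
#
#     group('a', 'b')  == '(a|b)'
#     any('a', 'b')    == '(a|b)*'
#     maybe('a', 'b')  == '(a|b)?'
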
# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")
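
# A small check of the ordering rule above (editor's illustration): with
# '\*\*=?' listed first, '**' matches as a single DOUBLESTAR candidate
# rather than as two '*' tokens.
#
#     import re
#     re.match(Operator, '**').group()  # '**'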

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

def _compile(expr):
    return re.compile(expr, re.UNICODE)

endpats = {"'": Single, '"': Double,
           "'''": Single3, '"""': Double3,
           "r'''": Single3, 'r"""': Double3,
           "b'''": Single3, 'b"""': Double3,
           "R'''": Single3, 'R"""': Double3,
           "B'''": Single3, 'B"""': Double3,
           "br'''": Single3, 'br"""': Double3,
           "bR'''": Single3, 'bR"""': Double3,
           "Br'''": Single3, 'Br"""': Double3,
           "BR'''": Single3, 'BR"""': Double3,
           "rb'''": Single3, 'rb"""': Double3,
           "Rb'''": Single3, 'Rb"""': Double3,
           "rB'''": Single3, 'rB"""': Double3,
           "RB'''": Single3, 'RB"""': Double3,
           "u'''": Single3, 'u"""': Double3,
           "U'''": Single3, 'U"""': Double3,
           'r': None, 'R': None, 'b': None, 'B': None,
           'u': None, 'U': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',
          "rb'''", 'rb"""', "rB'''", 'rB"""',
          "Rb'''", 'Rb"""', "RB'''", 'RB"""',
          "u'''", 'u"""', "U'''", 'U"""',
          ):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"',
          "rb'", 'rb"', "rB'", 'rB"',
          "Rb'", 'Rb"', "RB'", 'RB"',
          "u'", 'u"', "U'", 'U"',
          ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out

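# A concrete version of the limited-input round trip documented above
# (editorial sketch; the docstring's f is assumed to be a binary-mode file):
#
#     from io import BytesIO
#     source = b"x = 1\n"
#     t1 = [tok[:2] for tok in tokenize(BytesIO(source).readline)]
#     newcode = untokenize(t1)   # bytes, encoded per the ENCODING token
#     t2 = [tok[:2] for tok in tokenize(BytesIO(newcode).readline)]
#     assert t1 == t2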

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263.  If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                                                             encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

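# Editorial usage sketch for detect_encoding():
#
#     from io import BytesIO
#     buf = BytesIO(b'# -*- coding: latin-1 -*-\nx = 1\n')
#     encoding, lines = detect_encoding(buf.readline)
#     # encoding == 'iso-8859-1'; lines holds the one line already consumed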

def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = builtins.open(filename, 'rb')
    encoding, lines = detect_encoding(buffer.readline)
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, line_buffering=True)
    text.mode = 'r'
    return text

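# Hypothetical example (editor's note): open() returns a text stream already
# decoded with the detected encoding, so PEP 263 cookies are honoured.
#
#     with open('some_module.py') as f:   # this module's open(), not builtins'
#         source = f.read()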

def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable that terminates by raising StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                                    (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
                                    (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or                  # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                                    token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                        token[:2] in single_quoted or \
                        token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = _compile(endpats[initial] or
                                           endpats[token[1]] or
                                           endpats[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                                (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards compatible, API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)
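
# Usage sketch for the compatibility API above (editor's addition):
# generate_tokens() takes a str-producing readline, e.g. from io.StringIO,
# and emits no ENCODING token since no decoding takes place.
#
#     from io import StringIO
#     for tok in generate_tokens(StringIO("x = 1\n").readline):
#         print(tok)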

def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()
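
    # Editorial note, assuming a file named hello.py exists: a typical
    # invocation is
    #
    #     python -m tokenize -e hello.py
    #
    # which prints one "row,col-row,col: TYPE 'string'" line per token.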

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with builtins.open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()