"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens. It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF). It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators. Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
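
A short sketch of typical use (tokenizing an in-memory bytes stream):

    import io
    from tokenize import tokenize
    for tok in tokenize(io.BytesIO(b"x = 1\n").readline):
        print(tok)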
"""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
import builtins
import re
import sys
from token import *
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
cookie_re = re.compile(r'coding[:=]\s*([-\w.]+)')

import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3
EXACT_TOKEN_TYPES = {
    '(': LPAR,
    ')': RPAR,
    '[': LSQB,
    ']': RSQB,
    ':': COLON,
    ',': COMMA,
    ';': SEMI,
    '+': PLUS,
    '-': MINUS,
    '*': STAR,
    '/': SLASH,
    '|': VBAR,
    '&': AMPER,
    '<': LESS,
    '>': GREATER,
    '=': EQUAL,
    '.': DOT,
    '%': PERCENT,
    '{': LBRACE,
    '}': RBRACE,
    '==': EQEQUAL,
    '!=': NOTEQUAL,
    '<=': LESSEQUAL,
    '>=': GREATEREQUAL,
    '~': TILDE,
    '^': CIRCUMFLEX,
    '<<': LEFTSHIFT,
    '>>': RIGHTSHIFT,
    '**': DOUBLESTAR,
    '+=': PLUSEQUAL,
    '-=': MINEQUAL,
    '*=': STAREQUAL,
    '/=': SLASHEQUAL,
    '%=': PERCENTEQUAL,
    '&=': AMPEREQUAL,
    '|=': VBAREQUAL,
    '^=': CIRCUMFLEXEQUAL,
    '<<=': LEFTSHIFTEQUAL,
    '>>=': RIGHTSHIFTEQUAL,
    '**=': DOUBLESTAREQUAL,
    '//': DOUBLESLASH,
    '//=': DOUBLESLASHEQUAL,
    '@': AT
}

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type
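
    # Illustrative sketch: ".exact_type" refines a generic OP token to its
    # specific operator type, while other tokens report their own type, e.g.
    #     TokenInfo(OP, '+=', (1, 2), (1, 4), 'x += 1\n').exact_type == PLUSEQUAL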

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
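# For example, group('a', 'b') == '(a|b)', any('x') == '(x)*' and
# maybe('x') == '(x)?'; these combinators build the token regexps below.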

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
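# Illustrative matches: Intnumber accepts '42', '0xFF', '0o17' and '0b101';
# Floatnumber accepts '3.14', '.5' and '1e-9'; Imagnumber accepts '2j' and '3.5J'.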

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

def _compile(expr):
    return re.compile(expr, re.UNICODE)

endpats = {"'": Single, '"': Double,
           "'''": Single3, '"""': Double3,
           "r'''": Single3, 'r"""': Double3,
           "b'''": Single3, 'b"""': Double3,
           "br'''": Single3, 'br"""': Double3,
           "R'''": Single3, 'R"""': Double3,
           "B'''": Single3, 'B"""': Double3,
           "bR'''": Single3, 'bR"""': Double3,
           "Br'''": Single3, 'Br"""': Double3,
           "BR'''": Single3, 'BR"""': Double3,
           'r': None, 'R': None, 'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        assert row >= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token

        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
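# For example (an illustrative sketch): _get_normal_name("Latin_1") returns
# "iso-8859-1" and _get_normal_name("UTF-8") returns "utf-8".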

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP-0263. If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, a SyntaxError is raised. Note that if a UTF-8 BOM is
    found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
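
    Example (a sketch over an in-memory stream):

        import io
        enc, lines = detect_encoding(io.BytesIO(b'# coding: latin-1\n').readline)
        # enc == 'iso-8859-1'; lines == [b'# coding: latin-1\n']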
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]


def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
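
    Usage sketch (assumes a file "example.py" exists on disk):

        with open("example.py") as f:
            source = f.read()   # a str, decoded per the detected encoding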
    """
    buffer = builtins.open(filename, 'rb')
    encoding, lines = detect_encoding(buffer.readline)
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, line_buffering=True)
    text.mode = 'r'
    return text


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object that provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as bytes. Alternatively, readline
    can be a callable that signals the end of input by raising StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
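
    Example (a sketch over an in-memory source):

        from io import BytesIO
        for tok in tokenize(BytesIO(b"x = 1\n").readline):
            print(tok)
        # yields ENCODING, NAME, OP, NUMBER, NEWLINE and ENDMARKER in order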
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                                    (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
                                    (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0
515
516 while pos < max:
Antoine Pitrou10a99b02011-10-11 15:45:56 +0200517 pseudomatch = _compile(PseudoToken).match(line, pos)
Guido van Rossum3b631771997-10-27 20:44:15 +0000518 if pseudomatch: # scan for tokens
519 start, end = pseudomatch.span(1)
Guido van Rossumde655271997-04-09 17:15:54 +0000520 spos, epos, pos = (lnum, start), (lnum, end), end
Guido van Rossum1aec3231997-04-08 14:24:39 +0000521 token, initial = line[start:end], line[start]
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000522
Georg Brandldde00282007-03-18 19:01:53 +0000523 if (initial in numchars or # ordinary number
524 (initial == '.' and token != '.' and token != '...')):
Raymond Hettingera48db392009-04-29 00:34:27 +0000525 yield TokenInfo(NUMBER, token, spos, epos, line)
Guido van Rossum1aec3231997-04-08 14:24:39 +0000526 elif initial in '\r\n':
Raymond Hettingera48db392009-04-29 00:34:27 +0000527 yield TokenInfo(NL if parenlev > 0 else NEWLINE,
Thomas Wouters89f507f2006-12-13 04:49:30 +0000528 token, spos, epos, line)
Guido van Rossum1aec3231997-04-08 14:24:39 +0000529 elif initial == '#':
Thomas Wouters89f507f2006-12-13 04:49:30 +0000530 assert not token.endswith("\n")
Raymond Hettingera48db392009-04-29 00:34:27 +0000531 yield TokenInfo(COMMENT, token, spos, epos, line)
Guido van Rossum9d6897a2002-08-24 06:54:19 +0000532 elif token in triple_quoted:
Antoine Pitrou10a99b02011-10-11 15:45:56 +0200533 endprog = _compile(endpats[token])
Guido van Rossum3b631771997-10-27 20:44:15 +0000534 endmatch = endprog.match(line, pos)
535 if endmatch: # all on one line
536 pos = endmatch.end(0)
Guido van Rossum1aec3231997-04-08 14:24:39 +0000537 token = line[start:pos]
Raymond Hettingera48db392009-04-29 00:34:27 +0000538 yield TokenInfo(STRING, token, spos, (lnum, pos), line)
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000539 else:
Guido van Rossum1aec3231997-04-08 14:24:39 +0000540 strstart = (lnum, start) # multiple lines
541 contstr = line[start:]
Guido van Rossuma90c78b1998-04-03 16:05:38 +0000542 contline = line
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000543 break
Guido van Rossum9d6897a2002-08-24 06:54:19 +0000544 elif initial in single_quoted or \
545 token[:2] in single_quoted or \
546 token[:3] in single_quoted:
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000547 if token[-1] == '\n': # continued string
Guido van Rossum1aec3231997-04-08 14:24:39 +0000548 strstart = (lnum, start)
Antoine Pitrou10a99b02011-10-11 15:45:56 +0200549 endprog = _compile(endpats[initial] or
550 endpats[token[1]] or
551 endpats[token[2]])
Guido van Rossumde655271997-04-09 17:15:54 +0000552 contstr, needcont = line[start:], 1
Guido van Rossuma90c78b1998-04-03 16:05:38 +0000553 contline = line
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000554 break
555 else: # ordinary string
Raymond Hettingera48db392009-04-29 00:34:27 +0000556 yield TokenInfo(STRING, token, spos, epos, line)
Benjamin Peterson33856de2010-08-30 14:41:20 +0000557 elif initial.isidentifier(): # ordinary name
Raymond Hettingera48db392009-04-29 00:34:27 +0000558 yield TokenInfo(NAME, token, spos, epos, line)
Guido van Rossum3b631771997-10-27 20:44:15 +0000559 elif initial == '\\': # continued stmt
560 continued = 1
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000561 else:
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000562 if initial in '([{':
563 parenlev += 1
564 elif initial in ')]}':
565 parenlev -= 1
Raymond Hettingera48db392009-04-29 00:34:27 +0000566 yield TokenInfo(OP, token, spos, epos, line)
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000567 else:
Raymond Hettingera48db392009-04-29 00:34:27 +0000568 yield TokenInfo(ERRORTOKEN, line[pos],
Guido van Rossumde655271997-04-09 17:15:54 +0000569 (lnum, pos), (lnum, pos+1), line)
Benjamin Petersona0dfa822009-11-13 02:25:08 +0000570 pos += 1
Guido van Rossumfc6f5331997-03-07 00:21:12 +0000571
572 for indent in indents[1:]: # pop remaining indent levels
Raymond Hettingera48db392009-04-29 00:34:27 +0000573 yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
574 yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards compatible, API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)
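# For example (a sketch, assuming "import io"):
#     generate_tokens(io.StringIO("x = 1\n").readline)
# yields the same stream as tokenize() except that it consumes str lines and
# emits no leading ENCODING token.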

def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
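    # Example invocations (a sketch): "python -m tokenize example.py", or
    # "python -m tokenize -e example.py" to print exact operator token types.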
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with builtins.open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except IOError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()