"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""

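# A minimal usage sketch (illustrative only; "example.py" is a hypothetical
# file).  tokenize() expects a readline callable that returns bytes:
#
#     import tokenize
#     with open("example.py", "rb") as f:
#         for tok in tokenize.tokenize(f.readline):
#             print(tok.type, tok.string, tok.start, tok.end)
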
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
import builtins
import re
import sys
from token import *
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3
EXACT_TOKEN_TYPES = {
    '(':   LPAR,
    ')':   RPAR,
    '[':   LSQB,
    ']':   RSQB,
    ':':   COLON,
    ',':   COMMA,
    ';':   SEMI,
    '+':   PLUS,
    '-':   MINUS,
    '*':   STAR,
    '/':   SLASH,
    '|':   VBAR,
    '&':   AMPER,
    '<':   LESS,
    '>':   GREATER,
    '=':   EQUAL,
    '.':   DOT,
    '%':   PERCENT,
    '{':   LBRACE,
    '}':   RBRACE,
    '==':  EQEQUAL,
    '!=':  NOTEQUAL,
    '<=':  LESSEQUAL,
    '>=':  GREATEREQUAL,
    '~':   TILDE,
    '^':   CIRCUMFLEX,
    '<<':  LEFTSHIFT,
    '>>':  RIGHTSHIFT,
    '**':  DOUBLESTAR,
    '+=':  PLUSEQUAL,
    '-=':  MINEQUAL,
    '*=':  STAREQUAL,
    '/=':  SLASHEQUAL,
    '%=':  PERCENTEQUAL,
    '&=':  AMPEREQUAL,
    '|=':  VBAREQUAL,
    '^=':  CIRCUMFLEXEQUAL,
    '<<=': LEFTSHIFTEQUAL,
    '>>=': RIGHTSHIFTEQUAL,
    '**=': DOUBLESTAREQUAL,
    '//':  DOUBLESLASH,
    '//=': DOUBLESLASHEQUAL,
    '@':   AT
}

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type

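# A sketch of the distinction exact_type draws (hypothetical token):
#
#     tok = TokenInfo(OP, '+', (1, 0), (1, 1), '+')
#     tok.type        # OP
#     tok.exact_type  # PLUS
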
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

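# For example, group('a', 'b') == '(a|b)', any('a', 'b') == '(a|b)*',
# and maybe('a', 'b') == '(a|b)?'.
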
# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

StringPrefix = r'(?:[uU][rR]?|[bB][rR]|[rR][bB]|[rR]|[uU])?'

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

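# A quick check of that ordering (illustrative):
#
#     re.compile(Operator).match('**=').group()   # '**=', not '*'
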
Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

def _compile(expr):
    return re.compile(expr, re.UNICODE)

endpats = {"'": Single, '"': Double,
           "'''": Single3, '"""': Double3,
           "r'''": Single3, 'r"""': Double3,
           "b'''": Single3, 'b"""': Double3,
           "R'''": Single3, 'R"""': Double3,
           "B'''": Single3, 'B"""': Double3,
           "br'''": Single3, 'br"""': Double3,
           "bR'''": Single3, 'bR"""': Double3,
           "Br'''": Single3, 'Br"""': Double3,
           "BR'''": Single3, 'BR"""': Double3,
           "rb'''": Single3, 'rb"""': Double3,
           "Rb'''": Single3, 'Rb"""': Double3,
           "rB'''": Single3, 'rB"""': Double3,
           "RB'''": Single3, 'RB"""': Double3,
           "u'''": Single3, 'u"""': Double3,
           "ur'''": Single3, 'ur"""': Double3,
           "U'''": Single3, 'U"""': Double3,
           "uR'''": Single3, 'uR"""': Double3,
           "Ur'''": Single3, 'Ur"""': Double3,
           "UR'''": Single3, 'UR"""': Double3,
           'r': None, 'R': None, 'b': None, 'B': None,
           'u': None, 'U': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',
          "rb'''", 'rb"""', "rB'''", 'rB"""',
          "Rb'''", 'Rb"""', "RB'''", 'RB"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"',
          "rb'", 'rb"', "rB'", 'rB"',
          "Rb'", 'Rb"', "RB'", 'RB"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        # This import is here to avoid problems when the itertools module
        # is not built yet and tokenize is imported.
        from itertools import chain
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        # Feed the first token through the same loop as the rest so that
        # it is emitted as well.
        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two elements are passed per token, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out

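# A small sketch of the 2-tuple (compat) mode described above; spacing is
# only approximate in this mode (hypothetical token stream):
#
#     untokenize([(NAME, 'x'), (OP, '='), (NUMBER, '1')])   # -> 'x =1 '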

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument,
    readline, in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP-0263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            raise SyntaxError("invalid or missing encoding declaration")

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

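# A minimal sketch of detect_encoding() in use ("setup.py" is just an
# example path):
#
#     with builtins.open("setup.py", "rb") as f:
#         encoding, first_lines = detect_encoding(f.readline)
#
# first_lines holds the raw line(s) consumed while looking for the cookie;
# callers that go on to tokenize must replay them (tokenize() below does).
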
def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = builtins.open(filename, 'rb')
    encoding, lines = detect_encoding(buffer.readline)
    buffer.seek(0)
    text = TextIOWrapper(buffer, encoding, line_buffering=True)
    text.mode = 'r'
    return text


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object that provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable that terminates with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")    # pad with b"" (EOF) once the real input is exhausted
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                                    (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
                                    (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if (initial in numchars or                 # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                                    token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = _compile(endpats[initial] or
                                           endpats[token[1]] or
                                           endpats[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                                (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards-compatible API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)

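# Command-line use (implemented by main() below), for example:
#
#     python -m tokenize example.py        # tokenize a file
#     python -m tokenize -e example.py     # report exact OP token types
#     python -m tokenize < example.py      # tokenize stdin
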
def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with builtins.open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except IOError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()